Commit

update
olafurjohannsson committed Dec 10, 2023
1 parent 88e4126 commit e498924
Showing 6 changed files with 138 additions and 387 deletions.
45 changes: 42 additions & 3 deletions README.md
@@ -110,7 +110,7 @@ This section provides instructions for using the `process.py` script, which perf

1. Run the script:
- python process.py
2. When prompted, select the `icetagger.bat` file located in the extracted IceNLP directory (IceNLP-1.5.0\IceNLP\bat\icetagger).
2. When prompted, select the `icetagger.bat` file located in the extracted IceNLP directory (`IceNLP-1.5.0\IceNLP\bat\icetagger`).
3. Ensure the dataset file (`IMDB-Dataset-MideindTranslate.csv`) is located in the `Datasets` directory relative to the script.
4. The script will process the dataset and output the processed data to `Datasets/IMDB-Dataset-MideindTranslate-processed-nefnir.csv`.

@@ -158,6 +158,20 @@ To use a different dataset:

## Baseline Classifiers

This section provides instructions for using the `BaselineClassifiersBinary.ipynb` notebook, which trains SVC, Logistic Regression, and Naive Bayes classifiers on the English, Icelandic Google, and Icelandic Miðeind datasets and generates a classification report for each model.

### Prerequisites

- Python 3.x
- PyTorch
- Pandas library
- Scikit-learn library
- Other dependencies: `os`, `time`, `numpy`

### Usage

Open `BaselineClassifiersBinary.ipynb` and run the cells. You have to change the `ICELANDIC_GOOGLE_CSV`, `ICELANDIC_MIDEIND_CSV` and `ENGLISH_CSV` variables to point to the correct datasets (see the sketch below). The cell will train the models and print out a classification report for each one, along with a diagram. You can refer to the next cell if you want to print out the most important features, although this is not necessary.
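
For illustration, here is a minimal sketch of the kind of baseline the notebook trains. The dataset path variables are the defaults from this commit; the TF-IDF feature step and the split parameters are assumptions for illustration, not necessarily the notebook's exact configuration:

```python
# A minimal sketch of training one baseline and printing its report.
# TfidfVectorizer and the split parameters are assumptions for illustration.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Paths as defined in the notebook; point these at your datasets.
ICELANDIC_GOOGLE_CSV = "../IMDB-Dataset-GoogleTranslate-processed-nefnir.csv"
ICELANDIC_MIDEIND_CSV = "../IMDB-Dataset-MideindTranslate-processed-nefnir.csv"
ENGLISH_CSV = "../IMDB-Dataset-Processed.csv"

df = pd.read_csv(ICELANDIC_MIDEIND_CSV)
X_train, X_test, y_train, y_test = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=42
)

# Fit a TF-IDF + Logistic Regression pipeline and report on the held-out split.
lr_pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
lr_pipeline.fit(X_train, y_train)
print(classification_report(y_test, lr_pipeline.predict(X_test), digits=4))
```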

## Transformer Models

This section provides instructions for using the `train.py` script, which trains a transformer model for sentiment analysis.
@@ -193,9 +207,34 @@ To use a different dataset:
- The dataset should be in CSV format with 'review' and 'sentiment' columns.
- Modify the `dataset_path` variable in the script to match your dataset's filename (a loading sketch follows this list).
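
A minimal sketch of checking this layout before training, assuming a CSV dataset with `review` and `sentiment` columns (the path shown is one of the repository's defaults; substitute your own):

```python
# A minimal sketch, assuming a CSV dataset with 'review' and 'sentiment' columns.
import pandas as pd

dataset_path = "Datasets/IMDB-Dataset-MideindTranslate-processed-nefnir.csv"  # adjust to your file
df = pd.read_csv(dataset_path)
assert {"review", "sentiment"}.issubset(df.columns)
print(df["sentiment"].value_counts())
```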

# Style
## Generating Classification Reports

This section provides instructions for using the `generate_report.py` script, which generates a classification report for a trained model.
This is useful mostly for the transformer models, as the baseline classifiers generate their own reports via the same libraries.

This function invokes the model and generates a classification report for it. It expects the path to the model folder, the device to use,
the pandas columns to use as X and y, and a flag indicating whether to return the accuracy or the classification report.

### Installation

1. Ensure Python 3.x is installed.
2. Install the required Python packages:
- pip install transformers torch pandas scikit-learn

### Usage

1. Import the module: `import generate_classification_report as gcr`
2. Load the CSV file with the data to be tested: `df = pd.read_csv('IMDB-Dataset-GoogleTranslate.csv')`
3. Invoke the `call_model` function, which takes the following parameters (see the sketch after this list):
- X_all: All review columns
- y_all: All sentiment columns
- model: The model to be used (This is a path to a file, something like `'./electra-base-google-batch8-remove-noise-model/'`)
- device: The device to be used (CUDA, cpu)
- accuracy: Whether to return accuracy or return a classification report
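
A minimal sketch of such a call, assuming `call_model` accepts these parameters as keyword arguments (verify the exact signature in `generate_classification_report.py`):

```python
# A hedged sketch; parameter names follow the list above, but the exact
# signature of call_model should be checked against the source.
import pandas as pd
import generate_classification_report as gcr

df = pd.read_csv('IMDB-Dataset-GoogleTranslate.csv')

report = gcr.call_model(
    X_all=df['review'],        # all review columns
    y_all=df['sentiment'],     # all sentiment columns
    model='./electra-base-google-batch8-remove-noise-model/',  # path to the model folder
    device='cuda',             # or 'cpu'
    accuracy=False,            # False: return the classification report
)
print(report)
```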

### Example

[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
An example of how to generate a report can be seen in `generate_report.ipynb`; see also the `eval_files()` function in `generate_classification_report.py`, which loads multiple models.

# License

Binary file added README.pdf
Binary file not shown.
221 changes: 14 additions & 207 deletions src/BaselineClassifiersBinary.ipynb
@@ -1,98 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"id": "aae6d4d5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import html\n",
"import re\n",
"\n",
"csv = pd.read_csv(\"../Hannes-Movie-Reviews.csv\")\n",
"\n",
"\n",
"def lower_case(txt):\n",
" return txt.lower()\n",
"\n",
"\n",
"def remove_brackets(txt):\n",
" return re.sub(r\"[()\\[\\]{}<>]\", \"\", txt)\n",
"\n",
"\n",
"def fix_repeated_characters(txt):\n",
" return re.sub(r\"(.)\\1{5,}\", r\"\\1\", txt)\n",
"\n",
"\n",
"def remove_special_characters(txt):\n",
" pattern = r\"[^a-zA-záðéíóúýþæö.?!;:,\\s]\"\n",
" txt = re.sub(pattern, \"\", txt)\n",
" return txt\n",
"\n",
"\n",
"def clean_html(txt):\n",
" clean = re.compile(\"<.*?>\")\n",
" return re.sub(clean, \"\", txt)\n",
"\n",
"\n",
"def remove_overly_long_words(txt):\n",
" return \" \".join([t for t in txt.split(\" \") if len(t) < 30])\n",
"\n",
"\n",
"def remove_noise(txt):\n",
" txt = html.unescape(txt)\n",
" txt = clean_html(txt)\n",
" txt = remove_brackets(txt)\n",
" txt = lower_case(txt)\n",
" txt = remove_special_characters(txt)\n",
" txt = fix_repeated_characters(txt)\n",
" txt = remove_overly_long_words(txt)\n",
" return txt.strip().replace(\"\\n\", \" \").replace('\"', \"\")\n",
"\n",
"\n",
"csv.drop([\"id\"], inplace=True, axis=1)\n",
"\n",
"csv[\"review\"] = csv[\"review\"].apply(\n",
" lambda review: re.sub(r\"&#(\\d+);\", lambda m: chr(int(m.group(1))), review)\n",
")\n",
"csv[\"review\"] = csv[\"review\"].apply(lambda review: remove_noise(review))\n",
"csv[\"sentiment\"] = csv[\"sentiment\"].apply(\n",
" lambda sentiment: \"positive\" if sentiment >= 6 else \"negative\"\n",
")\n",
"\n",
"\n",
"csv.head()\n",
"csv.to_csv(\"../Hannes-Movie-Reviews.csv\", index=False)\n",
"\n",
"# re.sub(r'&#(\\d+);',lambda m: chr(int(m.group(1))), f)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "f7e80ba4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Þetta er \\ntext sem er skiptur\\n'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"Þetta er \n",
"text sem er skiptur\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 14,
@@ -366,6 +273,10 @@
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report\n",
"\n",
"ICELANDIC_GOOGLE_CSV = \"../IMDB-Dataset-GoogleTranslate-processed-nefnir.csv\"\n",
"ICELANDIC_MIDEIND_CSV = \"../IMDB-Dataset-MideindTranslate-processed-nefnir.csv\"\n",
"ENGLISH_CSV = \"../IMDB-Dataset-Processed.csv\"\n",
"\n",
"\n",
"def clear_df(data):\n",
" if \"Unnamed: 0\" in data.columns:\n",
@@ -442,13 +353,13 @@
" print(nb_pipeline)\n",
" print(classification_report(y_test, predict_nb, digits=4))\n",
"\n",
" # print(svc_pipeline)\n",
" # print(classification_report(y_test, predict_svc, digits=4))\n",
" print(svc_pipeline)\n",
" print(classification_report(y_test, predict_svc, digits=4))\n",
"\n",
" # print(lr_pipeline)\n",
" # print(classification_report(y_test, predict_lr, digits=4))\n",
" print(lr_pipeline)\n",
" print(classification_report(y_test, predict_lr, digits=4))\n",
"\n",
" # return { \"SVC\": (1,2,3,4 ), \"Naive Bayes\": (1,2,3,4), \"Logistic Regression\": (1,2,3,4) }, [\"F1\", \"Recall\", \"Precision\", \"Accuracy\"]\n",
" \n",
" return (\n",
" (\n",
" {\n",
@@ -467,16 +378,8 @@
"def plot_f1(f1s, title):\n",
" labels, values = f1s\n",
" plt.figure(figsize=(32, 16))\n",
" fig, ax = plt.subplots(figsize=(24, 12)) # layout='constrained')\n",
" fig, ax = plt.subplots(figsize=(24, 12))\n",
" plt.grid(color=\"grey\", linestyle=\"-.\")\n",
" # for s in ['top', 'bottom', 'left', 'right']:\n",
" # ax.spines[s].set_visible(False)\n",
" # ax.xaxis.set_ticks_position('none')\n",
" # ax.yaxis.set_ticks_position('none')\n",
"\n",
" # Add padding between axes and labels\n",
" # ax.xaxis.set_tick_params(pad = 5)\n",
" # ax.yaxis.set_tick_params(pad = 10)\n",
" x = np.arange(4)\n",
" width = 0.25\n",
" multiplier = 0\n",
Expand All @@ -496,77 +399,15 @@
" plt.show()\n",
"\n",
"\n",
"# data = classify(\"../IMDB-Dataset-GoogleTranslate-Processed.csv\")\n",
"data2, nb_mideind, svc_mideind, lr_mideind = classify(\n",
" \"../IMDB-Dataset-MideindTranslate-processed-nefnir.csv\"\n",
")\n",
"data3, nb_google, svc_google, lr_google = classify(\n",
" \"../IMDB-Dataset-GoogleTranslate-processed-nefnir.csv\"\n",
")\n",
"data1, nb_english, svc_english, lr_english = classify(\"../IMDB-Dataset-Processed.csv\")\n",
"\n",
"# Hannes-Movie-Reviews-proccessed-nefnir-sentiment.csv\n",
"data4, nb_hannes1, svc_hannes1, lr_hannes1 = classify(\n",
" None,\n",
" \"../IMDB-Dataset-GoogleTranslate-processed-nefnir.csv\",\n",
" \"../Hannes-Movie-Reviews-processed-nefnir-sentiment.csv\",\n",
")\n",
"data5, nb_hannes2, svc_hannes2, lr_hannes2 = classify(\n",
" None,\n",
" \"../IMDB-Dataset-MideindTranslate-processed-nefnir.csv\",\n",
" \"../Hannes-Movie-Reviews-processed-nefnir-sentiment.csv\",\n",
")\n",
"data2, nb_mideind, svc_mideind, lr_mideind = classify(ICELANDIC_MIDEIND_CSV)\n",
"data3, nb_google, svc_google, lr_google = classify(ICELANDIC_GOOGLE_CSV)\n",
"data1, nb_english, svc_english, lr_english = classify(ENGLISH_CSV)\n",
"\n",
"# data4, nb_hannes, svc_hannes, lr_hannes = classify(\"../Hannes-Movie-Reviews-proccessed-nefnir-sentiment.csv\")\n",
"\n",
"plot_f1(data1, \"English Classification Report\")\n",
"plot_f1(data2, \"Icelandic Miðeind Classification Report\")\n",
"plot_f1(data3, \"Icelandic Google Classification Report\")\n",
"plot_f1(data4, \"Hannes Google Classification Report\")\n",
"plot_f1(data5, \"Hannes Miðeind Classification Report\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "c473c1d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sentiment\n",
"positive 932\n",
"negative 179\n",
"Name: count, dtype: int64"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"../Hannes-Movie-Reviews-processed-nefnir.csv\")\n",
"df.drop([\"Unnamed: 0\", \"id\"], axis=1, inplace=True)\n",
"\n",
"\n",
"# df.assign(condition=lambda d: ('positive' if d['sentiment'] > 6 else 'negative'))\n",
"\n",
"# 6-7-8-9-10\n",
"# 1-2-3-4-5\n",
"for sentiment in df[\"sentiment\"]:\n",
" if sentiment >= 6:\n",
" df[\"sentiment\"].replace(sentiment, \"positive\", inplace=True)\n",
" else:\n",
" df[\"sentiment\"].replace(sentiment, \"negative\", inplace=True)\n",
"\n",
"df[\"sentiment\"].value_counts()\n",
"\n",
"\n",
"# df.to_csv(\"../Hannes-Movie-Reviews-proccessed-nefnir-sentiment.csv\")"
"plot_f1(data3, \"Icelandic Google Classification Report\")"
]
},
{
@@ -786,40 +627,6 @@
"show(lr_hannes1, \"LR Google Hannes %s\")\n",
"show(lr_hannes2, \"LR Miðeind Hannes %s\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8a14482",
"metadata": {},
"outputs": [],
"source": [
"def evaluate_score(text):\n",
" text = [text]\n",
" s = logistic_regression_pipeline.predict(text)\n",
" # find all features and coefficients that have the text and sum up the values\n",
" s = sum(\n",
" [\n",
" i[1]\n",
" for x, i in enumerate(\n",
" zip(\n",
" logistic_regression_pipeline[0].get_feature_names_out(),\n",
" logistic_regression_pipeline[1].coef_[0],\n",
" )\n",
" )\n",
" if i[0] in text[0].split(\" \")\n",
" ]\n",
" )\n",
" if s >= 1:\n",
" print(\"(%s) Positive, score is %f\" % (text[0], s))\n",
" else:\n",
" print(\"(%s) Negative, score is %f\" % (text[0], s))\n",
"\n",
"\n",
"evaluate_score(\"hræðilegur frábær\")\n",
"evaluate_score(\"slæmur vel besta\")\n",
"evaluate_score(\"lélegur vel\")"
]
}
],
"metadata": {
