basalam · hajarman · Jun 1, 2024 · Jun 13, 2024 · Jun 13, 2024
diff --git a/entity-classification-bslm-emd-and-KNN.ipynb b/entity-classification-bslm-emd-and-KNN.ipynb
@@ -0,0 +1,77 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "machine_shape": "hm",
+      "gpuType": "L4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "RaguJswrxUUk",
+        "outputId": "8d1b4f04-91ae-42d4-ad6a-3a80c33951dc"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "1928/1928 [==============================] - 12s 6ms/step - loss: 0.0014 - accuracy: 1.0000\n",
+            "Test accuracy: 0.9999513626098633\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "from huggingface_hub import hf_hub_download\n",
+        "from keras.preprocessing.text import tokenizer_from_json\n",
+        "from keras.preprocessing.sequence import pad_sequences\n",
+        "from keras.models import load_model\n",
+        "import numpy as np\n",
+        "\n",
+        "# Download the test dataset from the Hugging Face Hub and read it with pandas (no explicit encoding is passed; pandas defaults to UTF-8)\n",
+        "test_dataset_path = hf_hub_download(repo_id='borono/entity-classifier-Embedding', filename='test_dataset.csv')\n",
+        "test_dataset = pd.read_csv(test_dataset_path)\n",
+        "\n",
+        "# Download the model and tokenizer files from the Hugging Face Hub (they are loaded further below)\n",
+        "model_path = hf_hub_download(repo_id='borono/entity-classifier-Embedding', filename='KNN.h5')\n",
+        "tokenizer_path = hf_hub_download(repo_id='borono/entity-classifier-Embedding', filename='tokenizer.json')\n",
+        "\n",
+        "# Load tokenizer\n",
+        "with open(tokenizer_path, 'r', encoding='utf-8') as f:\n",
+        "    tokenizer_data = f.read()\n",
+        "\n",
+        "tokenizer = tokenizer_from_json(tokenizer_data)\n",
+        "\n",
+        "# Tokenize and pad the sequences\n",
+        "max_len = 100  # Set the max_len based on your model's training configuration\n",
+        "test_sequences = tokenizer.texts_to_sequences(test_dataset['title'])\n",
+        "X_test = pad_sequences(test_sequences, maxlen=max_len)\n",
+        "y_test = pd.to_numeric(test_dataset['entity'], errors='coerce').fillna(0).astype(int)\n",
+        "\n",
+        "# Load model\n",
+        "model = load_model(model_path)\n",
+        "\n",
+        "# Evaluate model on test dataset\n",
+        "loss, accuracy = model.evaluate(X_test, y_test)\n",
+        "print(f'Test accuracy: {accuracy}')\n"
+      ]
+    }
+  ]
+}
diff --git a/entity-classifier.ipynb b/entity-classifier.ipynb
@@ -0,0 +1,80 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ZcRCj5etHVOt",
+        "outputId": "728deb85-bca3-427d-d8d4-7e5a845d1412"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Accuracy: 0.37\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "import joblib\n",
+        "from huggingface_hub import hf_hub_download\n",
+        "from sklearn.metrics import accuracy_score\n",
+        "import numpy as np\n",
+        "\n",
+        "# Download the test dataset from the Hugging Face Hub and read it with pandas (no explicit encoding is passed; pandas defaults to UTF-8)\n",
+        "test_dataset_path = hf_hub_download(repo_id='borono/my-naive-bayes-model', filename='test_dataset_test.csv')\n",
+        "test_dataset = pd.read_csv(test_dataset_path)\n",
+        "\n",
+        "# Download the model and vectorizer files from the Hugging Face Hub (they are loaded with joblib below)\n",
+        "model_path = hf_hub_download(repo_id='borono/my-naive-bayes-model', filename='naive_bayes_model.joblib')\n",
+        "vectorizer_path = hf_hub_download(repo_id='borono/my-naive-bayes-model', filename='tfidf_vectorizer.joblib')\n",
+        "\n",
+        "# Load the model and vectorizer using joblib\n",
+        "model = joblib.load(model_path)\n",
+        "vectorizer = joblib.load(vectorizer_path)\n",
+        "\n",
+        "# Assuming 'entity' is the target column in your test dataset\n",
+        "y_true = test_dataset['entity'].astype(str)\n",
+        "titles = test_dataset['title']\n",
+        "\n",
+        "# Define batch size\n",
+        "batch_size = 1000\n",
+        "num_batches = int(np.ceil(len(titles) / batch_size))\n",
+        "\n",
+        "y_pred = []\n",
+        "\n",
+        "# Process the test data in batches\n",
+        "for i in range(num_batches):\n",
+        "    start_idx = i * batch_size\n",
+        "    end_idx = (i + 1) * batch_size\n",
+        "    X_batch = titles[start_idx:end_idx]\n",
+        "    X_batch_vectorized = vectorizer.transform(X_batch)\n",
+        "    y_batch_pred = model.predict(X_batch_vectorized)\n",
+        "    y_pred.extend(y_batch_pred.astype(str))\n",
+        "\n",
+        "# Evaluate the model's performance\n",
+        "accuracy = accuracy_score(y_true, y_pred)\n",
+        "print(f\"Accuracy: {accuracy:.2f}\")\n"
+      ]
+    }
+  ]
+}