From bc38ee78f8d77d9354b887f57d785bbbc7bfd6f5 Mon Sep 17 00:00:00 2001 From: the_borono <60317035+hajarman@users.noreply.github.com> Date: Sat, 1 Jun 2024 18:03:14 +0330 Subject: [PATCH 1/3] Add files via upload --- entity-classifier.ipynb | 80 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 entity-classifier.ipynb diff --git a/entity-classifier.ipynb b/entity-classifier.ipynb new file mode 100644 index 0000000..2933c11 --- /dev/null +++ b/entity-classifier.ipynb @@ -0,0 +1,80 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZcRCj5etHVOt", + "outputId": "728deb85-bca3-427d-d8d4-7e5a845d1412" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy: 0.37\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import joblib\n", + "from huggingface_hub import hf_hub_download\n", + "from sklearn.metrics import accuracy_score\n", + "import numpy as np\n", + "\n", + "# Load the test dataset from Hugging Face with UTF-8 encoding\n", + "test_dataset_path = hf_hub_download(repo_id='borono/my-naive-bayes-model', filename='test_dataset_test.csv')\n", + "test_dataset = pd.read_csv(test_dataset_path)\n", + "\n", + "# Load the model and vectorizer from Hugging Face\n", + "model_path = hf_hub_download(repo_id='borono/my-naive-bayes-model', filename='naive_bayes_model.joblib')\n", + "vectorizer_path = hf_hub_download(repo_id='borono/my-naive-bayes-model', filename='tfidf_vectorizer.joblib')\n", + "\n", + "# Load the model and vectorizer using joblib\n", + "model = joblib.load(model_path)\n", + "vectorizer = joblib.load(vectorizer_path)\n", + "\n", + "# Assuming 'entity' is the target column in your test dataset\n", + "y_true = test_dataset['entity'].astype(str)\n", + "titles = test_dataset['title']\n", + "\n", + "# Define batch size\n", + "batch_size = 1000\n", + "num_batches = int(np.ceil(len(titles) / batch_size))\n", + "\n", + "y_pred = []\n", + "\n", + "# Process the test data in batches\n", + "for i in range(num_batches):\n", + " start_idx = i * batch_size\n", + " end_idx = (i + 1) * batch_size\n", + " X_batch = titles[start_idx:end_idx]\n", + " X_batch_vectorized = vectorizer.transform(X_batch)\n", + " y_batch_pred = model.predict(X_batch_vectorized)\n", + " y_pred.extend(y_batch_pred.astype(str))\n", + "\n", + "# Evaluate the model's performance\n", + "accuracy = accuracy_score(y_true, y_pred)\n", + "print(f\"Accuracy: {accuracy:.2f}\")\n" + ] + } + ] +} \ No newline at end of file From 3934ee3d4725a7c63bfca6fe1b4b498bb98d1c66 Mon Sep 17 00:00:00 2001 From: the_borono <60317035+hajarman@users.noreply.github.com> Date: Thu, 13 Jun 2024 08:22:43 +0330 Subject: [PATCH 2/3] Add files via upload --- withKNN.ipynb | 1200 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1200 insertions(+) create mode 100644 withKNN.ipynb diff --git a/withKNN.ipynb b/withKNN.ipynb new file mode 100644 index 0000000..fe7f1cd --- /dev/null +++ b/withKNN.ipynb @@ -0,0 +1,1200 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "5bdf3c1240eb4a07b5a2549871b57f5c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_73f941a6d1ad4dfc90784c03ddba3548", + "IPY_MODEL_20d71f3a91154d39911052f7440551e2", + "IPY_MODEL_4cfa3d4c6adc49ada6d56386e18b613f" + ], + "layout": "IPY_MODEL_bee693ef166248f18268ef77798a34b5" + } + }, + "73f941a6d1ad4dfc90784c03ddba3548": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_628760c86243444e94ccb4e6de751ece", + "placeholder": "​", + "style": "IPY_MODEL_785cb33550bb4d9bad900cdb23d1c5a0", + "value": "test_dataset_5.csv: 100%" + } + }, + "20d71f3a91154d39911052f7440551e2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e89f9163e1dc499f9111b47107247562", + "max": 32526128, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e253720f4807436d89924869758420f8", + "value": 32526128 + } + }, + "4cfa3d4c6adc49ada6d56386e18b613f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed956ca752cc4b9489dbe51ae2d362ab", + "placeholder": "​", + "style": "IPY_MODEL_46bd9ea5b1364c0fb24f8ed6e936ec98", + "value": " 32.5M/32.5M [00:01<00:00, 24.2MB/s]" + } + }, + "bee693ef166248f18268ef77798a34b5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "628760c86243444e94ccb4e6de751ece": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "785cb33550bb4d9bad900cdb23d1c5a0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e89f9163e1dc499f9111b47107247562": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e253720f4807436d89924869758420f8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ed956ca752cc4b9489dbe51ae2d362ab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "46bd9ea5b1364c0fb24f8ed6e936ec98": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "837d3334ca2840508868993cebf49a5f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_914726d43a0f4380b5dff4a9ab1141da", + "IPY_MODEL_2a7f0b35a263419889da5b096ce75c32", + "IPY_MODEL_7c703d89cab849409d1a0cad2a0074c9" + ], + "layout": "IPY_MODEL_1a928b6709df4651b6edc4cbbdee85bc" + } + }, + "914726d43a0f4380b5dff4a9ab1141da": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e63b2e872cfc48618c9ad6003c07d0fd", + "placeholder": "​", + "style": "IPY_MODEL_3d73980058d146d3b3e207bdd8a893d1", + "value": "knn_model.pkl: 100%" + } + }, + "2a7f0b35a263419889da5b096ce75c32": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_131ac7d7b7a24a5dbd2007c55fb45625", + "max": 47371742, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_273d54e11bf64729a3bbaa529fe2c6b5", + "value": 47371742 + } + }, + "7c703d89cab849409d1a0cad2a0074c9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f3ebf840ed294ca09ae5ff67a2e41700", + "placeholder": "​", + "style": "IPY_MODEL_243c4ee2466d41f8813c23a608a8da47", + "value": " 47.4M/47.4M [00:00<00:00, 77.5MB/s]" + } + }, + "1a928b6709df4651b6edc4cbbdee85bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e63b2e872cfc48618c9ad6003c07d0fd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d73980058d146d3b3e207bdd8a893d1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "131ac7d7b7a24a5dbd2007c55fb45625": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "273d54e11bf64729a3bbaa529fe2c6b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f3ebf840ed294ca09ae5ff67a2e41700": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "243c4ee2466d41f8813c23a608a8da47": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e32f9f82ebc04deb94712f4c66a11902": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f1152f922eb749bd9eb7cc9aa3a75577", + "IPY_MODEL_2c02e0b69c824a0fa7bf70fb56dfe6a9", + "IPY_MODEL_e06d39a2e97047c4a3a34e2fa75b1dfb" + ], + "layout": "IPY_MODEL_6908f73a070b462aae865ea493ac48a6" + } + }, + "f1152f922eb749bd9eb7cc9aa3a75577": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_712cac2b1d24408f8ea0bedfd39eb2ac", + "placeholder": "​", + "style": "IPY_MODEL_94224f6805d04a71a1ee7fef7fc89607", + "value": "tfidf_vectorizer.pkl: 100%" + } + }, + "2c02e0b69c824a0fa7bf70fb56dfe6a9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6bb6165fa5594c51903da7d77ad17236", + "max": 374750, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2f47d3a33aa146518c906e174864a4f2", + "value": 374750 + } + }, + "e06d39a2e97047c4a3a34e2fa75b1dfb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dac103455d2e458f8175d4e1cff7b10e", + "placeholder": "​", + "style": "IPY_MODEL_c265ffac93744f738068b383c27017e4", + "value": " 375k/375k [00:00<00:00, 3.91MB/s]" + } + }, + "6908f73a070b462aae865ea493ac48a6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "712cac2b1d24408f8ea0bedfd39eb2ac": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "94224f6805d04a71a1ee7fef7fc89607": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6bb6165fa5594c51903da7d77ad17236": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f47d3a33aa146518c906e174864a4f2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dac103455d2e458f8175d4e1cff7b10e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c265ffac93744f738068b383c27017e4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 260, + "referenced_widgets": [ + "5bdf3c1240eb4a07b5a2549871b57f5c", + "73f941a6d1ad4dfc90784c03ddba3548", + "20d71f3a91154d39911052f7440551e2", + "4cfa3d4c6adc49ada6d56386e18b613f", + "bee693ef166248f18268ef77798a34b5", + "628760c86243444e94ccb4e6de751ece", + "785cb33550bb4d9bad900cdb23d1c5a0", + "e89f9163e1dc499f9111b47107247562", + "e253720f4807436d89924869758420f8", + "ed956ca752cc4b9489dbe51ae2d362ab", + "46bd9ea5b1364c0fb24f8ed6e936ec98", + "837d3334ca2840508868993cebf49a5f", + "914726d43a0f4380b5dff4a9ab1141da", + "2a7f0b35a263419889da5b096ce75c32", + "7c703d89cab849409d1a0cad2a0074c9", + "1a928b6709df4651b6edc4cbbdee85bc", + "e63b2e872cfc48618c9ad6003c07d0fd", + "3d73980058d146d3b3e207bdd8a893d1", + "131ac7d7b7a24a5dbd2007c55fb45625", + "273d54e11bf64729a3bbaa529fe2c6b5", + "f3ebf840ed294ca09ae5ff67a2e41700", + "243c4ee2466d41f8813c23a608a8da47", + "e32f9f82ebc04deb94712f4c66a11902", + "f1152f922eb749bd9eb7cc9aa3a75577", + "2c02e0b69c824a0fa7bf70fb56dfe6a9", + "e06d39a2e97047c4a3a34e2fa75b1dfb", + "6908f73a070b462aae865ea493ac48a6", + "712cac2b1d24408f8ea0bedfd39eb2ac", + "94224f6805d04a71a1ee7fef7fc89607", + "6bb6165fa5594c51903da7d77ad17236", + "2f47d3a33aa146518c906e174864a4f2", + "dac103455d2e458f8175d4e1cff7b10e", + "c265ffac93744f738068b383c27017e4" + ] + }, + "id": "8emiCpLzHkz3", + "outputId": "648ea0fc-337d-420a-fc59-c78a24e758c5" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "test_dataset_5.csv: 0%| | 0.00/32.5M [00:00 Date: Thu, 13 Jun 2024 08:23:12 +0330 Subject: [PATCH 3/3] Add files via upload --- entity-classification-bslm-emd-and-KNN.ipynb | 77 ++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 entity-classification-bslm-emd-and-KNN.ipynb diff --git a/entity-classification-bslm-emd-and-KNN.ipynb b/entity-classification-bslm-emd-and-KNN.ipynb new file mode 100644 index 0000000..ee77b0b --- /dev/null +++ b/entity-classification-bslm-emd-and-KNN.ipynb @@ -0,0 +1,77 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm", + "gpuType": "L4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RaguJswrxUUk", + "outputId": "8d1b4f04-91ae-42d4-ad6a-3a80c33951dc" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1928/1928 [==============================] - 12s 6ms/step - loss: 0.0014 - accuracy: 1.0000\n", + "Test accuracy: 0.9999513626098633\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from huggingface_hub import hf_hub_download\n", + "from keras.preprocessing.text import tokenizer_from_json\n", + "from keras.preprocessing.sequence import pad_sequences\n", + "from keras.models import load_model\n", + "import numpy as np\n", + "\n", + "# Load the test dataset from Hugging Face with UTF-8 encoding\n", + "test_dataset_path = hf_hub_download(repo_id='borono/entity-classifier-Embedding', filename='test_dataset.csv')\n", + "test_dataset = pd.read_csv(test_dataset_path)\n", + "\n", + "# Load the model and tokenizer from Hugging Face\n", + "model_path = hf_hub_download(repo_id='borono/entity-classifier-Embedding', filename='KNN.h5')\n", + "tokenizer_path = hf_hub_download(repo_id='borono/entity-classifier-Embedding', filename='tokenizer.json')\n", + "\n", + "# Load tokenizer\n", + "with open(tokenizer_path, 'r', encoding='utf-8') as f:\n", + " tokenizer_data = f.read()\n", + "\n", + "tokenizer = tokenizer_from_json(tokenizer_data)\n", + "\n", + "# Tokenize and pad the sequences\n", + "max_len = 100 # Set the max_len based on your model's training configuration\n", + "test_sequences = tokenizer.texts_to_sequences(test_dataset['title'])\n", + "X_test = pad_sequences(test_sequences, maxlen=max_len)\n", + "y_test = pd.to_numeric(test_dataset['entity'], errors='coerce').fillna(0).astype(int)\n", + "\n", + "# Load model\n", + "model = load_model(model_path)\n", + "\n", + "# Evaluate model on test dataset\n", + "loss, accuracy = model.evaluate(X_test, y_test)\n", + "print(f'Test accuracy: {accuracy}')\n" + ] + } + ] +} \ No newline at end of file