Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions entity-classification-bslm-emd-and-KNN.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm",
"gpuType": "L4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RaguJswrxUUk",
"outputId": "8d1b4f04-91ae-42d4-ad6a-3a80c33951dc"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"1928/1928 [==============================] - 12s 6ms/step - loss: 0.0014 - accuracy: 1.0000\n",
"Test accuracy: 0.9999513626098633\n"
]
}
],
"source": [
"import pandas as pd\n",
"from huggingface_hub import hf_hub_download\n",
"from keras.preprocessing.text import tokenizer_from_json\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"from keras.models import load_model\n",
"import numpy as np\n",
"\n",
"# Evaluate the Keras entity classifier on the published test set.\n",
"# All artifacts (test CSV, model weights, tokenizer) come from this Hugging Face repo.\n",
"REPO_ID = 'borono/entity-classifier-Embedding'\n",
"\n",
"# Load the test dataset from Hugging Face, reading it explicitly as UTF-8\n",
"test_dataset_path = hf_hub_download(repo_id=REPO_ID, filename='test_dataset.csv')\n",
"test_dataset = pd.read_csv(test_dataset_path, encoding='utf-8')\n",
"\n",
"# Download the model and tokenizer from Hugging Face\n",
"model_path = hf_hub_download(repo_id=REPO_ID, filename='KNN.h5')\n",
"tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename='tokenizer.json')\n",
"\n",
"# Rebuild the tokenizer from its JSON dump\n",
"with open(tokenizer_path, 'r', encoding='utf-8') as f:\n",
"    tokenizer_data = f.read()\n",
"\n",
"tokenizer = tokenizer_from_json(tokenizer_data)\n",
"\n",
"# pandas reads empty cells as float NaN, which is not text the tokenizer can process;\n",
"# map missing titles to '' so they become all-padding sequences instead of raising.\n",
"titles = test_dataset['title'].fillna('').astype(str)\n",
"\n",
"# Tokenize and pad the sequences\n",
"max_len = 100  # NOTE(review): must equal the sequence length used at training time - confirm\n",
"test_sequences = tokenizer.texts_to_sequences(titles)\n",
"X_test = pad_sequences(test_sequences, maxlen=max_len)\n",
"\n",
"# Labels that cannot be parsed as numbers are scored as class 0 (unchanged behaviour),\n",
"# but the count is reported so a surprising accuracy can be traced back to the data.\n",
"labels_numeric = pd.to_numeric(test_dataset['entity'], errors='coerce')\n",
"n_coerced = int(labels_numeric.isna().sum())\n",
"if n_coerced:\n",
"    print(f'Warning: {n_coerced} entity labels were missing or non-numeric and were mapped to 0')\n",
"y_test = labels_numeric.fillna(0).astype(int)\n",
"\n",
"# Load model\n",
"model = load_model(model_path)\n",
"\n",
"# Evaluate model on test dataset\n",
"loss, accuracy = model.evaluate(X_test, y_test)\n",
"print(f'Test accuracy: {accuracy}')\n"
]
}
]
}
80 changes: 80 additions & 0 deletions entity-classifier.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ZcRCj5etHVOt",
"outputId": "728deb85-bca3-427d-d8d4-7e5a845d1412"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 0.37\n"
]
}
],
"source": [
"import pandas as pd\n",
"import joblib\n",
"from huggingface_hub import hf_hub_download\n",
"from sklearn.metrics import accuracy_score\n",
"import numpy as np\n",
"\n",
"# Evaluate the published Naive Bayes entity classifier on its test set.\n",
"# All artifacts (test CSV, model, vectorizer) come from this Hugging Face repo.\n",
"REPO_ID = 'borono/my-naive-bayes-model'\n",
"\n",
"# Load the test dataset from Hugging Face, reading it explicitly as UTF-8\n",
"test_dataset_path = hf_hub_download(repo_id=REPO_ID, filename='test_dataset_test.csv')\n",
"test_dataset = pd.read_csv(test_dataset_path, encoding='utf-8')\n",
"\n",
"# Download the model and vectorizer from Hugging Face\n",
"model_path = hf_hub_download(repo_id=REPO_ID, filename='naive_bayes_model.joblib')\n",
"vectorizer_path = hf_hub_download(repo_id=REPO_ID, filename='tfidf_vectorizer.joblib')\n",
"\n",
"# SECURITY: joblib.load unpickles the file and can execute arbitrary code;\n",
"# only load artifacts from a repository you trust.\n",
"model = joblib.load(model_path)\n",
"vectorizer = joblib.load(vectorizer_path)\n",
"\n",
"# 'entity' is the target column; labels and predictions are both compared as strings.\n",
"# NOTE(review): if pandas parses 'entity' as float (e.g. because of blank cells), astype(str)\n",
"# produces '1.0'-style labels that may never equal the model's predictions - confirm the label format.\n",
"y_true = test_dataset['entity'].astype(str)\n",
"\n",
"# pandas reads empty cells as float NaN, which is not a valid text document for the\n",
"# vectorizer; map missing titles to '' so every row still receives a prediction.\n",
"titles = test_dataset['title'].fillna('').astype(str)\n",
"\n",
"# Number of titles vectorized and predicted per step\n",
"batch_size = 1000\n",
"\n",
"y_pred = []\n",
"\n",
"# Walk the titles in fixed-size positional windows; .iloc makes the slice explicitly\n",
"# positional, and the final window is simply shorter when the length is not a multiple.\n",
"for start_idx in range(0, len(titles), batch_size):\n",
"    X_batch = titles.iloc[start_idx:start_idx + batch_size]\n",
"    X_batch_vectorized = vectorizer.transform(X_batch)\n",
"    y_batch_pred = model.predict(X_batch_vectorized)\n",
"    y_pred.extend(y_batch_pred.astype(str))\n",
"\n",
"# Evaluate the model's performance\n",
"accuracy = accuracy_score(y_true, y_pred)\n",
"print(f\"Accuracy: {accuracy:.2f}\")\n"
]
}
]
}
Loading