AI_Ticket_Classifier/trainer.py at main · RohiBit/AI_Ticket_Classifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate # Used for loading accuracy metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import os
import datasets # Used for cache control

# Configurations
# The cleaned data file with embeddings (main() reads it as a '|'-delimited CSV)
CLEANED_DATA_FILE = 'knowledge_base_with_embeddings.csv'
# Name of the CSV column that contains the ticket text
TEXT_COLUMN = 'Full_Ticket_Text'
# Name of the CSV column that contains the target team name
TARGET_TEAM_COLUMN = 'Assigned Team'

MODEL_NAME = 'distilbert-base-uncased' # Base model for fine-tuning
SAVE_DIR_TEAM = './fine_tuned_model_team' # The required output directory for app.py
NUM_EPOCHS = 5 # Passed to TrainingArguments as num_train_epochs

# Created once at import time and shared by the functions below:
# the tokenizer is used by tokenize_function() and main(), the metric by compute_metrics().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
accuracy_metric = evaluate.load("accuracy")

# Helper Functions

def tokenize_function(examples):
    """Encode the 'text_input' field of ``examples`` with the shared tokenizer.

    Every sequence is truncated and padded to a fixed length of 128 tokens.
    """
    # 'text_input' is the name main() gives the ticket-text column.
    texts = examples['text_input']
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(texts, **encoding_options)

def compute_metrics(eval_pred):
    """Score an evaluation pass by accuracy.

    The predicted class for each example is the arg-max along axis 1 of
    ``eval_pred.predictions``; it is compared against ``eval_pred.label_ids``
    with the shared accuracy metric, whose result is returned unchanged.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )

# Main Training

def _load_team_datasets():
    """Load the ticket CSV and build stratified train/validation datasets.

    Reads CLEANED_DATA_FILE, keeps only the text and team columns, drops rows
    with missing values, encodes each team name as an integer in a 'labels'
    column, renames the text column to 'text_input', and performs an 80/20
    stratified split.

    Returns:
        A tuple ``(train_dataset, val_dataset, label_to_id, id_to_label)``, or
        ``None`` when loading or preparation failed. In the failure case a
        "FATAL ERROR" message has already been printed.
    """
    try:
        # Load local CSV data
        print(f"Loading local data from {CLEANED_DATA_FILE}...")
        df = pd.read_csv(CLEANED_DATA_FILE, sep='|')

        # 1. Prepare data: keep only the two needed columns, drop incomplete rows
        df = df[[TEXT_COLUMN, TARGET_TEAM_COLUMN]].dropna()

        # 2. Map labels: convert team names to unique integer ids.
        #    Ids follow the order in which teams first appear in the file.
        unique_labels = df[TARGET_TEAM_COLUMN].unique()
        label_to_id = {label: i for i, label in enumerate(unique_labels)}
        id_to_label = {i: label for i, label in enumerate(unique_labels)}
        df['labels'] = df[TARGET_TEAM_COLUMN].map(label_to_id)

        # Rename the text column to 'text_input', the name tokenize_function reads
        df = df.rename(columns={TEXT_COLUMN: 'text_input'})

        # 3. Split the data, keeping the class proportions equal in both parts
        train_df, val_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df['labels']
        )

        # Convert to Hugging Face Dataset format. The index is reset with
        # drop=True so the old pandas index is not kept as a column.
        train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
        val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

        print(f"Data loaded. Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")
        return train_dataset, val_dataset, label_to_id, id_to_label

    except FileNotFoundError:
        print(f"FATAL ERROR: Data file not found at {CLEANED_DATA_FILE}. Please check the file path.")
        return None
    except KeyError as e:
        print(f"FATAL ERROR: Required column {e} not found in CSV. Please ensure you use the exact names: '{TEXT_COLUMN}' and '{TARGET_TEAM_COLUMN}'.")
        return None
    except Exception as e:
        # Top-level boundary of the script: report any other failure and stop.
        print(f"FATAL ERROR loading data: {e}")
        return None


def _tokenize_split(dataset):
    """Tokenize one dataset split and drop its raw text and team-name columns."""
    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['text_input', TARGET_TEAM_COLUMN]
    )


def main():
    """Fine-tune MODEL_NAME to predict the assigned team and save the result.

    Steps: load and split the data, tokenize both splits, build a sequence
    classification model with one output per team, train it with per-epoch
    evaluation and checkpointing, then write the model and tokenizer to
    SAVE_DIR_TEAM. Returns ``None``; if the data cannot be prepared the
    function returns early after the error has been printed.
    """
    prepared = _load_team_datasets()
    if prepared is None:
        return
    train_dataset, val_dataset, label_to_id, id_to_label = prepared

    # 4. Tokenization (same settings for both splits)
    tokenized_train_dataset = _tokenize_split(train_dataset)
    tokenized_val_dataset = _tokenize_split(val_dataset)

    # 5. Model Initialization: the label maps are stored in the model config
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(label_to_id),
        id2label=id_to_label,
        label2id=label_to_id
    )

    # 6. Define Training Parameters and Trainer
    # NOTE(review): no metric_for_best_model is given, so the library default
    # decides which checkpoint counts as "best" for load_best_model_at_end.
    # Confirm that this is intended rather than the accuracy metric.
    training_args = TrainingArguments(
        output_dir="./team_model_results",
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir='./team_logs',
        logging_steps=100,
        load_best_model_at_end=True,
    )

    # NOTE(review): newer transformers releases name the `tokenizer` argument
    # `processing_class`. Verify against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # 7. Train and Save the Model
    print("Starting training for Assigned Team model...")
    trainer.train()

    print(f"Saving model to {SAVE_DIR_TEAM}...")
    trainer.save_model(SAVE_DIR_TEAM)
    tokenizer.save_pretrained(SAVE_DIR_TEAM)
    print(f"Model and tokenizer saved successfully to {SAVE_DIR_TEAM}.")


# Script entry point: disable caching in the `datasets` library, then run training.
if __name__ == "__main__":
    datasets.disable_caching()
    main()