han/bow.py at master · sinecode/han · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import argparse

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import SGDClassifier

from config import Yelp, Yahoo, Amazon, Synthetic


def run_experiment(vectorizer, text_train, y_train, text_test, y_test):
    """
    Tune and evaluate a linear classifier built on top of ``vectorizer``.

    The vectorizer output is scaled with ``MaxAbsScaler`` and passed to an
    ``SGDClassifier``. The regularisation strength ``alpha`` is chosen by a
    5-fold cross-validated grid search over the training data, and the best
    cross-validation score and parameters are printed.

    Return the accuracy score on the test set.
    """
    alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1]
    # The logistic loss makes the SGD classifier a logistic regression.
    # NOTE(review): recent scikit-learn releases spell this loss "log_loss"
    # and no longer accept "log" -- confirm against the pinned version.
    classifier = SGDClassifier(loss="log", random_state=20)
    search = GridSearchCV(
        # make_pipeline names each step after its lowercased class, which is
        # where the "sgdclassifier__" prefix in the grid key comes from.
        make_pipeline(vectorizer, MaxAbsScaler(), classifier),
        {"sgdclassifier__alpha": alphas},
        cv=5,
        n_jobs=4,
    )
    search.fit(text_train, y_train)
    print(f"Best training score: {search.best_score_}")
    print(f"Best params: {search.best_params_}")
    test_score = search.score(text_test, y_test)
    return test_score


def process_dataset(dataset):
    """
    Run the count-based and the TF-IDF bag-of-words experiments on ``dataset``.

    ``dataset`` must expose ``text`` and ``label`` attributes. 10% of the
    samples are held out as a test set, stratified by label with a fixed
    random seed, and the same split is used for both experiments. The test
    accuracy of each experiment is printed.
    """
    parts = train_test_split(
        dataset.text,
        dataset.label,
        test_size=0.1,
        stratify=dataset.label,
        random_state=20,
    )
    # train_test_split yields train/test pairs in the order of its inputs.
    split = {
        "text_train": parts[0],
        "text_test": parts[1],
        "y_train": parts[2],
        "y_test": parts[3],
    }

    # Raw term counts, vocabulary capped at 50,000 features.
    bow_accuracy = run_experiment(
        vectorizer=CountVectorizer(max_features=50_000), **split
    )
    print(f"BoW: {bow_accuracy}")
    print("")

    # TF-IDF weights with the vectorizer's own normalisation switched off.
    tfidf_accuracy = run_experiment(
        vectorizer=TfidfVectorizer(max_features=50_000, norm=None), **split
    )
    print(f"BoW-TFIDF: {tfidf_accuracy}")


def main():
    """
    Parse the command line, load the chosen dataset and run the experiments.

    The single positional argument selects a dataset configuration. Its
    train, validation and test CSV files are read, missing values are
    replaced with empty strings, and the three parts are concatenated into
    one frame that is handed to ``process_dataset``.

    Exits through ``parser.error`` (usage message on stderr, non-zero exit
    status) when the selected dataset has no configuration available.
    """
    # Dataset name -> configuration object imported from ``config``.
    dataset_configs = {
        "yelp": Yelp,
        "yahoo": Yahoo,
        "amazon": Amazon,
        "synthetic": Synthetic,
    }

    parser = argparse.ArgumentParser(
        description="Run the experiment with a BoW model"
    )
    parser.add_argument(
        "dataset",
        choices=["yelp", "yelp-sample", "yahoo", "amazon", "synthetic"],
        help="Choose the dataset",
    )
    args = parser.parse_args()

    dataset_config = dataset_configs.get(args.dataset)
    if dataset_config is None:
        # "yelp-sample" is an accepted choice but this script imports no
        # configuration for it. Fail loudly instead of exiting silently
        # with a success status.
        # NOTE(review): if ``config`` provides a sample configuration, add
        # it to ``dataset_configs`` above.
        parser.error(
            f"no configuration is available for dataset '{args.dataset}'"
        )

    csv_paths = (
        dataset_config.TRAIN_DATASET,
        dataset_config.VAL_DATASET,
        dataset_config.TEST_DATASET,
    )
    dataset = pd.concat([pd.read_csv(path).fillna("") for path in csv_paths])
    process_dataset(dataset)


# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()