diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py index a9b1556..5c22967 100644 --- a/comprehensive_semhash_test.py +++ b/comprehensive_semhash_test.py @@ -1,8 +1,10 @@ -from __future__ import unicode_literals import sys import re import os from itertools import product +from time import time +import math +import random import codecs import json import csv @@ -12,15 +14,13 @@ import matplotlib.pyplot as plt from collections import OrderedDict from sklearn import model_selection -from time import time from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2 from sklearn.model_selection import StratifiedShuffleSplit from sklearn.neighbors.nearest_centroid import NearestCentroid -import math -import random from tqdm import tqdm from nltk.corpus import wordnet +from __future__ import unicode_literals from sklearn.linear_model import RidgeClassifier from sklearn.pipeline import Pipeline from sklearn.svm import LinearSVC @@ -32,10 +32,10 @@ from sklearn.neighbors import NearestCentroid from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.cluster import KMeans from sklearn.utils.extmath import density from sklearn import metrics -from sklearn.cluster import KMeans -from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV # ## Benchmarking using SemHash on NLU Evaluation Corpora @@ -81,21 +81,12 @@ # os.environ['LDFLAGS'] = '-framework CoreFoundation -framework SystemConfiguration' # !pip3 install spacy print(sys.path) - - - - #coding: utf-8 # import locale # print(locale.getlocale()) - - # Spacy english dataset with vectors needs to be present. It can be downloaded using the following command: # # python -m spacy download en_core_web_lg - - - # !python -m spacy download en_core_web_lg nlp=spacy.load('en_core_web_lg') print('Running') @@ -111,118 +102,121 @@ # for hyper_aug in [True, False]: # augm - - -nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')} -verbs = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('v')} - -def get_synonyms(word, number= 3): - synonyms = [] - for syn in wordnet.synsets(word): - for l in syn.lemmas(): - synonyms.append(l.name().lower().replace("_", " ")) - synonyms = list(OrderedDict.fromkeys(synonyms)) - return synonyms[:number] - #return [token.text for token in most_similar(nlp.vocab[word])] - - - - -print(get_synonyms("search",-1)) - - - - -#Hyperparameters -benchmark_dataset = '' # Choose from 'AskUbuntu', 'Chatbot' or 'WebApplication' -oversample = False # Whether to oversample small classes or not. True in the paper -synonym_extra_samples = False # Whether to replace words by synonyms in the oversampled samples. True in the paper -augment_extra_samples = False # Whether to add random spelling mistakes in the oversampled samples. False in the paper -additional_synonyms = -1 # How many extra synonym augmented sentences to add for each sentence. 0 in the paper -additional_augments = -1 # How many extra spelling mistake augmented sentences to add for each sentence. 0 in the paper -mistake_distance = -1 # How far away on the keyboard a mistake can be -VECTORIZER = "" #which vectorizer to use. choose between "count", "hash", and "tfidf" - -RESULT_FILE = "result5.csv" -METADATA_FILE = "metadata5.csv" -NUMBER_OF_RUNS_PER_SETTING = 10 +class Semhash_test(): + def __init__(self): + self.nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')} + self.verbs = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('v')} + #Specify initial/default values for Hyperparameters + self.benchmark_dataset = '' # Choose from 'AskUbuntu', 'Chatbot' or 'WebApplication' + self.oversample = False # Whether to oversample small classes or not. True in the paper + self.synonym_extra_samples = False # Whether to replace words by synonyms in the oversampled samples. True in the paper + self.augment_extra_samples = False # Whether to add random spelling mistakes in the oversampled samples. False in the paper + self.additional_synonyms = -1 # How many extra synonym augmented sentences to add for each sentence. 0 in the paper + self.additional_augments = -1 # How many extra spelling mistake augmented sentences to add for each sentence. 0 in the paper + self.mistake_distance = -1 # How far away on the keyboard a mistake can be + self.VECTORIZER = "" #which vectorizer to use. choose between "count", "hash", and "tfidf" + #Results are stored in these files + RESULT_FILE = "result5.csv" + METADATA_FILE = "metadata5.csv" + NUMBER_OF_RUNS_PER_SETTING = 10 + + + + def get_synonyms(word, number= 3): + """ + This function returns the synonyms of the word provided as arguement + """ + synonyms = [] + for syn in wordnet.synsets(word): + for l in syn.lemmas(): + synonyms.append(l.name().lower().replace("_", " ")) + synonyms = list(OrderedDict.fromkeys(synonyms)) + return synonyms[:number] + #return [token.text for token in most_similar(nlp.vocab[word])] + print(get_synonyms("search",-1)) #Comprehensive settings testing #for benchmark_dataset, (oversample, synonym_extra_samples, augment_extra_samples), additional_synonyms, additional_augments, mistake_distance, VECTORIZER in product(['AskUbuntu', 'Chatbot', 'WebApplication'], [(False, False, False),(True, False, False),(True, False, True),(True, True, False),(True, True, True)], [0,4], [0,4], [2.1], ["tfidf", "hash", "count"]): #Settings from the original paper -for benchmark_dataset, (oversample, synonym_extra_samples, augment_extra_samples), additional_synonyms, additional_augments, mistake_distance, VECTORIZER in product(['AskUbuntu', 'Chatbot', 'WebApplication'], [(True, True, False)], [0], [0], [2.1], ["tfidf"]): - - if benchmark_dataset == "Chatbot": - intent_dict = {"DepartureTime":0, "FindConnection":1} - elif benchmark_dataset == "AskUbuntu": - intent_dict = {"Make Update":0, "Setup Printer":1, "Shutdown Computer":2, "Software Recommendation":3, "None":4} - elif benchmark_dataset == "WebApplication": - intent_dict = {"Download Video":0, "Change Password":1, "None":2, "Export Data":3, "Sync Accounts":4, - "Filter Spam":5, "Find Alternative":6, "Delete Account":7} - - - - - filename_train = "datasets/KL/" + benchmark_dataset + "/train.csv" - filename_test = "datasets/KL/" + benchmark_dataset + "/test.csv" - - - - - def read_CSV_datafile(filename): - X = [] - y = [] - with open(filename,'r') as csvfile: - reader = csv.reader(csvfile, delimiter='\t') - for row in reader: - X.append(row[0]) - if benchmark_dataset == 'AskUbuntu': - y.append(intent_dict[row[1]]) - elif benchmark_dataset == 'Chatbot': - y.append(intent_dict[row[1]]) - else: - y.append(intent_dict[row[1]]) - return X,y - - - - - def tokenize(doc): - """ - Returns a list of strings containing each token in `sentence` - """ - #return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", - # doc) if i != '' and i != ' ' and i != '\n'] - tokens = [] - doc = nlp.tokenizer(doc) - for token in doc: - tokens.append(token.text) - return tokens - - - - - def preprocess(doc): - clean_tokens = [] - doc = nlp(doc) - for token in doc: - if not token.is_stop: - clean_tokens.append(token.lemma_) - return " ".join(clean_tokens) - - + def benchmark_dataset() + for benchmark_dataset, (oversample, synonym_extra_samples, augment_extra_samples), additional_synonyms, additional_augments, mistake_distance, VECTORIZER in product(['AskUbuntu', 'Chatbot', 'WebApplication'], [(True, True, False)], [0], [0], [2.1], ["tfidf"]): + if benchmark_dataset == "Chatbot": + intent_dict = {"DepartureTime" :0, + "FindConnection":1} + + elif benchmark_dataset == "AskUbuntu": + intent_dict = {"Make Update" :0, + "Setup Printer":1, + "Shutdown Computer":2, + "Software Recommendation":3, + "None":4} + elif benchmark_dataset == "WebApplication": + intent_dict = {"Download Video":0, + "Change Password":1, + "None":2, + "Export Data":3, + "Sync Accounts":4, + "Filter Spam":5, + "Find Alternative":6, + "Delete Account":7} + #Defining the train and test files + filename_train = "datasets/KL/" + benchmark_dataset + "/train.csv" + filename_test = "datasets/KL/" + benchmark_dataset + "/test.csv" + + def read_CSV_datafile(filename): + """ + This function reads data from the csv file provided as arguement + """ + X = [] + y = [] + with open(filename,'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + for row in reader: + X.append(row[0]) + if benchmark_dataset == 'AskUbuntu': + y.append(intent_dict[row[1]]) + + elif benchmark_dataset == 'Chatbot': + y.append(intent_dict[row[1]]) + + else: + y.append(intent_dict[row[1]]) + return X,y + + def tokenize(doc): + """ + Returns a list of strings containing each token in `sentence` + """ + #return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", + # doc) if i != '' and i != ' ' and i != '\n'] + tokens = [] + doc = nlp.tokenizer(doc) + for token in doc: + tokens.append(token.text) + return tokens + + def preprocess(doc): + clean_tokens = [] + doc = nlp(doc) + for token in doc: + if not token.is_stop: + clean_tokens.append(token.lemma_) + return " ".join(clean_tokens) #********* Data augmentation part ************** - class MeraDataset(): - """ Class to find typos based on the keyboard distribution, for QWERTY style keyboards - - It's the actual test set as defined in the paper that we comparing against.""" +class MeraDataset(): + """ + Class to find typos based on the keyboard distribution, for QWERTY style keyboards + It's the actual test set as defined in the paper that we comparing against. + """ def __init__(self, dataset_path): - """ Instantiate the object. - @param: dataset_path The directory which contains the data set.""" + """ + Instantiate the object. + @param: dataset_path The directory which contains the data set. + """ self.dataset_path = dataset_path self.X_test, self.y_test, self.X_train, self.y_train = self.load() self.keyboard_cartesian = {'q': {'x': 0, 'y': 0}, 'w': {'x': 1, 'y': 0}, 'e': {'x': 2, 'y': 0}, @@ -235,45 +229,49 @@ def __init__(self, dataset_path): 'h': {'x': 5, 'y': 1}, 'k': {'x': 7, 'y': 1}, 'ö': {'x': 11,'y': 0}, 'l': {'x': 8, 'y': 1}, 'v': {'x': 3, 'y': 2}, 'n': {'x': 5, 'y': 2}, 'ß': {'x': 10,'y': 2}, 'ü': {'x': 10,'y': 2}, 'ä': {'x': 10,'y': 0}} + self.nearest_to_i = self.get_nearest_to_i(self.keyboard_cartesian) self.splits = self.stratified_split() def get_nearest_to_i(self, keyboard_cartesian): - """ Get the nearest key to the one read. - @params: keyboard_cartesian The layout of the QWERTY keyboard for English - - return dictionary of eaculidean distances for the characters""" + """ + Get the nearest key to the one read. + @params: keyboard_cartesian The layout of the QWERTY keyboard for English + return dictionary of eaculidean distances for the characters. + """ nearest_to_i = {} for i in keyboard_cartesian.keys(): nearest_to_i[i] = [] + for j in keyboard_cartesian.keys(): if self._euclidean_distance(i, j) < mistake_distance: #was > 1.2 nearest_to_i[i].append(j) return nearest_to_i def _shuffle_word(self, word, cutoff=0.7): - """ Rearange the given characters in a word simulating typos given a probability. - - @param: word A single word coming from a sentence - @param: cutoff The cutoff probability to make a change (default 0.9) - - return The word rearranged - """ + """ + Rearange the given characters in a word simulating typos given a probability. + @param: word A single word coming from a sentence + @param: cutoff The cutoff probability to make a change (default 0.9) + return The word rearranged + """ word = list(word.lower()) if random.uniform(0, 1.0) > cutoff: loc = np.random.randint(0, len(word)) + if word[loc] in self.keyboard_cartesian: word[loc] = random.choice(self.nearest_to_i[word[loc]]) - return ''.join(word) + return ''.join(word) def _euclidean_distance(self, a, b): - """ Calculates the euclidean between 2 points in the keyboard + """ + Calculates the euclidean between 2 points in the keyboard @param: a Point one @param: b Point two - - return The euclidean distance between the two points""" - X = (self.keyboard_cartesian[a]['x'] - self.keyboard_cartesian[b]['x']) ** 2 + return The euclidean distance between the two points + """ + X = (self.keyboard_cartesian[a]['x'] - self.keyboard_cartesian[b]['x']) ** 2 Y = (self.keyboard_cartesian[a]['y'] - self.keyboard_cartesian[b]['y']) ** 2 return math.sqrt(X + Y) @@ -284,8 +282,8 @@ def _augment_sentence(self, sentence, num_samples): """ Augment the dataset of file with a sentence shuffled @param: sentence The sentence from the set @param: num_samples The number of sentences to genererate - return A set of augmented sentences""" + sentences = [] for _ in range(num_samples): sentences.append(self._get_augment_sentence(sentence)) @@ -306,34 +304,40 @@ def _augment_split(self, X_train, y_train, num_samples=100): sample = [[Xs.append(item), ys.append(y)] for item in tmp_x] # print(X, y) # print(self.augmentedFile+str(self.nSamples)+".csv") - - with open("./datasets/KL/Chatbot/train_augmented.csv", 'w', encoding='utf8') as csvFile: - fileWriter = csv.writer(csvFile, delimiter='\t') - for i in range(0, len(Xs)-1): + fileWriter = csv.writer(csvFile, delimiter='\t') + for i in range(0, len(Xs)-1): fileWriter.writerow([Xs[i] + '\t' + ys[i]]) # print(Xs[i], "\t", ys[i]) # print(Xs[i]) # fileWriter.writerows(Xs + ['\t'] + ys) return Xs, ys - # Randomly replaces the nouns and verbs by synonyms +class Synonym_estimate(): + def _synonym_word(self, word, cutoff=0.5): + """ Randomly replaces the nouns and verbs by synonyms + """ if random.uniform(0, 1.0) > cutoff and len(get_synonyms(word)) > 0 and word in nouns and word in verbs: return random.choice(get_synonyms(word)) return word - # Randomly replace words (nouns and verbs) in sentence by synonyms + def _get_synonym_sentence(self, sentence, cutoff = 0.5): + """ + Randomly replace words (nouns and verbs) in sentence by synonyms + """ return ' '.join([self._synonym_word(item, cutoff) for item in sentence.split(' ')]) # For all classes except the largest ones; add duplicate (possibly augmented) samples until all classes have the same size def _oversample_split(self, X_train, y_train, synonym_extra_samples = False, augment_extra_samples = False): - """ Split the oversampled train dataset + """ + Split the oversampled train dataset @param: X_train The full array of sentences @param: y_train The train labels in the train dataset - return Oversampled training dataset""" + return Oversampled training dataset + """ classes = {} for X, y in zip(X_train, y_train): @@ -381,12 +385,14 @@ def _synonym_split(self, X_train, y_train, num_samples=100): return Xs, ys def load(self): - """ Load the file for now only the test.csv, train.csv files hardcoded + """ + Load the file for now only the test.csv, train.csv files hardcoded - return The vector separated in test, train and the labels for each one""" + return The vector separated in test, train and the labels for each one + """ with open(self.dataset_path) as csvfile: - readCSV = csv.reader(csvfile, delimiter=' ') - all_rows = list(readCSV) + readCSV = csv.reader(csvfile, delimiter=' ') + all_rows = list(readCSV) # for i in all_rows: # if i == 28823: # print(all_rows[i]) @@ -394,17 +400,19 @@ def load(self): y_test = [a[1] for a in all_rows] with open(self.dataset_path) as csvfile: - readCSV = csv.reader(csvfile, delimiter='\t') - all_rows = list(readCSV) - X_train = [a[0] for a in all_rows] - y_train = [a[1] for a in all_rows] + readCSV = csv.reader(csvfile, delimiter='\t') + all_rows = list(readCSV) + X_train = [a[0] for a in all_rows] + y_train = [a[1] for a in all_rows] return X_test, y_test, X_train, y_train def process_sentence(self, x): - """ Clean the tokens from stop words in a sentence. - @param x Sentence to get rid of stop words. + """ + Clean the tokens from stop words in a sentence. + @param x Sentence to get rid of stop words. - returns clean string sentence""" + returns clean string sentence + """ clean_tokens = [] doc = nlp.tokenizer(x) for token in doc: @@ -413,19 +421,24 @@ def process_sentence(self, x): return " ".join(clean_tokens) def process_batch(self, X): - """See the progress as is coming along. + """ + See the progress as is coming along. - return list[] of clean sentences""" + return list[] of clean sentences + """ return [self.process_sentence(a) for a in tqdm(X)] def stratified_split(self): - """ Split data whole into stratified test and training sets, then remove stop word from sentences + """ + Split data whole into stratified test and training sets, then remove stop word from sentences - return list of dictionaries with keys train,test and values the x and y for each one""" + return list of dictionaries with keys train,test and values the x and y for each one + """ self.X_train, self.X_test = ([preprocess(sentence) for sentence in self.X_train],[preprocess(sentence) for sentence in self.X_test]) print(self.X_train) if oversample: self.X_train, self.y_train = self._oversample_split(self.X_train, self.y_train, synonym_extra_samples, augment_extra_samples) + if additional_synonyms > 0: self.X_train, self.y_train = self._synonym_split(self.X_train, self.y_train, additional_synonyms) if additional_augments > 0: @@ -443,41 +456,36 @@ def get_splits(self): #**************************************************** + def split() + print("./datasets/KL/" + benchmark_dataset + "/train.csv") + t0 = time() + dataset = MeraDataset("./datasets/KL/" + benchmark_dataset + "/train.csv") + print("mera****************************") + splits = dataset.get_splits() + xS_train = [] + yS_train = [] + for elem in splits[0]["train"]["X"]: + xS_train.append(elem) + print(xS_train[:5]) - print("./datasets/KL/" + benchmark_dataset + "/train.csv") - t0 = time() - dataset = MeraDataset("./datasets/KL/" + benchmark_dataset + "/train.csv") - - print("mera****************************") - splits = dataset.get_splits() - xS_train = [] - yS_train = [] - for elem in splits[0]["train"]["X"]: - xS_train.append(elem) - print(xS_train[:5]) - - for elem in splits[0]["train"]["y"]: - yS_train.append(intent_dict[elem]) - preprocess_time = time()-t0 - print(len(xS_train)) + for elem in splits[0]["train"]["y"]: + yS_train.append(intent_dict[elem]) + preprocess_time = time()-t0 + print(len(xS_train)) + X_train_raw, y_train_raw = read_CSV_datafile(filename = filename_train) + X_test_raw, y_test_raw = read_CSV_datafile(filename = filename_test) + print(y_train_raw[:5]) + print(X_test_raw[:5]) + print(y_test_raw[:5]) + X_train_raw = xS_train + y_train_raw = yS_train + print("Training data samples: \n",X_train_raw, "\n\n") + print("Class Labels: \n", y_train_raw, "\n\n") - - X_train_raw, y_train_raw = read_CSV_datafile(filename = filename_train) - X_test_raw, y_test_raw = read_CSV_datafile(filename = filename_test) - print(y_train_raw[:5]) - print(X_test_raw[:5]) - print(y_test_raw[:5]) - X_train_raw = xS_train - y_train_raw = yS_train - - print("Training data samples: \n",X_train_raw, "\n\n") - - print("Class Labels: \n", y_train_raw, "\n\n") - - print("Size of Training Data: {}".format(len(X_train_raw))) + print("Size of Training Data: {}".format(len(X_train_raw))) # @@ -486,7 +494,7 @@ def get_splits(self): # # SemHash - +class Semhash_estimate(): def find_ngrams(input_list, n): return zip(*[input_list[i:] for i in range(n)]) @@ -512,8 +520,6 @@ def semhash_corpus(corpus): X_train_raw = semhash_corpus(X_train_raw) X_test_raw = semhash_corpus(X_test_raw) semhash_time = time()-t0 - - print(X_train_raw[:5]) print(y_train_raw[:5]) print() @@ -527,27 +533,28 @@ def get_vectorizer(corpus, preprocessor=None, tokenizer=None): vectorizer = CountVectorizer(analyzer='word')#,ngram_range=(1,1)) vectorizer.fit(corpus) feature_names = vectorizer.get_feature_names() + elif VECTORIZER == "hash": vectorizer = HashingVectorizer(analyzer='word', n_features=2**10, non_negative=True) vectorizer.fit(corpus) feature_names = None + elif VECTORIZER == "tfidf": vectorizer = TfidfVectorizer(analyzer='word') vectorizer.fit(corpus) feature_names = vectorizer.get_feature_names() + else: raise Exception("{} is not a recognized Vectorizer".format(VECTORIZER)) return vectorizer, feature_names - - def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" return s if len(s) <= 80 else s[:77] + "..." - # ############################################################################# # Benchmark classifiers +class Classifiers_metrics(): def benchmark(clf, X_train, y_train, X_test, y_test, target_names, print_report=True, feature_names=None, print_top10=False, print_cm=True): @@ -662,21 +669,34 @@ def data_for_training(): print("Evaluating Split {}".format(i_s)) target_names = None if benchmark_dataset == "Chatbot": - target_names = ["Departure Time", "Find Connection"] + target_names = ["Departure Time", + "Find Connection"] + elif benchmark_dataset == "AskUbuntu": - target_names = ["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"] + target_names = ["Make Update", + "Setup Printer", + "Shutdown Computer", + "Software Recommendation", + "None"] + elif benchmark_dataset == "WebApplication": - target_names = ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts", - "Filter Spam", "Find Alternative", "Delete Account"] + target_names = ["Download Video", + "Change Password", + "None", + "Export Data", + "Sync Accounts", + "Filter Spam", + "Find Alternative", + "Delete Account"] print("Train Size: {}\nTest Size: {}".format(X_train.shape[0], X_test.shape[0])) results = [] #alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) parameters_mlp={'hidden_layer_sizes':[(100,50), (300, 100),(300,200,100)]} parameters_RF={ "n_estimators" : [50,60,70], - "min_samples_leaf" : [1, 11]} - k_range = list(range(3,7)) + "min_samples_leaf" : [1, 11]} + k_range = list(range(3,7)) parameters_knn = {'n_neighbors':k_range} - knn=KNeighborsClassifier(n_neighbors=5) + knn= KNeighborsClassifier(n_neighbors=5) for clf, name in [ (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), (GridSearchCV(knn,parameters_knn, cv=5),"gridsearchknn"), @@ -768,12 +788,9 @@ def data_for_training(): #print('=' * 80) #print("KMeans") results.append(benchmark(KMeans(n_clusters=2, init='k-means++', max_iter=300, - verbose=0, random_state=0, tol=1e-4), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) - - - + verbose=0, random_state=0, tol=1e-4), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) #print('=' * 80) #print("LogisticRegression") kfold = model_selection.KFold(n_splits=2, random_state=0) @@ -781,16 +798,10 @@ def data_for_training(): fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) #plot_results(results) - - - - - - print(len(X_train))