diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py index a9b1556..2e941d8 100644 --- a/comprehensive_semhash_test.py +++ b/comprehensive_semhash_test.py @@ -37,761 +37,696 @@ from sklearn.cluster import KMeans from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV - -# ## Benchmarking using SemHash on NLU Evaluation Corpora -# -# This notebook benchmarks the results on the 3 NLU Evaluation Corpora: -# 1. Ask Ubuntu Corpus -# 2. Chatbot Corpus -# 3. Web Application Corpus -# -# -# More information about the dataset is available here: -# -# https://github.com/sebischair/NLU-Evaluation-Corpora -# -# -# * Semantic Hashing is used as a featurizer. The idea is taken from the paper: -# -# https://www.microsoft.com/en-us/research/publication/learning-deep-structured-semantic-models-for-web-search-using-clickthrough-data/ -# -# * Benchmarks are performed on the same train and test datasets used by the other benchmarks performed in the past. One important paper that benchmarks the datasets mentioned above on some important platforms (Dialogflow, Luis, Watson and RASA) is : -# -# http://workshop.colips.org/wochat/@sigdial2017/documents/SIGDIAL22.pdf -# -# * Furthermore, Botfuel made another benchmarks with more platforms (Recast, Snips and their own) and results can be found here: -# -# https://github.com/Botfuel/benchmark-nlp-2018 -# -# * The blogposts about the benchmarks done in the past are available at : -# -# https://medium.com/botfuel/benchmarking-intent-classification-services-june-2018-eb8684a1e55f -# -# https://medium.com/snips-ai/an-introduction-to-snips-nlu-the-open-source-library-behind-snips-embedded-voice-platform-b12b1a60a41a -# -# * To be very fair on our benchmarks and results, we used the same train and test set used by the other benchmarks and no cross validation or stratified splits were used. The test data was not used in any way to improve the results. The dataset used can be found here: -# -# https://github.com/Botfuel/benchmark-nlp-2018/tree/master/results -# -# - - - -# import os -# os.environ['LDFLAGS'] = '-framework CoreFoundation -framework SystemConfiguration' -# !pip3 install spacy print(sys.path) - - - -#coding: utf-8 -# import locale -# print(locale.getlocale()) - - -# Spacy english dataset with vectors needs to be present. It can be downloaded using the following command: -# -# python -m spacy download en_core_web_lg - - - -# !python -m spacy download en_core_web_lg nlp=spacy.load('en_core_web_lg') print('Running') -# for hyper_bench in ['AskUbuntu', 'Chatbot', 'WebApplication']: -# benchmark_dataset = hyper - -# for hyper_over in [True, False]: -# oversample = hyper_over - -# for hyper_syn_extra in [True, False]: -# synonym_extra_samples = hyper_syn_extra - -# for hyper_aug in [True, False]: -# augm - - -nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')} -verbs = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('v')} - -def get_synonyms(word, number= 3): - synonyms = [] - for syn in wordnet.synsets(word): - for l in syn.lemmas(): - synonyms.append(l.name().lower().replace("_", " ")) - synonyms = list(OrderedDict.fromkeys(synonyms)) - return synonyms[:number] - #return [token.text for token in most_similar(nlp.vocab[word])] - - - - -print(get_synonyms("search",-1)) - +class MeraDataset(): + + def benchmark_dataset(){ + for benchmark_datset,(oversample, synonym_extra_samples, augment_extra_samples), additional_synonyms, additional_augments, mistake_distance, VECTORIZER in product(['AskUbuntu', 'Chatbot', 'WebApplication'], [(True, True, False)], [0], [0], [2.1], ["tfidf"]): + if benchmark_dataset == "Chatbot": + intent_dict = {"DepartureTime":0, + "FindConnection":1} + elif benchmark_dataset == "AskUbuntu": + intent_dict = {"Make Update":0, + "Setup Printer":1, + "Shutdown Computer":2, + "Software Recommendation":3, + "None":4} + elif benchmark_dataset == "WebApplication": + intent_dict = {"Download Video":0, + "Change Password":1, + "None":2, "Export Data":3, + "Sync Accounts":4, + "Filter Spam":5, + "Find Alternative":6, + "Delete Account":7} + } -#Hyperparameters -benchmark_dataset = '' # Choose from 'AskUbuntu', 'Chatbot' or 'WebApplication' -oversample = False # Whether to oversample small classes or not. True in the paper -synonym_extra_samples = False # Whether to replace words by synonyms in the oversampled samples. True in the paper -augment_extra_samples = False # Whether to add random spelling mistakes in the oversampled samples. False in the paper -additional_synonyms = -1 # How many extra synonym augmented sentences to add for each sentence. 0 in the paper -additional_augments = -1 # How many extra spelling mistake augmented sentences to add for each sentence. 0 in the paper -mistake_distance = -1 # How far away on the keyboard a mistake can be -VECTORIZER = "" #which vectorizer to use. choose between "count", "hash", and "tfidf" - -RESULT_FILE = "result5.csv" -METADATA_FILE = "metadata5.csv" -NUMBER_OF_RUNS_PER_SETTING = 10 - -#Comprehensive settings testing -#for benchmark_dataset, (oversample, synonym_extra_samples, augment_extra_samples), additional_synonyms, additional_augments, mistake_distance, VECTORIZER in product(['AskUbuntu', 'Chatbot', 'WebApplication'], [(False, False, False),(True, False, False),(True, False, True),(True, True, False),(True, True, True)], [0,4], [0,4], [2.1], ["tfidf", "hash", "count"]): -#Settings from the original paper -for benchmark_dataset, (oversample, synonym_extra_samples, augment_extra_samples), additional_synonyms, additional_augments, mistake_distance, VECTORIZER in product(['AskUbuntu', 'Chatbot', 'WebApplication'], [(True, True, False)], [0], [0], [2.1], ["tfidf"]): - if benchmark_dataset == "Chatbot": - intent_dict = {"DepartureTime":0, "FindConnection":1} - elif benchmark_dataset == "AskUbuntu": - intent_dict = {"Make Update":0, "Setup Printer":1, "Shutdown Computer":2, "Software Recommendation":3, "None":4} - elif benchmark_dataset == "WebApplication": - intent_dict = {"Download Video":0, "Change Password":1, "None":2, "Export Data":3, "Sync Accounts":4, - "Filter Spam":5, "Find Alternative":6, "Delete Account":7} + filename_train = "datasets/KL/" + benchmark_dataset + "/train.csv" + filename_test = "datasets/KL/" + benchmark_dataset + "/test.csv" - filename_train = "datasets/KL/" + benchmark_dataset + "/train.csv" - filename_test = "datasets/KL/" + benchmark_dataset + "/test.csv" + def read_CSV_datafile(filename): + X = [] + y = [] + with open(filename,'r') as csvfile: + reader = csv.reader(csvfile, delimiter='\t') + for row in reader: + X.append(row[0]) + if benchmark_dataset == 'AskUbuntu': + y.append(intent_dict[row[1]]) + elif benchmark_dataset == 'Chatbot': + y.append(intent_dict[row[1]]) + else: + y.append(intent_dict[row[1]]) + return X,y - def read_CSV_datafile(filename): - X = [] - y = [] - with open(filename,'r') as csvfile: - reader = csv.reader(csvfile, delimiter='\t') - for row in reader: - X.append(row[0]) - if benchmark_dataset == 'AskUbuntu': - y.append(intent_dict[row[1]]) - elif benchmark_dataset == 'Chatbot': - y.append(intent_dict[row[1]]) - else: - y.append(intent_dict[row[1]]) - return X,y + def tokenize(doc): + """ + Returns a list of strings containing each token in `sentence` + """ + #return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", + # doc) if i != '' and i != ' ' and i != '\n'] + tokens = [] + doc = nlp.tokenizer(doc) + for token in doc: + tokens.append(token.text) + return tokens - def tokenize(doc): - """ - Returns a list of strings containing each token in `sentence` - """ - #return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", - # doc) if i != '' and i != ' ' and i != '\n'] - tokens = [] - doc = nlp.tokenizer(doc) - for token in doc: - tokens.append(token.text) - return tokens - - - - - def preprocess(doc): - clean_tokens = [] - doc = nlp(doc) - for token in doc: - if not token.is_stop: - clean_tokens.append(token.lemma_) - return " ".join(clean_tokens) - - - - - #********* Data augmentation part ************** - class MeraDataset(): - """ Class to find typos based on the keyboard distribution, for QWERTY style keyboards - - It's the actual test set as defined in the paper that we comparing against.""" - - def __init__(self, dataset_path): - """ Instantiate the object. - @param: dataset_path The directory which contains the data set.""" - self.dataset_path = dataset_path - self.X_test, self.y_test, self.X_train, self.y_train = self.load() - self.keyboard_cartesian = {'q': {'x': 0, 'y': 0}, 'w': {'x': 1, 'y': 0}, 'e': {'x': 2, 'y': 0}, - 'r': {'x': 3, 'y': 0}, 't': {'x': 4, 'y': 0}, 'y': {'x': 5, 'y': 0}, - 'u': {'x': 6, 'y': 0}, 'i': {'x': 7, 'y': 0}, 'o': {'x': 8, 'y': 0}, - 'p': {'x': 9, 'y': 0}, 'a': {'x': 0, 'y': 1}, 'z': {'x': 0, 'y': 2}, - 's': {'x': 1, 'y': 1}, 'x': {'x': 1, 'y': 2}, 'd': {'x': 2, 'y': 1}, - 'c': {'x': 2, 'y': 2}, 'f': {'x': 3, 'y': 1}, 'b': {'x': 4, 'y': 2}, - 'm': {'x': 6, 'y': 2}, 'j': {'x': 6, 'y': 1}, 'g': {'x': 4, 'y': 1}, - 'h': {'x': 5, 'y': 1}, 'k': {'x': 7, 'y': 1}, 'ö': {'x': 11,'y': 0}, - 'l': {'x': 8, 'y': 1}, 'v': {'x': 3, 'y': 2}, 'n': {'x': 5, 'y': 2}, - 'ß': {'x': 10,'y': 2}, 'ü': {'x': 10,'y': 2}, 'ä': {'x': 10,'y': 0}} - self.nearest_to_i = self.get_nearest_to_i(self.keyboard_cartesian) - self.splits = self.stratified_split() - - - def get_nearest_to_i(self, keyboard_cartesian): - """ Get the nearest key to the one read. - @params: keyboard_cartesian The layout of the QWERTY keyboard for English - - return dictionary of eaculidean distances for the characters""" - nearest_to_i = {} - for i in keyboard_cartesian.keys(): - nearest_to_i[i] = [] - for j in keyboard_cartesian.keys(): - if self._euclidean_distance(i, j) < mistake_distance: #was > 1.2 - nearest_to_i[i].append(j) - return nearest_to_i - - def _shuffle_word(self, word, cutoff=0.7): - """ Rearange the given characters in a word simulating typos given a probability. - - @param: word A single word coming from a sentence - @param: cutoff The cutoff probability to make a change (default 0.9) - - return The word rearranged - """ - word = list(word.lower()) - if random.uniform(0, 1.0) > cutoff: - loc = np.random.randint(0, len(word)) - if word[loc] in self.keyboard_cartesian: - word[loc] = random.choice(self.nearest_to_i[word[loc]]) - return ''.join(word) - - def _euclidean_distance(self, a, b): - """ Calculates the euclidean between 2 points in the keyboard - @param: a Point one - @param: b Point two - - return The euclidean distance between the two points""" - X = (self.keyboard_cartesian[a]['x'] - self.keyboard_cartesian[b]['x']) ** 2 - Y = (self.keyboard_cartesian[a]['y'] - self.keyboard_cartesian[b]['y']) ** 2 - return math.sqrt(X + Y) - - def _get_augment_sentence(self, sentence): - return ' '.join([self._shuffle_word(item) for item in sentence.split(' ')]) - - def _augment_sentence(self, sentence, num_samples): - """ Augment the dataset of file with a sentence shuffled - @param: sentence The sentence from the set - @param: num_samples The number of sentences to genererate - - return A set of augmented sentences""" - sentences = [] - for _ in range(num_samples): - sentences.append(self._get_augment_sentence(sentence)) - sentences = list(set(sentences)) - # print("sentences", sentences) - return sentences + [sentence] - - def _augment_split(self, X_train, y_train, num_samples=100): - """ Split the augmented train dataset - @param: X_train The full array of sentences - @param: y_train The train labels in the train dataset - @param: num_samples the number of new sentences to create (default 1000) - - return Augmented training dataset""" - Xs, ys = [], [] - for X, y in zip(X_train, y_train): - tmp_x = self._augment_sentence(X, num_samples) - sample = [[Xs.append(item), ys.append(y)] for item in tmp_x] - # print(X, y) - # print(self.augmentedFile+str(self.nSamples)+".csv") - - with open("./datasets/KL/Chatbot/train_augmented.csv", 'w', encoding='utf8') as csvFile: - fileWriter = csv.writer(csvFile, delimiter='\t') - for i in range(0, len(Xs)-1): - fileWriter.writerow([Xs[i] + '\t' + ys[i]]) - # print(Xs[i], "\t", ys[i]) - # print(Xs[i]) - # fileWriter.writerows(Xs + ['\t'] + ys) - return Xs, ys - - # Randomly replaces the nouns and verbs by synonyms - def _synonym_word(self, word, cutoff=0.5): - if random.uniform(0, 1.0) > cutoff and len(get_synonyms(word)) > 0 and word in nouns and word in verbs: - return random.choice(get_synonyms(word)) - return word - - # Randomly replace words (nouns and verbs) in sentence by synonyms - def _get_synonym_sentence(self, sentence, cutoff = 0.5): - return ' '.join([self._synonym_word(item, cutoff) for item in sentence.split(' ')]) - - # For all classes except the largest ones; add duplicate (possibly augmented) samples until all classes have the same size - def _oversample_split(self, X_train, y_train, synonym_extra_samples = False, augment_extra_samples = False): - """ Split the oversampled train dataset - @param: X_train The full array of sentences - @param: y_train The train labels in the train dataset - - return Oversampled training dataset""" - - classes = {} - for X, y in zip(X_train, y_train): - if y not in classes: - classes[y] = [] - classes[y].append(X) - - max_class_size = max([len(entries) for entries in classes.values()]) - - Xs, ys = [],[] - for y in classes.keys(): - for i in range(max_class_size): - sentence = classes[y][i % len(classes[y])] - if i >= len(classes[y]): - if synonym_extra_samples: - sentence = self._get_synonym_sentence(sentence) - if augment_extra_samples: - sentence = self._get_augment_sentence(sentence) - Xs.append(sentence) - ys.append(y) - - #with open(filename_train+"augment", 'w', encoding='utf8') as csvFile: - # fileWriter = csv.writer(csvFile, delimiter='\t') - # for i in range(0, len(Xs)-1): - # fileWriter.writerow([Xs[i] + '\t' + ys[i]]) - - return Xs, ys - - def _synonym_split(self, X_train, y_train, num_samples=100): - """ Split the augmented train dataset - @param: X_train The full array of sentences - @param: y_train The train labels in the train dataset - @param: num_samples the number of new sentences to create (default 1000) - - return Augmented training dataset""" - Xs, ys = [], [] - for X, y in zip(X_train, y_train): - sample = [[Xs.append(self._get_synonym_sentence(X)), ys.append(y)] for item in range(additional_synonyms)] - # print(X, y) - - #with open(filename_train+"augment", 'w', encoding='utf8') as csvFile: - # fileWriter = csv.writer(csvFile, delimiter='\t') - # for i in range(0, len(Xs)-1): - # fileWriter.writerow([Xs[i] + '\t' + ys[i]]) - return Xs, ys - - def load(self): - """ Load the file for now only the test.csv, train.csv files hardcoded - - return The vector separated in test, train and the labels for each one""" - with open(self.dataset_path) as csvfile: - readCSV = csv.reader(csvfile, delimiter=' ') - all_rows = list(readCSV) - # for i in all_rows: - # if i == 28823: - # print(all_rows[i]) - X_test = [a[0] for a in all_rows] - y_test = [a[1] for a in all_rows] - - with open(self.dataset_path) as csvfile: - readCSV = csv.reader(csvfile, delimiter='\t') - all_rows = list(readCSV) - X_train = [a[0] for a in all_rows] - y_train = [a[1] for a in all_rows] - return X_test, y_test, X_train, y_train - - def process_sentence(self, x): - """ Clean the tokens from stop words in a sentence. - @param x Sentence to get rid of stop words. - - returns clean string sentence""" + def preprocess(doc): clean_tokens = [] - doc = nlp.tokenizer(x) + doc = nlp(doc) for token in doc: if not token.is_stop: clean_tokens.append(token.lemma_) return " ".join(clean_tokens) - def process_batch(self, X): - """See the progress as is coming along. - - return list[] of clean sentences""" - return [self.process_sentence(a) for a in tqdm(X)] - - def stratified_split(self): - """ Split data whole into stratified test and training sets, then remove stop word from sentences - - return list of dictionaries with keys train,test and values the x and y for each one""" - self.X_train, self.X_test = ([preprocess(sentence) for sentence in self.X_train],[preprocess(sentence) for sentence in self.X_test]) - print(self.X_train) - if oversample: - self.X_train, self.y_train = self._oversample_split(self.X_train, self.y_train, synonym_extra_samples, augment_extra_samples) - if additional_synonyms > 0: - self.X_train, self.y_train = self._synonym_split(self.X_train, self.y_train, additional_synonyms) - if additional_augments > 0: - self.X_train, self.y_train = self._augment_split(self.X_train, self.y_train, additional_augments) - - splits = [{"train": {"X": self.X_train, "y": self.y_train}, - "test": {"X": self.X_test, "y": self.y_test}}] - return splits - def get_splits(self): - """ Get the splitted sentences - return splitted list of dictionaries""" - return self.splits - #**************************************************** + #********* Data augmentation part ************** + """ Class to find typos based on the keyboard distribution, for QWERTY style keyboards + It's the actual test set as defined in the paper that we comparing against.""" - print("./datasets/KL/" + benchmark_dataset + "/train.csv") - t0 = time() - dataset = MeraDataset("./datasets/KL/" + benchmark_dataset + "/train.csv") - - print("mera****************************") - splits = dataset.get_splits() - xS_train = [] - yS_train = [] - for elem in splits[0]["train"]["X"]: - xS_train.append(elem) - print(xS_train[:5]) - - for elem in splits[0]["train"]["y"]: - yS_train.append(intent_dict[elem]) - preprocess_time = time()-t0 - print(len(xS_train)) - - - - - X_train_raw, y_train_raw = read_CSV_datafile(filename = filename_train) - X_test_raw, y_test_raw = read_CSV_datafile(filename = filename_test) - print(y_train_raw[:5]) - print(X_test_raw[:5]) - print(y_test_raw[:5]) - X_train_raw = xS_train - y_train_raw = yS_train - - print("Training data samples: \n",X_train_raw, "\n\n") - - print("Class Labels: \n", y_train_raw, "\n\n") - - print("Size of Training Data: {}".format(len(X_train_raw))) - + def __init__(self, dataset_path, mistake_distance, nouns, verbs, + additional_synonyms, additional_augments, synonym_extra_samples, + augment_extra_samples, oversample): + """ Instantiate the object. + + @Param dataset_path: The directory which contains the data set + """ + self.oversample = oversample + self.augment_extra_samples = augment_extra_samples + self.synonym_extra_samples = synonym_extra_samples + self.additional_augments = additional_augments + self.additional_synonyms = additional_synonyms + self.verbs = verbs + self.nouns = nouns + self.mistake_distance=mistake_distance + self.dataset_path = dataset_path + self.X_test, self.y_test, self.X_train, self.y_train = self.load() + self.keyboard_cartesian = {'q': {'x': 0, 'y': 0}, 'w': {'x': 1, 'y': 0}, 'e': {'x': 2, 'y': 0}, + 'r': {'x': 3, 'y': 0}, 't': {'x': 4, 'y': 0}, 'y': {'x': 5, 'y': 0}, + 'u': {'x': 6, 'y': 0}, 'i': {'x': 7, 'y': 0}, 'o': {'x': 8, 'y': 0}, + 'p': {'x': 9, 'y': 0}, 'a': {'x': 0, 'y': 1}, 'z': {'x': 0, 'y': 2}, + 's': {'x': 1, 'y': 1}, 'x': {'x': 1, 'y': 2}, 'd': {'x': 2, 'y': 1}, + 'c': {'x': 2, 'y': 2}, 'f': {'x': 3, 'y': 1}, 'b': {'x': 4, 'y': 2}, + 'm': {'x': 6, 'y': 2}, 'j': {'x': 6, 'y': 1}, 'g': {'x': 4, 'y': 1}, + 'h': {'x': 5, 'y': 1}, 'k': {'x': 7, 'y': 1}, 'ö': {'x': 11,'y': 0}, + 'l': {'x': 8, 'y': 1}, 'v': {'x': 3, 'y': 2}, 'n': {'x': 5, 'y': 2}, + 'ß': {'x': 10,'y': 2}, 'ü': {'x': 10,'y': 2}, 'ä': {'x': 10,'y': 0}} + self.nearest_to_i = self.get_nearest_to_i(self.keyboard_cartesian) + self.splits = self.stratified_split() + + + def get_nearest_to_i(self, keyboard_cartesian): + """ Get the nearest key to the one read. + @params: keyboard_cartesian The layout of the QWERTY keyboard for English + + return dictionary of eaculidean distances for the characters""" + nearest_to_i = {} + for i in keyboard_cartesian.keys(): + nearest_to_i[i] = [] + for j in keyboard_cartesian.keys(): + if self._euclidean_distance(i, j) < mistake_distance: #was > 1.2 + nearest_to_i[i].append(j) + return nearest_to_i + + def _shuffle_word(self, word, cutoff=0.7): + """ Rearange the given characters in a word simulating typos given a probability. + + @param: word A single word coming from a sentence + @param: cutoff The cutoff probability to make a change (default 0.9) + + return The word rearranged + """ + word = list(word.lower()) + if random.uniform(0, 1.0) > cutoff: + loc = np.random.randint(0, len(word)) + if word[loc] in self.keyboard_cartesian: + word[loc] = random.choice(self.nearest_to_i[word[loc]]) + return ''.join(word) + + def _euclidean_distance(self, a, b): + """ Calculates the euclidean between 2 points in the keyboard + @param: a Point one + @param: b Point two + + return The euclidean distance between the two points""" + X = (self.keyboard_cartesian[a]['x'] - self.keyboard_cartesian[b]['x']) ** 2 + Y = (self.keyboard_cartesian[a]['y'] - self.keyboard_cartesian[b]['y']) ** 2 + return math.sqrt(X + Y) + + def _get_augment_sentence(self, sentence): + return ' '.join([self._shuffle_word(item) for item in sentence.split(' ')]) + + RESULT_FILE = "result5.csv" + METADATA_FILE = "metadata5.csv" + NUMBER_OF_RUNS_PER_SETTING = 10 + + def _augment_sentence(self, sentence, num_samples): + """ Augment the dataset of file with a sentence shuffled + @param: sentence The sentence from the set + @param: num_samples The number of sentences to genererate + + return A set of augmented sentences""" + sentences = [] + for _ in range(num_samples): + sentences.append(self._get_augment_sentence(sentence)) + sentences = list(set(sentences)) + # print("sentences", sentences) + return sentences + [sentence] + + nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')} + verbs = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('v')} + + def get_synonyms(word, number= 3): + synonyms = [] + for syn in wordnet.synsets(word): + for l in syn.lemmas(): + synonyms.append(l.name().lower().replace("_", " ")) + synonyms = list(OrderedDict.fromkeys(synonyms)) + return synonyms[:number] + #return [token.text for token in most_similar(nlp.vocab[word])] + print(get_synonyms("search",-1)) + + def _augment_split(self, X_train, y_train, num_samples=100): + """ Split the augmented train dataset + @param: X_train The full array of sentences + @param: y_train The train labels in the train dataset + @param: num_samples the number of new sentences to create (default 1000) + + return Augmented training dataset""" + Xs, ys = [], [] + for X, y in zip(X_train, y_train): + tmp_x = self._augment_sentence(X, num_samples) + sample = [[Xs.append(item), ys.append(y)] for item in tmp_x] + # print(X, y) + # print(self.augmentedFile+str(self.nSamples)+".csv") + + + with open("./datasets/KL/Chatbot/train_augmented.csv", 'w', encoding='utf8') as csvFile: + fileWriter = csv.writer(csvFile, delimiter='\t') + for i in range(0, len(Xs)-1): + fileWriter.writerow([Xs[i] + '\t' + ys[i]]) + # print(Xs[i], "\t", ys[i]) + # print(Xs[i]) + # fileWriter.writerows(Xs + ['\t'] + ys) + return Xs, ys + + # Randomly replaces the nouns and verbs by synonyms + def _synonym_word(self, word, cutoff=0.5): + if random.uniform(0, 1.0) > cutoff and len(get_synonyms(word)) > 0 and word in nouns and word in verbs: + return random.choice(get_synonyms(word)) + return word + + # Randomly replace words (nouns and verbs) in sentence by synonyms + def _get_synonym_sentence(self, sentence, cutoff = 0.5): + return ' '.join([self._synonym_word(item, cutoff) for item in sentence.split(' ')]) + + # For all classes except the largest ones; add duplicate (possibly augmented) samples until all classes have the same size + def _oversample_split(self, X_train, y_train, synonym_extra_samples = False, augment_extra_samples = False): + """ Split the oversampled train dataset + @param: X_train The full array of sentences + @param: y_train The train labels in the train dataset + + return Oversampled training dataset""" + + classes = {} + for X, y in zip(X_train, y_train): + if y not in classes: + classes[y] = [] + classes[y].append(X) + + max_class_size = max([len(entries) for entries in classes.values()]) + + Xs, ys = [],[] + for y in classes.keys(): + for i in range(max_class_size): + sentence = classes[y][i % len(classes[y])] + if i >= len(classes[y]): + if synonym_extra_samples: + sentence = self._get_synonym_sentence(sentence) + if augment_extra_samples: + sentence = self._get_augment_sentence(sentence) + Xs.append(sentence) + ys.append(y) + + #with open(filename_train+"augment", 'w', encoding='utf8') as csvFile: + # fileWriter = csv.writer(csvFile, delimiter='\t') + # for i in range(0, len(Xs)-1): + # fileWriter.writerow([Xs[i] + '\t' + ys[i]]) + + return Xs, ys + + def _synonym_split(self, X_train, y_train, num_samples=100): + """ Split the augmented train dataset + @param: X_train The full array of sentences + @param: y_train The train labels in the train dataset + @param: num_samples the number of new sentences to create (default 1000) + + return Augmented training dataset""" + Xs, ys = [], [] + for X, y in zip(X_train, y_train): + sample = [[Xs.append(self._get_synonym_sentence(X)), ys.append(y)] for item in range(additional_synonyms)] + # print(X, y) + + #with open(filename_train+"augment", 'w', encoding='utf8') as csvFile: + # fileWriter = csv.writer(csvFile, delimiter='\t') + # for i in range(0, len(Xs)-1): + # fileWriter.writerow([Xs[i] + '\t' + ys[i]]) + return Xs, ys + + def load(self): + """ Load the file for now only the test.csv, train.csv files hardcoded + + return The vector separated in test, train and the labels for each one""" + with open(self.dataset_path) as csvfile: + readCSV = csv.reader(csvfile, delimiter=' ') + all_rows = list(readCSV) + # for i in all_rows: + # if i == 28823: + # print(all_rows[i]) + X_test = [a[0] for a in all_rows] + y_test = [a[1] for a in all_rows] + + with open(self.dataset_path) as csvfile: + readCSV = csv.reader(csvfile, delimiter='\t') + all_rows = list(readCSV) + X_train = [a[0] for a in all_rows] + y_train = [a[1] for a in all_rows] + return X_test, y_test, X_train, y_train + + def process_sentence(self, x): + """ Clean the tokens from stop words in a sentence. + @param x Sentence to get rid of stop words. + + returns clean string sentence""" + clean_tokens = [] + doc = nlp.tokenizer(x) + for token in doc: + if not token.is_stop: + clean_tokens.append(token.lemma_) + return " ".join(clean_tokens) + + def process_batch(self, X): + """See the progress as is coming along. + + return list[] of clean sentences""" + return [self.process_sentence(a) for a in tqdm(X)] + + def stratified_split(self): + """ Split data whole into stratified test and training sets, then remove stop word from sentences + + return list of dictionaries with keys train,test and values the x and y for each one""" + self.X_train, self.X_test = ([preprocess(sentence) for sentence in self.X_train],[preprocess(sentence) for sentence in self.X_test]) + print(self.X_train) + if oversample: + self.X_train, self.y_train = self._oversample_split(self.X_train, self.y_train, synonym_extra_samples, augment_extra_samples) + if additional_synonyms > 0: + self.X_train, self.y_train = self._synonym_split(self.X_train, self.y_train, additional_synonyms) + if additional_augments > 0: + self.X_train, self.y_train = self._augment_split(self.X_train, self.y_train, additional_augments) + + splits = [{"train": {"X": self.X_train, "y": self.y_train}, + "test": {"X": self.X_test, "y": self.y_test}}] + return splits + + def get_splits(self): + """ Get the splitted sentences + + return splitted list of dictionaries""" + return self.splits + #**************************************************** + + + + + print("./datasets/KL/" + benchmark_dataset + "/train.csv") + t0 = time() + dataset = MeraDataset("./datasets/KL/" + benchmark_dataset + "/train.csv") - # - # - # + print("mera****************************") + splits = dataset.get_splits() + xS_train = [] + yS_train = [] + for elem in splits[0]["train"]["X"]: + xS_train.append(elem) + print(xS_train[:5]) - # # SemHash + for elem in splits[0]["train"]["y"]: + yS_train.append(intent_dict[elem]) + preprocess_time = time()-t0 + print(len(xS_train)) - def find_ngrams(input_list, n): - return zip(*[input_list[i:] for i in range(n)]) - def semhash_tokenizer(text): - tokens = text.split(" ") - final_tokens = [] - for unhashed_token in tokens: - hashed_token = "#{}#".format(unhashed_token) - final_tokens += [''.join(gram) - for gram in list(find_ngrams(list(hashed_token), 3))] - return final_tokens + X_train_raw, y_train_raw = read_CSV_datafile(filename = filename_train) + X_test_raw, y_test_raw = read_CSV_datafile(filename = filename_test) + print(y_train_raw[:5]) + print(X_test_raw[:5]) + print(y_test_raw[:5]) + X_train_raw = xS_train + y_train_raw = yS_train - def semhash_corpus(corpus): - new_corpus = [] - for sentence in corpus: - sentence = preprocess(sentence) - tokens = semhash_tokenizer(sentence) - new_corpus.append(" ".join(map(str,tokens))) - return new_corpus + print("Training data samples: \n",X_train_raw, "\n\n") - t0 = time() - X_train_raw = semhash_corpus(X_train_raw) - X_test_raw = semhash_corpus(X_test_raw) - semhash_time = time()-t0 + print("Class Labels: \n", y_train_raw, "\n\n") + print("Size of Training Data: {}".format(len(X_train_raw))) - print(X_train_raw[:5]) - print(y_train_raw[:5]) - print() - print(X_test_raw[:5]) - print(y_test_raw[:5]) + # + # + # + # # SemHash - def get_vectorizer(corpus, preprocessor=None, tokenizer=None): - if VECTORIZER == "count": - vectorizer = CountVectorizer(analyzer='word')#,ngram_range=(1,1)) - vectorizer.fit(corpus) - feature_names = vectorizer.get_feature_names() - elif VECTORIZER == "hash": - vectorizer = HashingVectorizer(analyzer='word', n_features=2**10, non_negative=True) - vectorizer.fit(corpus) - feature_names = None - elif VECTORIZER == "tfidf": - vectorizer = TfidfVectorizer(analyzer='word') - vectorizer.fit(corpus) - feature_names = vectorizer.get_feature_names() - else: - raise Exception("{} is not a recognized Vectorizer".format(VECTORIZER)) - return vectorizer, feature_names + def find_ngrams(input_list, n): + return zip(*[input_list[i:] for i in range(n)]) - def trim(s): - """Trim string to fit on terminal (assuming 80-column display)""" - return s if len(s) <= 80 else s[:77] + "..." + def semhash_tokenizer(text): + tokens = text.split(" ") + final_tokens = [] + for unhashed_token in tokens: + hashed_token = "#{}#".format(unhashed_token) + final_tokens += [''.join(gram) + for gram in list(find_ngrams(list(hashed_token), 3))] + return final_tokens + def semhash_corpus(corpus): + new_corpus = [] + for sentence in corpus: + sentence = preprocess(sentence) + tokens = semhash_tokenizer(sentence) + new_corpus.append(" ".join(map(str,tokens))) + return new_corpus - # ############################################################################# - # Benchmark classifiers - def benchmark(clf, X_train, y_train, X_test, y_test, target_names, - print_report=True, feature_names=None, print_top10=False, - print_cm=True): - print('_' * 80) - print("Training: ") - print(clf) t0 = time() - clf.fit(X_train, y_train) - train_time = time() - t0 - print("train time: %0.3fs" % train_time) + X_train_raw = semhash_corpus(X_train_raw) + X_test_raw = semhash_corpus(X_test_raw) + semhash_time = time()-t0 - t0 = time() - pred = clf.predict(X_test) - test_time = time() - t0 - print("test time: %0.3fs" % test_time) - score = metrics.accuracy_score(y_test, pred) - f1_score = metrics.f1_score(y_test, pred, average='weighted') + print(X_train_raw[:5]) + print(y_train_raw[:5]) + print() + print(X_test_raw[:5]) + print(y_test_raw[:5]) + + + + def get_vectorizer(corpus, preprocessor=None, tokenizer=None): + if VECTORIZER == "count": + vectorizer = CountVectorizer(analyzer='word')#,ngram_range=(1,1)) + vectorizer.fit(corpus) + feature_names = vectorizer.get_feature_names() + elif VECTORIZER == "hash": + vectorizer = HashingVectorizer(analyzer='word', n_features=2**10, non_negative=True) + vectorizer.fit(corpus) + feature_names = None + elif VECTORIZER == "tfidf": + vectorizer = TfidfVectorizer(analyzer='word') + vectorizer.fit(corpus) + feature_names = vectorizer.get_feature_names() + else: + raise Exception("{} is not a recognized Vectorizer".format(VECTORIZER)) + return vectorizer, feature_names + + + + def trim(s): + """Trim string to fit on terminal (assuming 80-column display)""" + return s if len(s) <= 80 else s[:77] + "..." + + + # ############################################################################# + # Benchmark classifiers + def benchmark(clf, X_train, y_train, X_test, y_test, target_names, + print_report=True, feature_names=None, print_top10=False, + print_cm=True): + print('_' * 80) + print("Training: ") + print(clf) + t0 = time() + clf.fit(X_train, y_train) + train_time = time() - t0 + print("train time: %0.3fs" % train_time) + + t0 = time() + pred = clf.predict(X_test) + test_time = time() - t0 + print("test time: %0.3fs" % test_time) + + score = metrics.accuracy_score(y_test, pred) + f1_score = metrics.f1_score(y_test, pred, average='weighted') + + #bad_pred = X_test[pred != y_test] + + print("accuracy: %0.3f" % score) + #print("Accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std() * 2)) + + if hasattr(clf, 'coef_'): + print("dimensionality: %d" % clf.coef_.shape[1]) + print("density: %f" % density(clf.coef_)) + + if print_top10 and feature_names is not None: + print("top 10 keywords per class:") + for i, label in enumerate(["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"]): + top10 = np.argsort(clf.coef_[i])[-10:] + print(trim("%s: %s" % (label, " ".join([feature_names[i] for i in top10])))) + print() + + if print_report: + print("classification report:") + print(metrics.classification_report(y_test, pred,labels = range(len(target_names)), + target_names=target_names)) + + if print_cm: + print("confusion matrix:") + print(metrics.confusion_matrix(y_test, pred)) + + with open("./"+RESULT_FILE, 'a', encoding='utf8') as csvFile: + fileWriter = csv.writer(csvFile, delimiter='\t') + fileWriter.writerow([benchmark_dataset,str(clf),str(oversample),str(synonym_extra_samples),str(augment_extra_samples), + str(additional_synonyms),str(additional_augments), str(mistake_distance), str(score), str(f1_score), str(train_time), str(test_time)]) - #bad_pred = X_test[pred != y_test] + print() + clf_descr = str(clf).split('(')[0] + return clf_descr, score, train_time, test_time, f1_score - print("accuracy: %0.3f" % score) - #print("Accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std() * 2)) - if hasattr(clf, 'coef_'): - print("dimensionality: %d" % clf.coef_.shape[1]) - print("density: %f" % density(clf.coef_)) - if print_top10 and feature_names is not None: - print("top 10 keywords per class:") - for i, label in enumerate(["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"]): - top10 = np.argsort(clf.coef_[i])[-10:] - print(trim("%s: %s" % (label, " ".join([feature_names[i] for i in top10])))) - print() - if print_report: - print("classification report:") - print(metrics.classification_report(y_test, pred,labels = range(len(target_names)), - target_names=target_names)) + def plot_results(results): + # make some plots + indices = np.arange(len(results)) - if print_cm: - print("confusion matrix:") - print(metrics.confusion_matrix(y_test, pred)) + results = [[x[i] for x in results] for i in range(4)] - with open("./"+RESULT_FILE, 'a', encoding='utf8') as csvFile: - fileWriter = csv.writer(csvFile, delimiter='\t') - fileWriter.writerow([benchmark_dataset,str(clf),str(oversample),str(synonym_extra_samples),str(augment_extra_samples), - str(additional_synonyms),str(additional_augments), str(mistake_distance), str(score), str(f1_score), str(train_time), str(test_time)]) + clf_names, score, training_time, test_time = results + training_time = np.array(training_time) / np.max(training_time) + test_time = np.array(test_time) / np.max(test_time) - print() - clf_descr = str(clf).split('(')[0] - return clf_descr, score, train_time, test_time, f1_score + plt.figure(figsize=(12, 8)) + plt.title("Score") + plt.barh(indices, score, .2, label="score", color='navy') + plt.barh(indices + .3, training_time, .2, label="training time", + color='c') + plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') + plt.yticks(()) + plt.legend(loc='best') + plt.subplots_adjust(left=.25) + plt.subplots_adjust(top=.95) + plt.subplots_adjust(bottom=.05) + for i, c in zip(indices, clf_names): + plt.text(-.3, i, c) + plt.show() - def plot_results(results): - # make some plots - indices = np.arange(len(results)) - results = [[x[i] for x in results] for i in range(4)] - clf_names, score, training_time, test_time = results - training_time = np.array(training_time) / np.max(training_time) - test_time = np.array(test_time) / np.max(test_time) + def data_for_training(): + vectorizer, feature_names = get_vectorizer(X_train_raw, preprocessor=preprocess, tokenizer=tokenize) - plt.figure(figsize=(12, 8)) - plt.title("Score") - plt.barh(indices, score, .2, label="score", color='navy') - plt.barh(indices + .3, training_time, .2, label="training time", - color='c') - plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange') - plt.yticks(()) - plt.legend(loc='best') - plt.subplots_adjust(left=.25) - plt.subplots_adjust(top=.95) - plt.subplots_adjust(bottom=.05) + X_train = vectorizer.transform(X_train_raw).toarray() + X_test = vectorizer.transform(X_test_raw).toarray() - for i, c in zip(indices, clf_names): - plt.text(-.3, i, c) + return X_train, y_train_raw, X_test, y_test_raw, feature_names - plt.show() + t0 = time() + X_train, y_train, X_test, y_test, feature_names = data_for_training() + vectorize_time = time()-t0 + with open("./"+METADATA_FILE, 'a', encoding='utf8') as csvFile: + fileWriter = csv.writer(csvFile, delimiter='\t') + fileWriter.writerow([benchmark_dataset,str(oversample),str(synonym_extra_samples),str(augment_extra_samples),str(additional_synonyms),str(additional_augments),str(mistake_distance),str(preprocess_time),str(semhash_time),str(vectorize_time)]) + + + print(X_train[0].tolist()) + print(y_train[0]) + print(feature_names) + + + for _ in enumerate(range(NUMBER_OF_RUNS_PER_SETTING)): + i_s = 0 + split = 0 + print("Evaluating Split {}".format(i_s)) + target_names = None + if benchmark_dataset == "Chatbot": + target_names = ["Departure Time", "Find Connection"] + elif benchmark_dataset == "AskUbuntu": + target_names = ["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"] + elif benchmark_dataset == "WebApplication": + target_names = ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts", + "Filter Spam", "Find Alternative", "Delete Account"] + print("Train Size: {}\nTest Size: {}".format(X_train.shape[0], X_test.shape[0])) + results = [] + #alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) + parameters_mlp={'hidden_layer_sizes':[(100,50), (300, 100),(300,200,100)]} + parameters_RF={ "n_estimators" : [50,60,70], + "min_samples_leaf" : [1, 11]} + k_range = list(range(3,7)) + parameters_knn = {'n_neighbors':k_range} + knn=KNeighborsClassifier(n_neighbors=5) + for clf, name in [ + (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), + (GridSearchCV(knn,parameters_knn, cv=5),"gridsearchknn"), + #(Perceptron(n_iter=50), "Perceptron"), + (GridSearchCV(MLPClassifier(activation='tanh'),parameters_mlp, cv=5),"gridsearchmlp"), + (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), + (GridSearchCV(RandomForestClassifier(n_estimators=10),parameters_RF, cv=5),"gridsearchRF") + ]: + + print('=' * 80) + print(name) + result = benchmark(clf, X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names) + results.append(result) + + # print('parameters') + # print(clf.grid_scores_[0]) + #print('CV Validation Score') + # print(clf.grid_scores_[0].cv_validation_scores) + # print('Mean Validation Score') + # print(clf.grid_scores_[0].mean_validation_score) + # grid_mean_scores = [result.mean_validation_score for result in clf.grid_scores_] + # print(grid_mean_scores) + # plt.plot(k_range, grid_mean_scores) + # plt.xlabel('Value of K for KNN') + # plt.ylabel('Cross-Validated Accuracy') + + parameters_Linearsvc = [{'C': [1, 10], 'gamma': [0.1,1.0]}] + for penalty in ["l2", "l1"]: + # print('=' * 80) + # print("%s penalty" % penalty.upper()) + # Train Liblinear model + grid=(GridSearchCV(LinearSVC,parameters_Linearsvc, cv=10),"gridsearchSVC") + #results.append(benchmark(LinearSVC(penalty=penalty), X_train, y_train, X_test, y_test, target_names, + # feature_names=feature_names)) + + result = benchmark(LinearSVC(penalty=penalty, dual=False,tol=1e-3), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names) + results.append(result) + + # Train SGD model + result = benchmark(SGDClassifier(alpha=.0001, n_iter=50, + penalty=penalty), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names) + results.append(result) + + # Train SGD with Elastic Net penalty + #print('=' * 80) + #print("Elastic-Net penalty") + results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50, + penalty="elasticnet"), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) + # Train NearestCentroid without threshold + #print('=' * 80) + #print("NearestCentroid (aka Rocchio classifier)") + results.append(benchmark(NearestCentroid(), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) + # Train sparse Naive Bayes classifiers + #print('=' * 80) + #print("Naive Bayes") + results.append(benchmark(MultinomialNB(alpha=.01), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) - def data_for_training(): - vectorizer, feature_names = get_vectorizer(X_train_raw, preprocessor=preprocess, tokenizer=tokenize) + result = benchmark(BernoulliNB(alpha=.01), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names) + results.append(result) - X_train = vectorizer.transform(X_train_raw).toarray() - X_test = vectorizer.transform(X_test_raw).toarray() + #print('=' * 80) + #print("LinearSVC with L1-based feature selection") + # The smaller C, the stronger the regularization. + # The more regularization, the more sparsity. + result = benchmark(Pipeline([ + ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, + tol=1e-3))), + ('classification', LinearSVC(penalty="l2"))]), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names) + results.append(result) + #print(grid.grid_scores_) + #KMeans clustering algorithm + #print('=' * 80) + #print("KMeans") + results.append(benchmark(KMeans(n_clusters=2, init='k-means++', max_iter=300, + verbose=0, random_state=0, tol=1e-4), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) - return X_train, y_train_raw, X_test, y_test_raw, feature_names - t0 = time() - X_train, y_train, X_test, y_test, feature_names = data_for_training() - vectorize_time = time()-t0 - with open("./"+METADATA_FILE, 'a', encoding='utf8') as csvFile: - fileWriter = csv.writer(csvFile, delimiter='\t') - fileWriter.writerow([benchmark_dataset,str(oversample),str(synonym_extra_samples),str(augment_extra_samples),str(additional_synonyms),str(additional_augments),str(mistake_distance),str(preprocess_time),str(semhash_time),str(vectorize_time)]) + #print('=' * 80) + #print("LogisticRegression") + kfold = model_selection.KFold(n_splits=2, random_state=0) + results.append(benchmark(LogisticRegression(C=1.0, class_weight=None, dual=False, + fit_intercept=True, intercept_scaling=1, max_iter=100, + multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, + solver='liblinear', tol=0.0001, verbose=0, warm_start=False), + X_train, y_train, X_test, y_test, target_names, + feature_names=feature_names)) + #plot_results(results) - print(X_train[0].tolist()) - print(y_train[0]) - print(feature_names) - for _ in enumerate(range(NUMBER_OF_RUNS_PER_SETTING)): - i_s = 0 - split = 0 - print("Evaluating Split {}".format(i_s)) - target_names = None - if benchmark_dataset == "Chatbot": - target_names = ["Departure Time", "Find Connection"] - elif benchmark_dataset == "AskUbuntu": - target_names = ["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"] - elif benchmark_dataset == "WebApplication": - target_names = ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts", - "Filter Spam", "Find Alternative", "Delete Account"] - print("Train Size: {}\nTest Size: {}".format(X_train.shape[0], X_test.shape[0])) - results = [] - #alphas = np.array([1,0.1,0.01,0.001,0.0001,0]) - parameters_mlp={'hidden_layer_sizes':[(100,50), (300, 100),(300,200,100)]} - parameters_RF={ "n_estimators" : [50,60,70], - "min_samples_leaf" : [1, 11]} - k_range = list(range(3,7)) - parameters_knn = {'n_neighbors':k_range} - knn=KNeighborsClassifier(n_neighbors=5) - for clf, name in [ - (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), - (GridSearchCV(knn,parameters_knn, cv=5),"gridsearchknn"), - #(Perceptron(n_iter=50), "Perceptron"), - (GridSearchCV(MLPClassifier(activation='tanh'),parameters_mlp, cv=5),"gridsearchmlp"), - (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), - (GridSearchCV(RandomForestClassifier(n_estimators=10),parameters_RF, cv=5),"gridsearchRF") - ]: - - print('=' * 80) - print(name) - result = benchmark(clf, X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names) - results.append(result) - # print('parameters') - # print(clf.grid_scores_[0]) - #print('CV Validation Score') - # print(clf.grid_scores_[0].cv_validation_scores) - # print('Mean Validation Score') - # print(clf.grid_scores_[0].mean_validation_score) - # grid_mean_scores = [result.mean_validation_score for result in clf.grid_scores_] - # print(grid_mean_scores) - # plt.plot(k_range, grid_mean_scores) - # plt.xlabel('Value of K for KNN') - # plt.ylabel('Cross-Validated Accuracy') - - parameters_Linearsvc = [{'C': [1, 10], 'gamma': [0.1,1.0]}] - for penalty in ["l2", "l1"]: - # print('=' * 80) - # print("%s penalty" % penalty.upper()) - # Train Liblinear model - grid=(GridSearchCV(LinearSVC,parameters_Linearsvc, cv=10),"gridsearchSVC") - #results.append(benchmark(LinearSVC(penalty=penalty), X_train, y_train, X_test, y_test, target_names, - # feature_names=feature_names)) - - result = benchmark(LinearSVC(penalty=penalty, dual=False,tol=1e-3), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names) - results.append(result) - # Train SGD model - result = benchmark(SGDClassifier(alpha=.0001, n_iter=50, - penalty=penalty), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names) - results.append(result) - # Train SGD with Elastic Net penalty - #print('=' * 80) - #print("Elastic-Net penalty") - results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50, - penalty="elasticnet"), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) - - # Train NearestCentroid without threshold - #print('=' * 80) - #print("NearestCentroid (aka Rocchio classifier)") - results.append(benchmark(NearestCentroid(), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) - - # Train sparse Naive Bayes classifiers - #print('=' * 80) - #print("Naive Bayes") - results.append(benchmark(MultinomialNB(alpha=.01), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) - - result = benchmark(BernoulliNB(alpha=.01), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names) - results.append(result) - - #print('=' * 80) - #print("LinearSVC with L1-based feature selection") - # The smaller C, the stronger the regularization. - # The more regularization, the more sparsity. - result = benchmark(Pipeline([ - ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, - tol=1e-3))), - ('classification', LinearSVC(penalty="l2"))]), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names) - results.append(result) - #print(grid.grid_scores_) - #KMeans clustering algorithm - #print('=' * 80) - #print("KMeans") - results.append(benchmark(KMeans(n_clusters=2, init='k-means++', max_iter=300, - verbose=0, random_state=0, tol=1e-4), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) - - - - #print('=' * 80) - #print("LogisticRegression") - kfold = model_selection.KFold(n_splits=2, random_state=0) - results.append(benchmark(LogisticRegression(C=1.0, class_weight=None, dual=False, - fit_intercept=True, intercept_scaling=1, max_iter=100, - multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, - solver='liblinear', tol=0.0001, verbose=0, warm_start=False), - X_train, y_train, X_test, y_test, target_names, - feature_names=feature_names)) - - #plot_results(results) - - - - - - - print(len(X_train)) + print(len(X_train))