From b5b64831ee8a846c7b6ff50b6d1bb75b661c66a3 Mon Sep 17 00:00:00 2001 From: tkhabia Date: Sat, 2 May 2020 17:59:17 +0530 Subject: [PATCH 1/4] added function --- comprehensive_semhash_test.py | 39 ++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py index a9b1556..1ef2fad 100644 --- a/comprehensive_semhash_test.py +++ b/comprehensive_semhash_test.py @@ -446,21 +446,24 @@ def get_splits(self): print("./datasets/KL/" + benchmark_dataset + "/train.csv") - t0 = time() - dataset = MeraDataset("./datasets/KL/" + benchmark_dataset + "/train.csv") - - print("mera****************************") - splits = dataset.get_splits() - xS_train = [] - yS_train = [] - for elem in splits[0]["train"]["X"]: - xS_train.append(elem) - print(xS_train[:5]) - - for elem in splits[0]["train"]["y"]: - yS_train.append(intent_dict[elem]) - preprocess_time = time()-t0 - print(len(xS_train)) + def gen_raw_train_data(benchmark_dataset): + + dataset = MeraDataset(os.path.join("./datasets/KL/" , benchmark_dataset , "/train.csv")) + print("mera****************************") + splits = dataset.get_splits() + xS_train = [] + yS_train = [] + + for elem in splits[0]["train"]["X"]: + xS_train.append(elem) + + print(xS_train[:5]) + print(len(xS_train)) + + for elem in splits[0]["train"]["y"]: + yS_train.append(intent_dict[elem]) + + return xS_train , yS_train , preprocess_time @@ -470,8 +473,10 @@ def get_splits(self): print(y_train_raw[:5]) print(X_test_raw[:5]) print(y_test_raw[:5]) - X_train_raw = xS_train - y_train_raw = yS_train + t0 = time() + X_train_raw ,y_train_raw = gen_raw_train_data(benchmark_dataset) + preprocess_time = time()-t0 + print("Training data samples: \n",X_train_raw, "\n\n") From 07790c51cccd71b9a892d99a191ccf184fd03748 Mon Sep 17 00:00:00 2001 From: tkhabia Date: Sun, 3 May 2020 16:55:37 +0530 Subject: [PATCH 2/4] corrected mistakes --- comprehensive_semhash_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py index 1ef2fad..cdad109 100644 --- a/comprehensive_semhash_test.py +++ b/comprehensive_semhash_test.py @@ -297,7 +297,7 @@ def _augment_split(self, X_train, y_train, num_samples=100): """ Split the augmented train dataset @param: X_train The full array of sentences @param: y_train The train labels in the train dataset - @param: num_samples the number of new sentences to create (default 1000) + @param: num_samples the number of new sentences to create (default 100) return Augmented training dataset""" Xs, ys = [], [] @@ -366,12 +366,12 @@ def _synonym_split(self, X_train, y_train, num_samples=100): """ Split the augmented train dataset @param: X_train The full array of sentences @param: y_train The train labels in the train dataset - @param: num_samples the number of new sentences to create (default 1000) + @param: num_samples the number of new sentences to create (default 100) return Augmented training dataset""" Xs, ys = [], [] for X, y in zip(X_train, y_train): - sample = [[Xs.append(self._get_synonym_sentence(X)), ys.append(y)] for item in range(additional_synonyms)] + sample = [[Xs.append(self._get_synonym_sentence(X)), ys.append(y)] for item in range(num_samples)] # print(X, y) #with open(filename_train+"augment", 'w', encoding='utf8') as csvFile: @@ -385,7 +385,7 @@ def load(self): return The vector separated in test, train and the labels for each one""" with open(self.dataset_path) as csvfile: - readCSV = csv.reader(csvfile, delimiter=' ') + readCSV = csv.reader(csvfile, delimiter='\t') all_rows = list(readCSV) # for i in all_rows: # if i == 28823: @@ -415,8 +415,8 @@ def process_sentence(self, x): def process_batch(self, X): """See the progress as is coming along. - return list[] of clean sentences""" - return [self.process_sentence(a) for a in tqdm(X)] + return list[] of cleprocess_sentencean sentences""" + return [self.(a) for a in tqdm(X)] def stratified_split(self): """ Split data whole into stratified test and training sets, then remove stop word from sentences @@ -463,11 +463,11 @@ def gen_raw_train_data(benchmark_dataset): for elem in splits[0]["train"]["y"]: yS_train.append(intent_dict[elem]) - return xS_train , yS_train , preprocess_time - + return xS_train , yS_train +# preprocessing X_train_raw, y_train_raw = read_CSV_datafile(filename = filename_train) X_test_raw, y_test_raw = read_CSV_datafile(filename = filename_test) print(y_train_raw[:5]) From 28f84e776d474f2dd4d0945ff4939c044d4f0218 Mon Sep 17 00:00:00 2001 From: tkhabia Date: Sun, 3 May 2020 17:06:21 +0530 Subject: [PATCH 3/4] corrected mistake --- comprehensive_semhash_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py index cdad109..baf5d14 100644 --- a/comprehensive_semhash_test.py +++ b/comprehensive_semhash_test.py @@ -415,8 +415,8 @@ def process_sentence(self, x): def process_batch(self, X): """See the progress as is coming along. - return list[] of cleprocess_sentencean sentences""" - return [self.(a) for a in tqdm(X)] + return list[] of clean sentences""" + return [self.process_sentence(a) for a in tqdm(X)] def stratified_split(self): """ Split data whole into stratified test and training sets, then remove stop word from sentences From a1376e34faf1648b7c153d9fc788a786e9828f83 Mon Sep 17 00:00:00 2001 From: tkhabia Date: Sun, 3 May 2020 17:35:19 +0530 Subject: [PATCH 4/4] added more functions --- comprehensive_semhash_test.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py index baf5d14..58f6943 100644 --- a/comprehensive_semhash_test.py +++ b/comprehensive_semhash_test.py @@ -447,7 +447,7 @@ def get_splits(self): print("./datasets/KL/" + benchmark_dataset + "/train.csv") def gen_raw_train_data(benchmark_dataset): - + ''' generate raw training data from benchmark dataset ''' dataset = MeraDataset(os.path.join("./datasets/KL/" , benchmark_dataset , "/train.csv")) print("mera****************************") splits = dataset.get_splits() @@ -651,6 +651,17 @@ def data_for_training(): X_train, y_train, X_test, y_test, feature_names = data_for_training() vectorize_time = time()-t0 + def get_target_names (benchmark_dataset): + """ returns a list of target names for corresponding benchmark_dataset """ + if benchmark_dataset == "Chatbot": + target_names = ["Departure Time", "Find Connection"] + elif benchmark_dataset == "AskUbuntu": + target_names = ["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"] + elif benchmark_dataset == "WebApplication": + target_names = ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts", + "Filter Spam", "Find Alternative", "Delete Account"] + return target_names + with open("./"+METADATA_FILE, 'a', encoding='utf8') as csvFile: fileWriter = csv.writer(csvFile, delimiter='\t') fileWriter.writerow([benchmark_dataset,str(oversample),str(synonym_extra_samples),str(augment_extra_samples),str(additional_synonyms),str(additional_augments),str(mistake_distance),str(preprocess_time),str(semhash_time),str(vectorize_time)]) @@ -660,19 +671,13 @@ def data_for_training(): print(y_train[0]) print(feature_names) - + for _ in enumerate(range(NUMBER_OF_RUNS_PER_SETTING)): i_s = 0 split = 0 print("Evaluating Split {}".format(i_s)) - target_names = None - if benchmark_dataset == "Chatbot": - target_names = ["Departure Time", "Find Connection"] - elif benchmark_dataset == "AskUbuntu": - target_names = ["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"] - elif benchmark_dataset == "WebApplication": - target_names = ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts", - "Filter Spam", "Find Alternative", "Delete Account"] + target_names = get_target_names(benchmark_dataset) + print("Train Size: {}\nTest Size: {}".format(X_train.shape[0], X_test.shape[0])) results = [] #alphas = np.array([1,0.1,0.01,0.001,0.0001,0])