From b5b64831ee8a846c7b6ff50b6d1bb75b661c66a3 Mon Sep 17 00:00:00 2001
From: tkhabia <tanmay.khabia@students.iiit.ac.in>
Date: Sat, 2 May 2020 17:59:17 +0530
Subject: [PATCH 1/4] Extract training-data loading into gen_raw_train_data()

---
 comprehensive_semhash_test.py | 39 ++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py
index a9b1556..1ef2fad 100644
--- a/comprehensive_semhash_test.py
+++ b/comprehensive_semhash_test.py
@@ -446,21 +446,24 @@ def get_splits(self):
 
 
     print("./datasets/KL/" + benchmark_dataset + "/train.csv")
-    t0 = time()
-    dataset = MeraDataset("./datasets/KL/" + benchmark_dataset + "/train.csv")
-    
-    print("mera****************************")
-    splits = dataset.get_splits()
-    xS_train = []
-    yS_train = []
-    for elem in splits[0]["train"]["X"]:
-        xS_train.append(elem)
-    print(xS_train[:5])
-
-    for elem in splits[0]["train"]["y"]:
-        yS_train.append(intent_dict[elem])
-    preprocess_time = time()-t0
-    print(len(xS_train))
+    def gen_raw_train_data(benchmark_dataset):
+        
+        dataset = MeraDataset(os.path.join("./datasets/KL/" , benchmark_dataset , "/train.csv"))
+        print("mera****************************")
+        splits = dataset.get_splits()
+        xS_train = []
+        yS_train = []
+        
+        for elem in splits[0]["train"]["X"]:
+            xS_train.append(elem)
+        
+        print(xS_train[:5])
+        print(len(xS_train))
+
+        for elem in splits[0]["train"]["y"]:
+            yS_train.append(intent_dict[elem])
+        
+        return xS_train , yS_train , preprocess_time
 
 
 
@@ -470,8 +473,10 @@ def get_splits(self):
     print(y_train_raw[:5])
     print(X_test_raw[:5])
     print(y_test_raw[:5])
-    X_train_raw = xS_train
-    y_train_raw = yS_train
+    t0 = time()
+    X_train_raw ,y_train_raw = gen_raw_train_data(benchmark_dataset)
+    preprocess_time = time()-t0
+
 
     print("Training data samples: \n",X_train_raw, "\n\n")
 

From 07790c51cccd71b9a892d99a191ccf184fd03748 Mon Sep 17 00:00:00 2001
From: tkhabia <tanmay.khabia@students.iiit.ac.in>
Date: Sun, 3 May 2020 16:55:37 +0530
Subject: [PATCH 2/4] Fix num_samples docstrings and loop bound, CSV tab delimiter, and gen_raw_train_data return values

---
 comprehensive_semhash_test.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py
index 1ef2fad..cdad109 100644
--- a/comprehensive_semhash_test.py
+++ b/comprehensive_semhash_test.py
@@ -297,7 +297,7 @@ def _augment_split(self, X_train, y_train, num_samples=100):
             """ Split the augmented train dataset
                 @param: X_train The full array of sentences
                 @param: y_train The train labels in the train dataset
-                @param: num_samples the number of new sentences to create (default 1000)
+                @param: num_samples the number of new sentences to create (default 100)
 
                 return Augmented training dataset"""
             Xs, ys = [], []
@@ -366,12 +366,12 @@ def _synonym_split(self, X_train, y_train, num_samples=100):
             """ Split the augmented train dataset
                 @param: X_train The full array of sentences
                 @param: y_train The train labels in the train dataset
-                @param: num_samples the number of new sentences to create (default 1000)
+                @param: num_samples the number of new sentences to create (default 100)
 
                 return Augmented training dataset"""
             Xs, ys = [], []
             for X, y in zip(X_train, y_train):
-                sample = [[Xs.append(self._get_synonym_sentence(X)), ys.append(y)] for item in range(additional_synonyms)]
+                sample = [[Xs.append(self._get_synonym_sentence(X)), ys.append(y)] for item in range(num_samples)]
     #             print(X, y)
 
             #with open(filename_train+"augment", 'w', encoding='utf8') as csvFile:
@@ -385,7 +385,7 @@ def load(self):
 
                 return The vector separated in test, train and the labels for each one"""
             with open(self.dataset_path) as csvfile:
-                readCSV = csv.reader(csvfile, delimiter='	')
+                readCSV = csv.reader(csvfile, delimiter='\t')
                 all_rows = list(readCSV)
     #             for i in all_rows:
     #                 if i ==  28823:
@@ -415,8 +415,8 @@ def process_sentence(self, x):
         def process_batch(self, X):
             """See the progress as is coming along.
 
-                return list[] of clean sentences"""
-            return [self.process_sentence(a) for a in tqdm(X)]
+                return list[] of cleprocess_sentencean sentences"""
+            return [self.(a) for a in tqdm(X)]
 
         def stratified_split(self):
             """ Split data whole into stratified test and training sets, then remove stop word from sentences
@@ -463,11 +463,11 @@ def gen_raw_train_data(benchmark_dataset):
         for elem in splits[0]["train"]["y"]:
             yS_train.append(intent_dict[elem])
         
-        return xS_train , yS_train , preprocess_time
-
+        return xS_train , yS_train 
 
 
 
+#   preprocessing
     X_train_raw, y_train_raw = read_CSV_datafile(filename = filename_train)
     X_test_raw, y_test_raw = read_CSV_datafile(filename = filename_test)
     print(y_train_raw[:5])

From 28f84e776d474f2dd4d0945ff4939c044d4f0218 Mon Sep 17 00:00:00 2001
From: tkhabia <tanmay.khabia@students.iiit.ac.in>
Date: Sun, 3 May 2020 17:06:21 +0530
Subject: [PATCH 3/4] Restore process_batch docstring and body garbled by the previous commit

---
 comprehensive_semhash_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py
index cdad109..baf5d14 100644
--- a/comprehensive_semhash_test.py
+++ b/comprehensive_semhash_test.py
@@ -415,8 +415,8 @@ def process_sentence(self, x):
         def process_batch(self, X):
             """See the progress as is coming along.
 
-                return list[] of cleprocess_sentencean sentences"""
-            return [self.(a) for a in tqdm(X)]
+                return list[] of clean sentences"""
+            return [self.process_sentence(a) for a in tqdm(X)]
 
         def stratified_split(self):
             """ Split data whole into stratified test and training sets, then remove stop word from sentences

From a1376e34faf1648b7c153d9fc788a786e9828f83 Mon Sep 17 00:00:00 2001
From: tkhabia <tanmay.khabia@students.iiit.ac.in>
Date: Sun, 3 May 2020 17:35:19 +0530
Subject: [PATCH 4/4] Add gen_raw_train_data docstring and extract get_target_names()

---
 comprehensive_semhash_test.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/comprehensive_semhash_test.py b/comprehensive_semhash_test.py
index baf5d14..58f6943 100644
--- a/comprehensive_semhash_test.py
+++ b/comprehensive_semhash_test.py
@@ -447,7 +447,7 @@ def get_splits(self):
 
     print("./datasets/KL/" + benchmark_dataset + "/train.csv")
     def gen_raw_train_data(benchmark_dataset):
-        
-        dataset = MeraDataset(os.path.join("./datasets/KL/" , benchmark_dataset , "/train.csv"))
+        ''' load ./datasets/KL/<benchmark_dataset>/train.csv and return (sentences, labels mapped through intent_dict) of the first split '''
+        dataset = MeraDataset(os.path.join("./datasets/KL/" , benchmark_dataset , "train.csv"))
         print("mera****************************")
         splits = dataset.get_splits()
@@ -651,6 +651,17 @@ def data_for_training():
     X_train, y_train, X_test, y_test, feature_names = data_for_training()
     vectorize_time = time()-t0
 
+    def get_target_names (benchmark_dataset):
+        """ returns the list of target names for benchmark_dataset, or None when the dataset is not recognised """
+        names_by_dataset = {
+            "Chatbot": ["Departure Time", "Find Connection"],
+            "AskUbuntu": ["Make Update", "Setup Printer", "Shutdown Computer", "Software Recommendation", "None"],
+            "WebApplication": ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts",
+                               "Filter Spam", "Find Alternative", "Delete Account"],
+        }
+        # .get() keeps the pre-refactor None fallback; the if/elif version raised UnboundLocalError here
+        return names_by_dataset.get(benchmark_dataset)
+
     with open("./"+METADATA_FILE, 'a', encoding='utf8') as csvFile:
             fileWriter = csv.writer(csvFile, delimiter='\t')
             fileWriter.writerow([benchmark_dataset,str(oversample),str(synonym_extra_samples),str(augment_extra_samples),str(additional_synonyms),str(additional_augments),str(mistake_distance),str(preprocess_time),str(semhash_time),str(vectorize_time)])
@@ -660,19 +671,13 @@ def data_for_training():
     print(y_train[0])
     print(feature_names)
 
-
+    
     for _ in enumerate(range(NUMBER_OF_RUNS_PER_SETTING)):
         i_s = 0
         split = 0
         print("Evaluating Split {}".format(i_s))
-        target_names = None
-        if benchmark_dataset == "Chatbot":
-            target_names = ["Departure Time", "Find Connection"]
-        elif benchmark_dataset == "AskUbuntu":
-            target_names = ["Make Update", "Setup Printer", "Shutdown Computer","Software Recommendation", "None"]
-        elif benchmark_dataset == "WebApplication":
-            target_names = ["Download Video", "Change Password", "None", "Export Data", "Sync Accounts",
-                      "Filter Spam", "Find Alternative", "Delete Account"]
+        target_names = get_target_names(benchmark_dataset)
+
         print("Train Size: {}\nTest Size: {}".format(X_train.shape[0], X_test.shape[0]))
         results = []
         #alphas = np.array([1,0.1,0.01,0.001,0.0001,0])