This repository was archived by the owner on Apr 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtrain_and_validate.py
More file actions
executable file
·118 lines (92 loc) · 3.88 KB
/
train_and_validate.py
File metadata and controls
executable file
·118 lines (92 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/anaconda3/envs/bugzilla-env/bin/python
# NOTE: change the interpreter path in the shebang line above to match your own Python environment.
import data_helpers
import gensim
import getopt
import numpy as np
import pandas as pd
import os
import pathlib
import pickle
import sklearn
import sys
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import *
from sklearn.metrics import *
class MeanEmbeddingVectorizer(object):
    """Turn tokenized sentences into fixed-size vectors from word embeddings.

    Each sentence vector is the sum of the embeddings of its in-vocabulary
    words divided by the total number of words in the sentence (words that
    are missing from the vocabulary still count towards the divisor).

    Args:
        word2vec: mapping-like object supporting ``word in word2vec`` and
            ``word2vec[word]`` and returning one vector per word.
        dim: dimensionality of the embedding vectors. When omitted it is
            inferred from ``word2vec`` and falls back to ``DEFAULT_DIM``.
    """

    # Fallback dimensionality when it cannot be inferred from the mapping.
    DEFAULT_DIM = 100

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        self.dim = self._infer_dim(word2vec) if dim is None else dim

    @classmethod
    def _infer_dim(cls, word2vec):
        """Best-effort lookup of the embedding dimensionality of *word2vec*."""
        vector_size = getattr(word2vec, "vector_size", None)
        if vector_size is not None:
            return int(vector_size)
        try:
            first_vector = next(iter(word2vec.values()))
        except (AttributeError, StopIteration, TypeError):
            # Not a dict-like object, or an empty vocabulary.
            return cls.DEFAULT_DIM
        return len(first_vector)

    def fit(self, X, y=None):
        """No-op kept for scikit-learn style pipelines; returns ``self``."""
        return self

    def transform(self, X):
        """Return one vector per sentence in *X*.

        Args:
            X: iterable of tokenized sentences (each a sequence of words).

        Returns:
            ``numpy.ndarray`` with one row per sentence. A sentence that is
            empty, or that contains no in-vocabulary word, maps to a zero
            vector with the same dimensionality as all the other vectors.
        """
        rows = []
        for words in X:
            vectors = [self.word2vec[w] for w in words if w in self.word2vec]
            if not vectors:
                # Covers the empty sentence too, which previously divided by
                # zero and produced NaNs instead of the intended zero vector.
                rows.append(np.zeros(self.dim))
            else:
                rows.append(np.sum(vectors, axis=0) / len(words))
        return np.array(rows)
def train_and_evaluate(inputfile, n_splits=5):
    """Train a severity classifier on bug-report summaries and cross-validate it.

    Reads a CSV whose first column is the summary text and whose second
    column is the severity label, trains Word2Vec embeddings on the
    summaries, averages them per sentence, cross-validates an MLP classifier
    and finally fits it on the full data set.

    Args:
        inputfile: path of the CSV file (single-quote quoted, UTF-8).
        n_splits: number of stratified cross-validation folds.

    Side effects:
        Writes the pickled classifier to ``nn_model.bin`` and the pickled
        Word2Vec model to ``w2v_model.bin`` in the working directory, and
        prints progress and the mean validation accuracy.
    """
    print('Importing bug reports...')
    df = pd.read_csv(inputfile, error_bad_lines=False, quotechar="'", encoding='utf-8')
    # len(df) is the number of rows; df.size would count every cell (rows * columns).
    print("Total number of sentences: ", len(df))  # number of verified summaries by developers

    print('Preprocessing summaries...(This may take several minutes, please wait!) ')
    # Materialize the underlying array once instead of once per row.
    rows = df.values
    summaries = rows[:, 0].tolist()
    severities = rows[:, 1].tolist()
    summary_arr = data_helpers.convert_tolower(summaries)
    summary_arr = data_helpers.remove_punctuation(summary_arr)
    sentences = data_helpers.tokenize_sentences(summary_arr)

    print('Calculating word vectors...')
    # workers must be positive: with workers=-1 gensim starts no worker
    # threads and the embeddings are never trained.
    # NOTE(review): size/iter/index2word/syn0 are the gensim 3.x names; they
    # need renaming if the environment is upgraded to gensim 4.
    model = gensim.models.Word2Vec(sentences, size=100, workers=os.cpu_count() or 1, iter=1000)
    w2v = dict(zip(model.wv.index2word, model.wv.syn0))
    mev = MeanEmbeddingVectorizer(w2v)
    M = mev.transform(sentences)
    y = np.array(severities)

    print('Training model... This process may take severak minutes...')
    nn = MLPClassifier(hidden_layer_sizes=(100,), activation='tanh', solver='lbfgs',
                       random_state=1,
                       verbose=True, early_stopping=False, max_iter=1000)

    # validation (comment out this block if you don't want to validate)
    print('Validating the neural network model...')
    skf = StratifiedKFold(n_splits=n_splits)
    fold_accuracies = []
    for train_index, test_index in skf.split(M, y):
        nn.fit(M[train_index], y[train_index])
        y_predict = nn.predict(M[test_index])
        fold_accuracies.append(accuracy_score(y[test_index], y_predict))

    # train on the full data set
    nn.fit(M, y)
    with open('nn_model.bin', 'wb') as file:
        pickle.dump(nn, file)
    # w2v_model.bin holds the pickled model only; the earlier
    # save_word2vec_format() to this same path was always overwritten here.
    with open('w2v_model.bin', 'wb') as file:
        pickle.dump(model, file)
    print('Models are written to disk...')

    # comment out this line if you comment out validation block
    print("Accuracy: %2.1f %%" % (float(np.mean(fold_accuracies)) * 100))
def main(argv):
    """Parse command-line options and run training/validation.

    Args:
        argv: command-line arguments without the program name.

    Options:
        -h, --help           print usage and exit successfully.
        -i, --ifile <file>   input CSV; defaults to ``summaryList.csv``.
        -o <file>            accepted for backward compatibility, ignored.

    Exits with status 2 when the options cannot be parsed.
    """
    usage = 'train_and_validate.py -i <inputfile>'
    inputfile = ''
    try:
        # "help" has to be declared here, otherwise --help is rejected as an
        # unknown option before the loop below can handle it.
        opts, _args = getopt.getopt(argv, "hi:o:", ["help", "ifile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
    # An absent or empty -i falls back to the default file name.
    train_and_evaluate(inputfile or 'summaryList.csv')
# Script entry point: forward the command-line arguments (minus the program name).
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)