-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_doc2vec_model.py
More file actions
executable file
·38 lines (31 loc) · 1.05 KB
/
train_doc2vec_model.py
File metadata and controls
executable file
·38 lines (31 loc) · 1.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
import re
# Load only the 'title' column of the CSV; dtype='unicode' asks pandas to read
# every value as text.
df = pd.read_csv('./tops.csv', dtype='unicode', usecols=['title'])

# Blank out missing titles so they do not get stringified to the literal
# token 'nan'. Rows are kept (not dropped) so each tag below still equals the
# row's position in the CSV.
data = df['title'].fillna('').tolist()

# Runs of non-letter characters are replaced by a single space (not removed),
# so the words of a title stay separated and can be tokenized individually.
_NON_LETTERS = re.compile(r'[^A-Za-z]+')

# Tokenize each title into lowercase words to run through the model; the
# document tag is the row index as a string.
tagged_data = [
    TaggedDocument(
        words=word_tokenize(_NON_LETTERS.sub(' ', str(title)).lower()),
        tags=[str(i)],
    )
    for i, title in enumerate(data)
]
# Training hyperparameters.
alpha = 0.025       # initial learning rate handed to Doc2Vec
vec_size = 500      # dimensionality of the learned document vectors
max_epochs = 1000   # number of outer train() rounds run by the script

# Constructor options gathered in one place; see the gensim Doc2Vec
# documentation for the exact meaning of each flag.
doc2vec_options = dict(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
    workers=4,  # multithreaded
    dm=1,
    dbow_words=0,
)

model = Doc2Vec(**doc2vec_options)

# Build the vocabulary from the tagged documents before any training.
model.build_vocab(tagged_data)
# Divisor for the learning-rate decay below; grows by one per decay step, so
# successive decrements are 0.004, 0.002, 0.00133, ...
i = 1

# NOTE(review): gensim's documentation advises a single train() call with an
# explicit `epochs` value rather than an outer loop that adjusts alpha by
# hand. The loop is kept to preserve the existing training schedule --
# confirm whether it is still wanted.
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    # `model.epochs` replaces the deprecated `model.iter` alias, which was
    # removed in gensim 4.
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate until it falls below the floor
    if model.alpha >= 0.00015:
        model.alpha -= 0.004 / i
        i += 1

# Persist the trained model to the working directory.
model.save("final.model")
print("Model Saved")