This repository was archived by the owner on Apr 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_helpers.py
More file actions
58 lines (51 loc) · 1.75 KB
/
data_helpers.py
File metadata and controls
58 lines (51 loc) · 1.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
def tokenize_sentences(sentences):
    """Tokenize every summary with NLTK's ``word_tokenize``.

    Takes a list of summaries and returns a list of lists: one inner list
    per summary, holding that summary's tokens (each coerced to ``str``).
    """
    tokenized_sentences = []
    for sent in sentences:
        tokens = list(map(str, word_tokenize(sent)))
        tokenized_sentences.append(tokens)
    return tokenized_sentences
def remove_punctuation(summary_arr):
    """Strip every ``string.punctuation`` character from each text.

    Returns a new list with one cleaned string per entry of
    ``summary_arr``; the input is not modified.
    """
    punctuation = string.punctuation
    cleaned = []
    for text in summary_arr:
        kept = filter(lambda ch: ch not in punctuation, text)
        cleaned.append("".join(kept))
    return cleaned
def remove_multipunc(summary_arr):
    """Drop whitespace-separated words containing more than one punctuation mark.

    Each sentence is split on whitespace; a word is kept only when the total
    number of ``string.punctuation`` characters in it is at most one. The
    surviving words are re-joined with single spaces, so runs of whitespace
    in the input are collapsed.
    """
    cleaned = []
    for sentence in summary_arr:
        kept_words = [
            token
            for token in sentence.split()
            if sum(token.count(mark) for mark in string.punctuation) <= 1
        ]
        cleaned.append(" ".join(kept_words))
    return cleaned
def remove_stopwords(summary_arr, tokenized):
    """Filter English stopwords out of each tokenized summary.

    Args:
        summary_arr: the summaries; only its length is used, to decide how
            many entries of ``tokenized`` are processed.
        tokenized: per-summary token lists, indexed in parallel with
            ``summary_arr``.

    Returns:
        A list with one token list per summary, containing the tokens that
        are not in NLTK's English stopword list (order preserved).

    Raises:
        IndexError: if ``tokenized`` has fewer entries than ``summary_arr``.
    """
    n_docs = len(summary_arr)
    if n_docs == 0:
        # Nothing to filter, so do not touch the stopword corpus at all.
        return []

    # Build the lookup set once per call; rebuilding it for every token
    # re-reads the whole stopword list each time.
    english_stopwords = frozenset(stopwords.words("english"))

    documents = []
    # Indexing by position (rather than zip) keeps the IndexError when
    # ``tokenized`` is shorter than ``summary_arr``.
    for bug_id in range(n_docs):
        documents.append(
            [w for w in tokenized[bug_id] if w not in english_stopwords]
        )
    return documents
def stem_words(summary_arr, documents):
    """Porter-stem the tokens of each document, dropping digit-bearing tokens.

    ``summary_arr`` is used only for its length; ``documents`` is indexed in
    parallel with it. A token containing any digit is discarded unless it
    starts with ``"0x"``; every other token is replaced by its Porter stem.
    Returns one list of stems per summary.
    """
    stemmer = PorterStemmer()
    stemmed_words = []
    for idx in range(len(summary_arr)):
        stems = []
        for token in documents[idx]:
            has_digit = any(c.isdigit() for c in token)
            if has_digit and not token.startswith("0x"):
                # Digit-bearing token without the "0x" prefix: skip it.
                continue
            stems.append(stemmer.stem(token))
        stemmed_words.append(stems)
    return stemmed_words
def convert_tolower(summary_arr):
    """Return a new list with every text in ``summary_arr`` lowercased.

    Uses ``str.lower()`` on the whole string instead of lowercasing one
    character at a time, so context-sensitive mappings (for example the
    Greek final sigma) come out right. The input list is not modified.
    """
    return [text.lower() for text in summary_arr]