-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtask3.py
More file actions
139 lines (111 loc) · 4.51 KB
/
Copy pathtask3.py
File metadata and controls
139 lines (111 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# import task2
import nltk
import spacy
import neuralcoref
import pandas as pd
import json
from spacy_lookup import Entity
from nltk.tokenize import word_tokenize, sent_tokenize
from task2part import parse_part_template
from task2buy import parse_buy_template
from task2work import parse_work_template
import sys
# Silence every warning category for this script, but only when the user did
# not pass explicit -W options on the command line (sys.warnoptions is empty).
if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")
# Lower-cased single-token verb forms that flag a sentence as a BUY candidate.
# NOTE: the original list had a missing comma that fused 'invest in' and
# 'invested' into one unmatchable string; both forms are listed separately here.
_BUY_VERBS = frozenset({
    "buy", "bought", "shop", "acquire", "acquired", "purchase",
    "invest", "invested", "get", "obtain", "obtained", "secure",
    "redeem", "land", "spent",
})

# Entity labels counted as locations when looking for the PART template.
_LOCATION_LABELS = ("GPE", "LOC")


def _read_text(path):
    """Return the text of *path*, retrying as UTF-8 if the default codec fails."""
    try:
        with open(path, "r") as handle:
            return handle.read()
    except UnicodeDecodeError:
        with open(path, "r", encoding="utf8") as handle:
            return handle.read()


def _make_record(template, sentence, arguments):
    """Build one extraction record; arguments are keyed "1", "2", ... in order."""
    return {
        "template": template,
        "sentences": [sentence],
        "arguments": {
            str(position): value
            for position, value in enumerate(arguments, start=1)
        },
    }


def _part_records(sen, sentence):
    """Return PART records for a sentence that mentions at least two locations."""
    location_count = sum(1 for ent in sen.ents if ent.label_ in _LOCATION_LABELS)
    if location_count < 2:
        return []
    # presumably parse_part_template yields (part, whole)-like pairs -- verify
    # against task2part; only the first two items of each result are used.
    return [
        _make_record("PART", sentence, (pair[0], pair[1]))
        for pair in parse_part_template(sen)
        if pair[0] != '' and pair[1] != ''
    ]


def _work_records(sen, sentence):
    """Return WORK records for a sentence naming both a job title and an ORG."""
    labels = {ent.label_ for ent in sen.ents}
    if 'Job-Title' not in labels or 'ORG' not in labels:
        return []
    # Arguments 2 and 3 are deliberately swapped relative to the parser's
    # result order (res[2] before res[1]), exactly as the original code did.
    return [
        _make_record("WORK", sentence, (res[0], res[2], res[1], res[3]))
        for res in parse_work_template(sen)
        if res[0] != '' and res[1] != '' and res[2] != ''
    ]


def _buy_records(sen, sentence):
    """Return at most one BUY record for a sentence containing a buy-like verb."""
    has_buy_verb = any(
        (tok.dep_ == 'ROOT' or tok.pos_ == 'VERB')
        and tok.text.lower() in _BUY_VERBS
        for tok in sen
    )
    if not has_buy_verb:
        return []
    # The parser receives the whole sentence, so it is called once per sentence
    # rather than once per matching token (which produced duplicate records).
    results = parse_buy_template(sen)
    if results[0] == '' or results[1] == '':
        return []
    return [
        _make_record(
            "BUY",
            sentence,
            (results[0], results[1], results[2], results[3], results[4]),
        )
    ]


def main(file="input.txt", titles_csv='./titles.csv'):
    """Extract PART, WORK and BUY templates from a text file into a JSON file.

    The document is coreference-resolved as a whole, split into sentences, and
    each sentence is checked against the three templates. The result is written
    next to the input, with the extension replaced by ``.json``.

    Args:
        file: Path of the text document to process.
        titles_csv: CSV file whose first column lists the job titles used to
            tag ``Job-Title`` entities.
    """
    import os  # local import: only needed to derive the output file name

    # The original sliced file[1:-4], which also dropped the first character
    # of the name ("input.txt" -> "nput.json").
    outputfile = os.path.splitext(file)[0] + ".json"
    content = _read_text(file)

    # The first column of every CSV row is taken as a job title.
    df = pd.read_csv(titles_csv)
    job_titles = [row[0] for row in df.values]

    # The job-title matcher runs before the stock NER component; neuralcoref
    # is appended to the same pipeline.
    entity = Entity(keywords_list=job_titles, label='Job-Title')
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(entity, before='ner')
    neuralcoref.add_to_pipe(nlp)

    # Coreference resolution is done on the whole document, before splitting.
    content = nlp(content)._.coref_resolved

    # Extract the sentences from the resolved text.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(content)

    output = {"document": file, "extraction": []}
    extraction = output["extraction"]
    for sentence in sentences:
        sen = nlp(sentence)
        extraction.extend(_part_records(sen, sentence))
        extraction.extend(_work_records(sen, sentence))
        extraction.extend(_buy_records(sen, sentence))

    # Serialize and write the result.
    json_object = json.dumps(output, indent=4)
    with open(outputfile, 'w') as f:
        f.write(json_object)
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()