NLP_InfoExtractApplicationProject/task3.py at master · ksk94966/NLP_InfoExtractApplicationProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# import task2
import nltk
import spacy
import neuralcoref
import pandas as pd
import json
from spacy_lookup import Entity
from nltk.tokenize import word_tokenize, sent_tokenize
from task2part import parse_part_template
from task2buy import parse_buy_template
from task2work import parse_work_template

import sys

# Install a blanket "ignore" warning filter, but only when sys.warnoptions is
# empty, so explicitly requested warning options are left in charge.
if not sys.warnoptions:
    import warnings

    warnings.simplefilter(action="ignore")


# Lower-cased token texts that trigger the BUY template.
# NOTE: the original list had a missing comma after 'invest in', which fused it
# with 'invested' into the never-matching string 'invest ininvested'. A single
# token cannot equal the two-word string 'invest in' either, so the list now
# carries 'invest' and 'invested' as separate entries.
_BUY_VERBS = frozenset({
    'buy', 'bought', 'shop', 'acquire', 'acquired', 'purchase', 'invest',
    'invested', 'get', 'obtain', 'obtained', 'secure', 'redeem', 'land',
    'spent',
})


def _read_text(path):
    """Return the text of *path*, retrying as UTF-8 if the default codec fails."""
    try:
        with open(path, "r") as handle:
            return handle.read()
    except UnicodeDecodeError:
        with open(path, "r", encoding='utf8') as handle:
            return handle.read()


def _json_path_for(path):
    """Return *path* with its extension replaced by ``.json``."""
    import os  # local import: keeps this block self-contained

    return os.path.splitext(path)[0] + ".json"


def _part_extractions(sen, sentence):
    """Return PART records for a sentence that has at least two GPE/LOC entities."""
    location_count = sum(1 for ent in sen.ents if ent.label_ in ('GPE', 'LOC'))
    if location_count < 2:
        return []
    return [
        {
            "template": "PART",
            "sentences": [sentence],
            "arguments": {"1": pair[0], "2": pair[1]},
        }
        for pair in parse_part_template(sen)
        if pair[0] != '' and pair[1] != ''
    ]


def _work_extractions(sen, sentence):
    """Return WORK records for a sentence with both a Job-Title and an ORG entity."""
    labels = {ent.label_ for ent in sen.ents}
    if 'Job-Title' not in labels or 'ORG' not in labels:
        return []
    return [
        {
            "template": "WORK",
            "sentences": [sentence],
            # Argument "2" takes res[2] and "3" takes res[1]; this ordering is
            # kept exactly as it was.
            # NOTE(review): the meaning of each slot is defined by
            # parse_work_template, which is not visible here - confirm.
            "arguments": {"1": res[0], "2": res[2], "3": res[1], "4": res[3]},
        }
        for res in parse_work_template(sen)
        if res[0] != '' and res[1] != '' and res[2] != ''
    ]


def _buy_extractions(sen, sentence):
    """Return at most one BUY record for a sentence containing a buy verb.

    The template parser works on the whole sentence, so it is run once per
    sentence rather than once per matching token; running it per token only
    appended the same record repeatedly.
    """
    has_buy_verb = any(
        (tok.dep_ == 'ROOT' or tok.pos_ == 'VERB')
        and tok.text.lower() in _BUY_VERBS
        for tok in sen
    )
    if not has_buy_verb:
        return []
    results = parse_buy_template(sen)
    if results[0] == '' or results[1] == '':
        return []
    return [{
        "template": "BUY",
        "sentences": [sentence],
        "arguments": {
            "1": results[0],
            "2": results[1],
            "3": results[2],
            "4": results[3],
            "5": results[4],
        },
    }]


def main():
    """Extract PART, WORK and BUY templates from a text file and write them as JSON.

    Reads ``input.txt`` and the job-title keywords in ``./titles.csv``, resolves
    coreferences over the whole document, splits the resolved text into
    sentences, runs the three template extractors on every sentence, and writes
    the collected records to a ``.json`` file named after the input file
    (``input.txt`` -> ``input.json``).
    """
    # change input file name here
    file = "input.txt"
    # Strip only the extension; slicing with [1:-4] also dropped the first
    # character of the name ("input.txt" -> "nput.json").
    outputfile = _json_path_for(file)

    content = _read_text(file)

    # The first column of every CSV row is used as a job-title keyword.
    df = pd.read_csv('./titles.csv')
    job_titles = [row[0] for row in df.values]

    # Keyword matcher that tags the listed titles with the 'Job-Title' label;
    # it is placed before the 'ner' component in the pipeline.
    entity = Entity(keywords_list=job_titles, label='Job-Title')

    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(entity, before='ner')
    neuralcoref.add_to_pipe(nlp)
    doc = nlp(content)

    # Coreference resolution is done over the whole document before splitting.
    content = doc._.coref_resolved

    # Extract the sentences from the resolved text.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(content)

    output = {"document": file, "extraction": []}
    extraction = output["extraction"]
    for sentence in sentences:
        sen = nlp(sentence)
        # Per-sentence order is preserved: PART, then WORK, then BUY.
        extraction.extend(_part_extractions(sen, sentence))
        extraction.extend(_work_extractions(sen, sentence))
        extraction.extend(_buy_extractions(sen, sentence))

    # Serialize and write the result.
    with open(outputfile, 'w') as f:
        f.write(json.dumps(output, indent=4))


# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()