-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmapper.py
More file actions
executable file
·170 lines (141 loc) · 7 KB
/
mapper.py
File metadata and controls
executable file
·170 lines (141 loc) · 7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import pandas as pd
from fuzzywuzzy import process, fuzz
from read_snomed import load_lexicon
from helper_tools import ngrams, sanatize, get_tuple, flatten, group
from collections import namedtuple
#---------------------------------------------------------------------------------------------
# GLOBAL PARAMETERS
SCORER_LIMIT = 87  # Minimum FuzzyWuzzy score accepted as a match (lower = more false positives)
MAX_NGRAM_SIZE = 6  # Largest n-gram size tested (higher = slower)
MAX_DISTANCE = 8  # Largest allowed difference in string length between a query and a candidate term (higher = slower)
USE_REMOVE = True  # Whether the fuzzy search removes n-grams from the text as it matches them
STANDARD_LEXICON = "recommended"  # Name of the lexicon subset loaded by default
#---------------------------------------------------------------------------------------------
class Mapper:
    """Match the n-grams of a text against a lexicon.

    The text is sanitized on construction and searched with exact, acronym
    and/or fuzzy string matching. Every matched lexicon entry is collected in
    ``matches``; ``matches_w_score`` keeps the same entries together with the
    n-gram that triggered the match, its score and the scorer that produced it.
    """

    # Record type stored in ``matches_w_score``. Built once here instead of
    # being re-created on every iteration of the search loops.
    _ScoredMatch = namedtuple('matches_w_score', ['ngram', 'tup', 'score', 'scorer_name'])

    def __init__(self, text, lexicon=STANDARD_LEXICON):
        """Sanitize ``text`` and load the lexicon to match against.

        Args:
            text: Raw text to search.
            lexicon: Lexicon name handed to ``load_lexicon``.
        """
        self.text = sanatize(text)
        self.original_text = text  # Never modified by the search functions
        self.lexicon = load_lexicon(lexicon)
        self.matches = set()  # Matched lexicon entries are added here
        self.matches_w_score = set()  # For debugging and sorting

    @property
    def codes(self):
        """Return a generator over the ``code`` attribute of every match."""
        return (x.code for x in self.matches)

    def __str__(self):
        """Show the original text next to the current working text."""
        return f"Original text is: {self.original_text}\nCurrent text is: {self.text}"

    def __repr__(self):
        """Same rendering as ``__str__``."""
        return f"Original text is: {self.original_text}\nCurrent text is: {self.text}"

    def remove(self, string):
        """Replace every occurrence of ``string`` in the working text with a space."""
        self.text = self.text.replace(string, ' ')

    def get_num_words(self):
        """Return the number of space-separated tokens in the working text."""
        return len(self.text.split(' '))

    def get_max_ngram(self):
        """Return the largest n-gram size worth testing, capped at ``MAX_NGRAM_SIZE``."""
        return min(self.get_num_words(), MAX_NGRAM_SIZE)

    def cut_lexicon(self, query):
        """Return the lexicon entries whose term length is close to ``len(query)``.

        Only entries whose term length lies within ``MAX_DISTANCE`` characters
        of the query length are kept, so the fuzzy matcher has fewer
        candidates to score.
        """
        length = len(query)
        lower = length - MAX_DISTANCE
        upper = length + MAX_DISTANCE
        return [syn for syn in self.lexicon if lower <= len(syn.term) <= upper]

    def _record_exact_matches(self, lexicon, max_size, scorer_name):
        """Record every entry of ``lexicon`` whose term equals an n-gram of the text.

        N-gram sizes from ``max_size`` down to 1 are tested. The lexicon is
        indexed by term once, so each n-gram is a dictionary lookup instead
        of a scan over the whole lexicon.
        """
        by_term = {}
        for syn in lexicon:
            by_term.setdefault(syn.term, []).append(syn)
        for size in range(max_size, 0, -1):
            for ngram in ngrams(self.text, size):
                for syn in by_term.get(ngram, ()):
                    self.matches.add(syn)
                    self.matches_w_score.add(self._ScoredMatch(ngram, syn, 'NA', scorer_name))

    def exact_search(self):
        """Match n-grams of the text against the lexicon on exact string equality.

        Scored records use the same (ngram, tup, score, scorer_name) layout as
        the other searches so that ``to_df(show_scores=True)`` stays aligned.
        """
        self._record_exact_matches(self.lexicon, MAX_NGRAM_SIZE, 'exact')

    def acronym_search(self):
        """Match n-grams of the text against the ``my_acronyms`` lexicon."""
        acronyms = load_lexicon("my_acronyms")
        self._record_exact_matches(acronyms, self.get_num_words(), 'acronym')

    def fuzzy_match(self, query, reduced_lexicon, limit=SCORER_LIMIT):
        """Fuzzy-match ``query`` against the terms of ``reduced_lexicon``.

        Two scorers are tried: ``fuzz.token_sort_ratio`` (penalised by 10
        points so that it ranks a bit lower) and ``fuzz.ratio``. The better
        of the two is kept.

        Returns:
            ``(entries, score, scorer_name)`` where ``entries`` is whatever
            ``get_tuple`` returns for the winning term, or ``False`` when
            there are no candidates or the best score is below ``limit``.
        """
        terms = [syn.term for syn in reduced_lexicon]
        if not terms:
            # extractOne has nothing to choose from and would return None.
            return False
        result_sort = process.extractOne(query, terms, scorer=fuzz.token_sort_ratio)
        result_ratio = process.extractOne(query, terms, scorer=fuzz.ratio)
        if result_sort is None or result_ratio is None:
            return False
        sort_score = result_sort[1] - 10  # Rank result sort a bit lower than ratio
        if sort_score >= result_ratio[1]:
            best_term, best_score, scorer_name = result_sort[0], sort_score, "sort"
        else:
            best_term, best_score, scorer_name = result_ratio[0], result_ratio[1], "ratio"
        if best_score < limit:
            return False
        return (get_tuple(best_term, reduced_lexicon), best_score, scorer_name)

    def fuzzy_search(self):
        """Fuzzy-match every n-gram of the text, from the largest size down to 1.

        When ``USE_REMOVE`` is set, a matched n-gram is removed from the
        working text and the text is sanitized again.
        """
        for size in range(self.get_max_ngram(), 0, -1):
            # NOTE(review): the n-gram list is computed before any removal at
            # this size, so later n-grams may overlap text already removed.
            for ngram in ngrams(self.text, size):
                result = self.fuzzy_match(ngram, self.cut_lexicon(ngram))
                if not result:
                    continue
                # Unpack the single result rather than re-running the matcher.
                tups, score, scorer_name = result
                for tup in tups:
                    self.matches.add(tup)
                    self.matches_w_score.add(self._ScoredMatch(ngram, tup, score, scorer_name))
                if USE_REMOVE:
                    self.remove(ngram)
                    self.text = sanatize(self.text)

    def to_df(self, show_scores=False):
        """Return the matches as a pandas DataFrame, longest term first.

        Args:
            show_scores: When true, build the frame from ``matches_w_score``
                with columns ngram, Term, ID, Score, Name. Otherwise build it
                from ``matches`` with columns Term, ID and keep only the
                first row for each ID.

        Returns:
            The DataFrame, or an empty DataFrame when there are no matches.
        """
        if show_scores:
            rows = list(group(list(flatten(list(self.matches_w_score))), 5))
            columns = ['ngram', 'Term', 'ID', 'Score', 'Name']
        else:
            rows = list(self.matches)
            columns = ['Term', 'ID']
        df = pd.DataFrame(rows)
        if df.empty:
            return pd.DataFrame([])  # Return an empty DataFrame
        df.columns = columns
        order = df.Term.str.len().sort_values(ascending=False).index
        df = df.reindex(order)
        df.reset_index(inplace=True, drop=True)
        if not show_scores:
            df = df.drop_duplicates(['ID'], keep='first')
        return df

    def standard_search(self, show_scores=False):
        """Run the standard sequence (acronym search, then fuzzy search).

        Returns the matches as a pandas DataFrame.
        """
        self.acronym_search()
        self.fuzzy_search()
        return self.to_df(show_scores)

    def rapid_search(self):
        """Run acronym search plus exact search and return the matches as a DataFrame."""
        self.acronym_search()
        self.exact_search()
        return self.to_df()