From 23abe0d0dae24162f25b8abdeb6151771ad96dc3 Mon Sep 17 00:00:00 2001 From: user Date: Mon, 1 May 2023 17:36:19 -0400 Subject: [PATCH 1/7] Improve quote detection by finding start and end quotation marks --- src/textacy/constants.py | 26 ++++++++++++++++++++++++++ src/textacy/extract/triples.py | 22 ++++++++++++---------- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/textacy/constants.py b/src/textacy/constants.py index 416c1362..4200d011 100644 --- a/src/textacy/constants.py +++ b/src/textacy/constants.py @@ -21,6 +21,29 @@ OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"} AUX_DEPS: set[str] = {"aux", "auxpass", "neg"} +""" +Ordinal points of the token.is_quote characters, matched up by start and end. + +source: +switch = "\"\'" +start = "“‘```“‘«‹「『„‚" +end = "”’’’’”’»›」』”’" + +""" +QUOTATION_MARK_PAIRS = { + (34, 34), + (39, 39), + (96, 8217), + (171, 187), + (8216, 8217), + (8218, 8217), + (8220, 8221), + (8222, 8221), + (8249, 8250), + (12300, 12301), + (12302, 12303) + } + REPORTING_VERBS: dict[str, set[str]] = { "en": { "according", @@ -44,6 +67,7 @@ "conclude", "confirm", "contend", + "continue", "criticize", "declare", "decline", @@ -63,6 +87,7 @@ "order", "predict", "promise", + "read", "recall", "recommend", "reply", @@ -74,6 +99,7 @@ "tell", "testify", "think", + "tweet", "urge", "warn", "worry", diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index f004d31d..af118d2b 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -11,7 +11,6 @@ from operator import attrgetter from typing import Iterable, Mapping, Optional, Pattern -from cytoolz import itertoolz from spacy.symbols import ( AUX, VERB, @@ -225,14 +224,17 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: f"direct quotation extraction is not implemented for lang='{doc.lang_}', " f"only {sorted(constants.REPORTING_VERBS.keys())}" ) - qtok_idxs = [tok.i for tok in doc if tok.is_quote] - if len(qtok_idxs) % 2 != 0: - raise ValueError( - f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; " - "given the limitations of this method, it's safest to bail out " - "rather than guess which quotation is unclosed" - ) - qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs)) + + # pairs up quotation-like characters based on acceptable start/end combos + # see constants for more info + qtok = [tok for tok in doc if tok.is_quote] + qtok_pair_idxs = [] + for n, q in enumerate(qtok): + if q.i not in [q_[1] for q_ in qtok_pair_idxs]: + for q_ in qtok[n+1:]: + if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS: + qtok_pair_idxs.append((q.i, q_.i)) + break for qtok_start_idx, qtok_end_idx in qtok_pair_idxs: content = doc[qtok_start_idx : qtok_end_idx + 1] cue = None @@ -241,7 +243,7 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: if ( # quotations should have at least a couple tokens # excluding the first/last quotation mark tokens - len(content) < 4 + len(content) < 3 # filter out titles of books and such, if possible or all( tok.is_title From e8e897165b0958ffb5a3efc97e73bc09b2e494fa Mon Sep 17 00:00:00 2001 From: user Date: Mon, 1 May 2023 17:39:51 -0400 Subject: [PATCH 2/7] revert some unrelated changes --- src/textacy/constants.py | 3 --- src/textacy/extract/triples.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/textacy/constants.py b/src/textacy/constants.py index 4200d011..98cfdd7e 100644 --- a/src/textacy/constants.py +++ b/src/textacy/constants.py @@ -67,7 +67,6 @@ "conclude", "confirm", "contend", - "continue", "criticize", "declare", "decline", @@ -87,7 +86,6 @@ "order", "predict", "promise", - "read", "recall", "recommend", "reply", @@ -99,7 +97,6 @@ "tell", "testify", "think", - "tweet", "urge", "warn", "worry", diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index af118d2b..0be00e24 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -243,7 +243,7 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: if ( # quotations should have at least a couple tokens # excluding the first/last quotation mark tokens - len(content) < 3 + len(content) < 4 # filter out titles of books and such, if possible or all( tok.is_title From 5c67017648bb06ccea91cd5594b4305acd43d34d Mon Sep 17 00:00:00 2001 From: Andy Friedman Date: Mon, 15 May 2023 14:19:18 -0400 Subject: [PATCH 3/7] added improved direct_quotations function, helper functions, constants and tests --- src/textacy/constants.py | 13 ++- src/textacy/extract/triples.py | 148 +++++++++++++++++++++++---------- tests/extract/test_triples.py | 2 +- 3 files changed, 117 insertions(+), 46 deletions(-) diff --git a/src/textacy/constants.py b/src/textacy/constants.py index 98cfdd7e..f66694d0 100644 --- a/src/textacy/constants.py +++ b/src/textacy/constants.py @@ -41,7 +41,18 @@ (8222, 8221), (8249, 8250), (12300, 12301), - (12302, 12303) + (12302, 12303), + (8220, 34), + (8216, 34), + (96, 34), + (8216, 34), + (171, 34), + (8249, 34), + (12300, 34), + (12302, 34), + (8222, 34), + (8218, 34), + (34, 8221) } REPORTING_VERBS: dict[str, set[str]] = { diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index 0be00e24..f6f71d44 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -14,6 +14,7 @@ from spacy.symbols import ( AUX, VERB, + PUNCT, agent, attr, aux, @@ -216,7 +217,6 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic Tagging of Reported Speech in Newspaper Articles". """ - # TODO: train a model to do this instead, maybe similar to entity recognition try: _reporting_verbs = constants.REPORTING_VERBS[doc.lang_] except KeyError: @@ -224,7 +224,6 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: f"direct quotation extraction is not implemented for lang='{doc.lang_}', " f"only {sorted(constants.REPORTING_VERBS.keys())}" ) - # pairs up quotation-like characters based on acceptable start/end combos # see constants for more info qtok = [tok for tok in doc if tok.is_quote] @@ -235,6 +234,12 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS: qtok_pair_idxs.append((q.i, q_.i)) break + + def filter_quote_tokens(tok): + return any( + qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs + ) + for qtok_start_idx, qtok_end_idx in qtok_pair_idxs: content = doc[qtok_start_idx : qtok_end_idx + 1] cue = None @@ -251,53 +256,47 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: # if tok.pos in {NOUN, PROPN} if not (tok.is_punct or tok.is_stop) ) - # TODO: require closing punctuation before the quotation mark? - # content[-2].is_punct is False ): continue - # get window of adjacent/overlapping sentences - window_sents = ( - sent - for sent in doc.sents - # these boundary cases are a subtle bit of work... - if ( - (sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1) - or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx) - ) - ) + + triple = None + for n, window_sents in enumerate([ + windower(qtok_start_idx, qtok_end_idx, doc, True), + windower(qtok_start_idx, qtok_end_idx, doc) + ]): # get candidate cue verbs in window - cue_cands = [ - tok - for sent in window_sents - for tok in sent - if ( - tok.pos == VERB - and tok.lemma_ in _reporting_verbs - # cue verbs must occur *outside* any quotation content - and not any( - qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs - ) + cue_cands = [ + tok + for sent in window_sents + for tok in sent + if not filter_quote_tokens(tok) + and filter_cue_candidates(tok, _reporting_verbs) + ] + # sort candidates by proximity to quote content + cue_cands = sorted( + cue_cands, + key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)), ) - ] - # sort candidates by proximity to quote content - cue_cands = sorted( - cue_cands, - key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)), - ) - for cue_cand in cue_cands: - if cue is not None: - break - for speaker_cand in cue_cand.children: - if speaker_cand.dep in _ACTIVE_SUBJ_DEPS: - cue = expand_verb(cue_cand) - speaker = expand_noun(speaker_cand) + for cue_cand in cue_cands: + if cue is not None: break - if content and cue and speaker: - yield DQTriple( - speaker=sorted(speaker, key=attrgetter("i")), - cue=sorted(cue, key=attrgetter("i")), - content=content, - ) + speaker_cands = [ + speaker_cand for speaker_cand in cue_cand.children + if not filter_quote_tokens(speaker_cand) + and filter_speaker_candidates(speaker_cand, qtok_start_idx, qtok_end_idx) + ] + for speaker_cand in speaker_cands: + if speaker_cand.dep in _ACTIVE_SUBJ_DEPS: + cue = expand_verb(cue_cand) + speaker = expand_noun(speaker_cand) + break + if content and cue and speaker: + yield DQTriple( + speaker=sorted(speaker, key=attrgetter("i")), + cue=sorted(cue, key=attrgetter("i")), + content=content, + ) + break def expand_noun(tok: Token) -> list[Token]: @@ -319,3 +318,64 @@ def expand_verb(tok: Token) -> list[Token]: child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS ] return [tok] + verb_modifiers + +def filter_cue_candidates(tok, verbs): + return all([ + tok.pos == VERB, + tok.lemma_ in verbs + ]) + +# def get_cue_candidates(window_sents, verbs): +# return [ +# tok +# for sent in window_sents +# for tok in sent +# if filter_cue_candidates(tok, verbs) +# ] + +def filter_speaker_candidates(candidate, i, j): + """ + Evaluates whether the candidate is not punctuation and that it is outside of the quote. + """ + return all([ + candidate.pos!=PUNCT, + ((candidate.i >= i and candidate.i >= j) or (candidate.i <= i and candidate.i <= j)), + ]) + +def line_break_window(i, j, doc): + """ + Finds the boundaries of the paragraph containing doc[i:j]. + """ + for i_, j_ in list(zip( + [tok.i for tok in doc if tok.text=="\n"], + [tok.i for tok in doc if tok.text=="\n"][1:]) + ): + if i_ <= i and j_ >= j: + return (i_, j_) + else: + return (None, None) + +def windower(i, j, doc, para=False) -> Iterable: + """ + Two ways to search for cue and speaker: the old way, and a new way based on line breaks. + """ + if para: + i_, j_ = line_break_window(i, j, doc) + if i_: + return ( + sent + for sent in doc[i_+1:j_-1].sents + ) + else: + return [] + else: + # get window of adjacent/overlapping sentences + return ( + sent + for sent in doc.sents + # these boundary cases are a subtle bit of work... + if ( + (sent.start < i and sent.end >= i - 1) + or (sent.start <= j + 1 and sent.end > j) + ) + ) \ No newline at end of file diff --git a/tests/extract/test_triples.py b/tests/extract/test_triples.py index 85c3f2d6..ad95d755 100644 --- a/tests/extract/test_triples.py +++ b/tests/extract/test_triples.py @@ -92,7 +92,7 @@ def sss_doc(lang_en): (["She", "friend"], ["sell"], ["sea", "shells"]), (["She", "friend"], ["throw"], ["sea", "shells"]), ], - ), + ) ], ) def test_subject_verb_object_triples(text, svos_exp, lang_en): From 1c8a8ac7de421b0e08e54da36b08b7ad32e30aef Mon Sep 17 00:00:00 2001 From: Andy Friedman Date: Thu, 15 Jun 2023 15:13:38 -0400 Subject: [PATCH 4/7] updates made, tests added --- src/textacy/constants.py | 77 ++++++++------ src/textacy/extract/triples.py | 181 +++++++++++++++++++-------------- tests/extract/test_triples.py | 31 ++++++ 3 files changed, 183 insertions(+), 106 deletions(-) diff --git a/src/textacy/constants.py b/src/textacy/constants.py index f66694d0..94f1b860 100644 --- a/src/textacy/constants.py +++ b/src/textacy/constants.py @@ -21,38 +21,49 @@ OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"} AUX_DEPS: set[str] = {"aux", "auxpass", "neg"} -""" -Ordinal points of the token.is_quote characters, matched up by start and end. - -source: -switch = "\"\'" -start = "“‘```“‘«‹「『„‚" -end = "”’’’’”’»›」』”’" +MIN_QUOTE_LENGTH: int=4 -""" QUOTATION_MARK_PAIRS = { - (34, 34), - (39, 39), - (96, 8217), - (171, 187), - (8216, 8217), - (8218, 8217), - (8220, 8221), - (8222, 8221), - (8249, 8250), - (12300, 12301), - (12302, 12303), - (8220, 34), - (8216, 34), - (96, 34), - (8216, 34), - (171, 34), - (8249, 34), - (12300, 34), - (12302, 34), - (8222, 34), - (8218, 34), - (34, 8221) + # """ + # Ordinal points of the token.is_quote characters, matched up by start and end. + + # Some of these pairs are from weirdly formatted newspaper uploads, so could be some noise! + + # source: + # switch = "\"\'" + # start = "“‘```“‘«‹「『„‚" + # end = "”’’’’”’»›」』”’" + # """ + (34, 34), # " " + (39, 39), # ' ' + (96, 8217), # ` ’ + (171, 187), # « » + (8216, 8217), # ‘ ’ + (8218, 8217), # ‚ ’ + (8220, 8221), # “ ” + (8222, 8221), # „ ” + (8249, 8250), # ‹ › + (12300, 12301), #「 」 + (12302, 12303), #『 』 + (8220, 34), # “ " + (8216, 34), # ‘ " + (96, 34), # ` " + (8216, 34), # ‘ " + (171, 34), # « " + (8249, 34), # ‹ " + (12300, 34), #「 " + (12302, 34), #『 " + (8222, 34), # „ " + (8218, 34), # ‚ " + (34, 8221), # " ” + (34, 8217), # " ’ + (34, 10), + (39, 10), + (96, 10), + (171, 10), + (8216, 10), + (8218, 10), + (8249, 10) } REPORTING_VERBS: dict[str, set[str]] = { @@ -230,3 +241,9 @@ ) RE_ALNUM: Pattern = re.compile(r"[^\W_]+") + +# regexes for quote detection prep +ALL_QUOTES = '‹「`»」‘"„›”‚’\'』『«“' +DOUBLE_QUOTES = '‹「」»"„『”‚』›«“' +ANY_DOUBLE_QUOTE_REGEX = r"[{}]".format(DOUBLE_QUOTES) +DOUBLE_QUOTES_NOSPACE_REGEX = r"(?<=\S)([{}])(?=\S)".format(DOUBLE_QUOTES) diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index f6f71d44..8e136525 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -27,9 +27,10 @@ nsubjpass, obj, pobj, - xcomp, + xcomp ) from spacy.tokens import Doc, Span, Token +import regex as re from .. import constants, types, utils from . import matches @@ -209,6 +210,7 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: Args: doc + min_quote_length - minimum distance (in tokens) between potentially paired quotation marks. Yields: Next direct quotation in ``doc`` as a (speaker, cue, content) triple. @@ -226,21 +228,29 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: ) # pairs up quotation-like characters based on acceptable start/end combos # see constants for more info - qtok = [tok for tok in doc if tok.is_quote] - qtok_pair_idxs = [] - for n, q in enumerate(qtok): - if q.i not in [q_[1] for q_ in qtok_pair_idxs]: - for q_ in qtok[n+1:]: + qtoks = [tok for tok in doc if tok.is_quote] + linebreaks = [t.i for t in doc if t.is_space and t.text == "\n"] + qtok_idx_pairs = [(-1,-1)] + for q in qtoks: + if ( + not bool(q.whitespace_) + and q.i not in [q_[1] for q_ in qtok_idx_pairs] + and q.i > qtok_idx_pairs[-1][1] + ): + try: + lb = next(l for l in linebreaks if l >= q.i) + q_range = [q_ for q_ in qtoks if q_.i > q.i and q_.i <= lb] + except StopIteration: + q_range = [q_ for q_ in qtoks if q_.i > q.i] + for q_ in q_range: if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS: - qtok_pair_idxs.append((q.i, q_.i)) + qtok_idx_pairs.append((q.i, q_.i)) break def filter_quote_tokens(tok): - return any( - qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs - ) + return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs) - for qtok_start_idx, qtok_end_idx in qtok_pair_idxs: + for qtok_start_idx, qtok_end_idx in qtok_idx_pairs: content = doc[qtok_start_idx : qtok_end_idx + 1] cue = None speaker = None @@ -248,7 +258,7 @@ def filter_quote_tokens(tok): if ( # quotations should have at least a couple tokens # excluding the first/last quotation mark tokens - len(content) < 4 + len(content) < constants.MIN_QUOTE_LENGTH # filter out titles of books and such, if possible or all( tok.is_title @@ -259,31 +269,31 @@ def filter_quote_tokens(tok): ): continue - triple = None - for n, window_sents in enumerate([ + for window_sents in [ windower(qtok_start_idx, qtok_end_idx, doc, True), windower(qtok_start_idx, qtok_end_idx, doc) - ]): + ]: # get candidate cue verbs in window - cue_cands = [ - tok - for sent in window_sents - for tok in sent - if not filter_quote_tokens(tok) - and filter_cue_candidates(tok, _reporting_verbs) - ] - # sort candidates by proximity to quote content - cue_cands = sorted( - cue_cands, - key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)), + cue_candidates = [ + tok + for sent in window_sents + for tok in sent + if tok.pos == VERB + and tok.lemma_ in _reporting_verbs + and not filter_quote_tokens(tok) + ] + cue_candidates = sorted(cue_candidates, + key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)) ) - for cue_cand in cue_cands: + for cue_cand in cue_candidates: if cue is not None: break speaker_cands = [ speaker_cand for speaker_cand in cue_cand.children - if not filter_quote_tokens(speaker_cand) - and filter_speaker_candidates(speaker_cand, qtok_start_idx, qtok_end_idx) + if speaker_cand.pos!=PUNCT + and not filter_quote_tokens(speaker_cand) + and ((speaker_cand.i >= qtok_end_idx) + or (speaker_cand.i <= qtok_start_idx )) ] for speaker_cand in speaker_cands: if speaker_cand.dep in _ACTIVE_SUBJ_DEPS: @@ -298,7 +308,6 @@ def filter_quote_tokens(tok): ) break - def expand_noun(tok: Token) -> list[Token]: """Expand a noun token to include all associated conjunct and compound nouns.""" tok_and_conjuncts = [tok] + list(tok.conjuncts) @@ -306,66 +315,25 @@ def expand_noun(tok: Token) -> list[Token]: child for tc in tok_and_conjuncts for child in tc.children - # TODO: why doesn't compound import from spacy.symbols? if child.dep_ == "compound" ] return tok_and_conjuncts + compounds - def expand_verb(tok: Token) -> list[Token]: """Expand a verb token to include all associated auxiliary and negation tokens.""" verb_modifiers = [ child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS ] return [tok] + verb_modifiers - -def filter_cue_candidates(tok, verbs): - return all([ - tok.pos == VERB, - tok.lemma_ in verbs - ]) - -# def get_cue_candidates(window_sents, verbs): -# return [ -# tok -# for sent in window_sents -# for tok in sent -# if filter_cue_candidates(tok, verbs) -# ] - -def filter_speaker_candidates(candidate, i, j): - """ - Evaluates whether the candidate is not punctuation and that it is outside of the quote. - """ - return all([ - candidate.pos!=PUNCT, - ((candidate.i >= i and candidate.i >= j) or (candidate.i <= i and candidate.i <= j)), - ]) - -def line_break_window(i, j, doc): - """ - Finds the boundaries of the paragraph containing doc[i:j]. - """ - for i_, j_ in list(zip( - [tok.i for tok in doc if tok.text=="\n"], - [tok.i for tok in doc if tok.text=="\n"][1:]) - ): - if i_ <= i and j_ >= j: - return (i_, j_) - else: - return (None, None) -def windower(i, j, doc, para=False) -> Iterable: +def windower(i, j, doc, by_linebreak: bool=False) -> Iterable: """ Two ways to search for cue and speaker: the old way, and a new way based on line breaks. """ - if para: + if by_linebreak: i_, j_ = line_break_window(i, j, doc) - if i_: - return ( - sent - for sent in doc[i_+1:j_-1].sents - ) + if i_ is not None: + return (sent for sent in doc[i_+1:j_-1].sents) else: return [] else: @@ -378,4 +346,65 @@ def windower(i, j, doc, para=False) -> Iterable: (sent.start < i and sent.end >= i - 1) or (sent.start <= j + 1 and sent.end > j) ) - ) \ No newline at end of file + ) + +def line_break_window(i, j, doc): + """ + Finds the boundaries of the paragraph containing doc[i:j]. + """ + lb_tok_idxs = [tok.i for tok in doc if tok.text == "\n"] + for i_, j_ in zip(lb_tok_idxs, lb_tok_idxs[1:]): + if i_ <= i and j_ >= j: + return (i_, j_) + else: + return (None, None) + +def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool=True) -> str: + """ + Sorts out some common issues that trip up the quote detector. Works best one paragraph at a time -- use prep_document_for_quote_detection for the whole doc. + + - replaces consecutive apostrophes with a double quote (no idea why this happens but it does) + - adds spaces before or after double quotes that don't have them + - if enabled, fixes plural possessives by adding an "x", because the hanging apostrophe can trigger quote detection. + - adds a double quote to the end of paragraphs that are continuations of quotes and thus traditionally don't end with quotation marks + + Input: + t (str) - text to be prepped, preferably one paragraph + fix_plural_possessives (bool) - enables fix_plural_possessives + + Output: + t (str) - text prepped for quote detection + """ + if not t: + return + + t = t.replace("\'\'", "\"") + if fix_plural_possessives: + t = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t) + while re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, p): + match = re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, p) + if len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, p[:match.start()])) % 2 != 0: + replacer = '" ' + else: + replacer = ' "' + p = p[:match.start()] + replacer + p[match.end():] + if ( + not (p[0] == "'" and p[-1] == "'") + and p[0] in constants.ALL_QUOTES + and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, p[1:])) % 2 == 0 + ): + p += '"' + return p.strip() + +def prep_document_for_quote_detection(t: str, para_char: str="\n") -> str: + """ + Splits text into paragraphs (on para_char), runs prep_text_for_quote_detection on all paragraphs, then reassembles with para_char. + + Input: + t (str) - document to prep for quote detection + para_char (str) - paragraph boundary in t + + Output: + document prepped for quote detection + """ + return para_char.join([prep_text_for_quote_detection(t) for t in t.split(para_char) if t]) \ No newline at end of file diff --git a/tests/extract/test_triples.py b/tests/extract/test_triples.py index ad95d755..4110dafc 100644 --- a/tests/extract/test_triples.py +++ b/tests/extract/test_triples.py @@ -95,6 +95,7 @@ def sss_doc(lang_en): ) ], ) + def test_subject_verb_object_triples(text, svos_exp, lang_en): doc = lang_en(text) svos = list(extract.subject_verb_object_triples(doc)) @@ -255,3 +256,33 @@ def test_direct_quotations_spanish(lang_es, text, exp): for speaker, cue, content in obs ] assert obs_text == exp + + +@pytest.mark.parametrize( + "text, speakers", + [( # tests whitespace check -- don't start quotes with quotation marks if tok.whitespace == True + """'AMANDA, you stabbed me," a stunned Anna Ramirez told her identical twin early Saturday outside their Camden home. Edward said prosecutors had established "clear and convincing evidence" that Ramirez should be held.""", + ['Edward'] + ), + ( # tests that odd numbers of quotation marks can be parsed + """He approached the stable. "Where are the horses' carrots?" he asked.""", + ['he'] + ), + ( # tests that overlapping quotes are ignored + "The stranger was eating a burger. He told everyone, \"This 'hamburger with extra cheese and pickles' is good.\"", + ['He'] + ), + ( # tests ending quotes at linebreaks + "\'uneasy\' on Gilmer street\"\nPolice are investigating a shooting. They did not identify the 17-year-old because he is a minor.\n\"Detectives are looking into it,\" Richmond police said in a news release Tuesday morning.", + ['Richmond', 'police'] + ), + ( # tests second use of windower + 'He pounces on counters and jabs the gun at tellers, only 2 pounds of pressure preventing that gun from firing.\n"He\'s going to hurt somebody because he\'s carrying a revolver with the hammer cocked back. He\'s saying he\'s going to kill people," said Paul Martin, a Pinellas sheriff\'s detective who has been chasing the robber for two years. "He\'s pointed guns at people\'s heads."\nThey call him the crowbar robber because in ', + ['Paul', 'Martin', 'Paul', 'Martin'] + ) + ] + +) +def test_adjustment_for_quote_detection(lang_en, text, speakers): + quotes = extract.direct_quotations(lang_en(text)) + assert [speaker.text for quote in quotes for speaker in quote.speaker] == speakers From ff1ce50af92b5b38336e0d44be7fdd7c12ce8c51 Mon Sep 17 00:00:00 2001 From: Andy Friedman Date: Thu, 15 Jun 2023 15:21:04 -0400 Subject: [PATCH 5/7] simplifying integrating linebreaks into quote detection --- src/textacy/extract/triples.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index 8e136525..74ba7aa5 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -228,21 +228,15 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: ) # pairs up quotation-like characters based on acceptable start/end combos # see constants for more info - qtoks = [tok for tok in doc if tok.is_quote] - linebreaks = [t.i for t in doc if t.is_space and t.text == "\n"] + qtoks = [tok for tok in doc if tok.is_quote or (tok.is_space and tok.text == "\n")] qtok_idx_pairs = [(-1,-1)] - for q in qtoks: + for n, q in enumerate(qtoks): if ( not bool(q.whitespace_) and q.i not in [q_[1] for q_ in qtok_idx_pairs] and q.i > qtok_idx_pairs[-1][1] ): - try: - lb = next(l for l in linebreaks if l >= q.i) - q_range = [q_ for q_ in qtoks if q_.i > q.i and q_.i <= lb] - except StopIteration: - q_range = [q_ for q_ in qtoks if q_.i > q.i] - for q_ in q_range: + for q_ in qtoks[n+1:]: if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS: qtok_idx_pairs.append((q.i, q_.i)) break From e2568fea056407fd70f01ce596d30199de3b76b1 Mon Sep 17 00:00:00 2001 From: Andy Friedman Date: Tue, 20 Jun 2023 10:45:00 -0400 Subject: [PATCH 6/7] final revamp --- src/textacy/extract/triples.py | 88 ++++++++++++++++++---------------- tests/extract/test_triples.py | 61 ++++++++++++----------- 2 files changed, 79 insertions(+), 70 deletions(-) diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index 74ba7aa5..5d13a5c0 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -9,7 +9,7 @@ import collections from operator import attrgetter -from typing import Iterable, Mapping, Optional, Pattern +from typing import Iterable, Mapping, Optional, Pattern, Literal from spacy.symbols import ( AUX, @@ -228,7 +228,7 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: ) # pairs up quotation-like characters based on acceptable start/end combos # see constants for more info - qtoks = [tok for tok in doc if tok.is_quote or (tok.is_space and tok.text == "\n")] + qtoks = [tok for tok in doc if tok.is_quote or (re.match(r"\n", tok.text))] qtok_idx_pairs = [(-1,-1)] for n, q in enumerate(qtoks): if ( @@ -239,34 +239,29 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: for q_ in qtoks[n+1:]: if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS: qtok_idx_pairs.append((q.i, q_.i)) - break + break + qtok_idx_pairs = qtok_idx_pairs[1:] def filter_quote_tokens(tok): return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs) for qtok_start_idx, qtok_end_idx in qtok_idx_pairs: - content = doc[qtok_start_idx : qtok_end_idx + 1] + content = doc[qtok_start_idx : qtok_end_idx] cue = None speaker = None - # filter quotations by content + if ( - # quotations should have at least a couple tokens - # excluding the first/last quotation mark tokens - len(content) < constants.MIN_QUOTE_LENGTH + len(content.text.split()) < constants.MIN_QUOTE_LENGTH # filter out titles of books and such, if possible or all( tok.is_title for tok in content - # if tok.pos in {NOUN, PROPN} if not (tok.is_punct or tok.is_stop) ) ): continue - for window_sents in [ - windower(qtok_start_idx, qtok_end_idx, doc, True), - windower(qtok_start_idx, qtok_end_idx, doc) - ]: + for window_sents in [windower(content, "overlap"), windower(content, "linebreaks")]: # get candidate cue verbs in window cue_candidates = [ tok @@ -298,7 +293,7 @@ def filter_quote_tokens(tok): yield DQTriple( speaker=sorted(speaker, key=attrgetter("i")), cue=sorted(cue, key=attrgetter("i")), - content=content, + content=doc[qtok_start_idx:qtok_end_idx+1], ) break @@ -320,38 +315,47 @@ def expand_verb(tok: Token) -> list[Token]: ] return [tok] + verb_modifiers -def windower(i, j, doc, by_linebreak: bool=False) -> Iterable: +def windower(quote: Span, method: Literal["overlap", "linebreaks"]): """ - Two ways to search for cue and speaker: the old way, and a new way based on line breaks. - """ - if by_linebreak: - i_, j_ = line_break_window(i, j, doc) - if i_ is not None: - return (sent for sent in doc[i_+1:j_-1].sents) - else: - return [] - else: - # get window of adjacent/overlapping sentences - return ( - sent - for sent in doc.sents - # these boundary cases are a subtle bit of work... - if ( - (sent.start < i and sent.end >= i - 1) - or (sent.start <= j + 1 and sent.end > j) - ) - ) + Finds the range of sentences in which to look for quote attribution. -def line_break_window(i, j, doc): - """ - Finds the boundaries of the paragraph containing doc[i:j]. + 3 ways: + - "overlap": any sentences that overlap with the quote span + - "linebreaks": overlap sentences +/- one sentence, without crossing linebreaks after the quote + - None: overlap sentences +/- one sentence, + + Input: + quote (Span) - quote to be attributed + method (str) - how the sentence range will be determined + + Output: + sents (list) - list of sentences """ - lb_tok_idxs = [tok.i for tok in doc if tok.text == "\n"] - for i_, j_ in zip(lb_tok_idxs, lb_tok_idxs[1:]): - if i_ <= i and j_ >= j: - return (i_, j_) + if method == "overlap": + return [ + sent for sent in quote.doc.sents + if (sent.start < quote.start < sent.end) + or (sent.start < quote.end < sent.end) + ] else: - return (None, None) + sent_indexes = [ + n for n, s in enumerate(quote.doc.sents) + if (s.start <= quote.start <= s.end) + or (s.start <= quote.end <= s.end) + ] + + i_sent = sent_indexes[0] - 1 if sent_indexes[0] > 0 else 0 + j_sent = sent_indexes[-1]+2 + sents = list(quote.doc.sents)[i_sent:j_sent] + if method == "linebreaks": + linebreaks = [0] + [tok.i for tok in quote.doc if re.match(r"\n", tok.text)] + [quote.doc[-1].i] + linebreak_limits = [ + lb for lb in linebreaks + if sents[0].start < lb <= quote.end + 1 + ] + if linebreak_limits: + return [s for s in sents if s.end <= max(linebreak_limits)] + return sents def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool=True) -> str: """ diff --git a/tests/extract/test_triples.py b/tests/extract/test_triples.py index 4110dafc..cef52d72 100644 --- a/tests/extract/test_triples.py +++ b/tests/extract/test_triples.py @@ -192,8 +192,8 @@ def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp "text, exp", [ ( - 'Burton said, "I love cats!"', - [(["Burton"], ["said"], '"I love cats!"')], + 'Burton said, "I love those cats!"', + [(["Burton"], ["said"], '"I love those cats!"')], ), # NOTE: this case is failing as of spacy v3.2 # let's hide it for now so that tests pass overall @@ -202,14 +202,14 @@ def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp # [(["Burton", "Nick"], ["reply"], '"We love cats!"')], # ), ( - 'Burton explained from a podium. "I love cats," he said.', - [(["he"], ["said"], '"I love cats,"')], + 'Burton explained from a podium. "I love those cats," he said.', + [(["he"], ["said"], '"I love those cats,"')], ), ( - '"I love cats!" insists Burton. "I absolutely do."', + '"I love those cats!" insists Burton. "Yeah, I absolutely do."', [ - (["Burton"], ["insists"], '"I love cats!"'), - (["Burton"], ["insists"], '"I absolutely do."'), + (["Burton"], ["insists"], '"I love those cats!"'), + (["Burton"], ["insists"], '"Yeah, I absolutely do."'), ], ), ( @@ -260,29 +260,34 @@ def test_direct_quotations_spanish(lang_es, text, exp): @pytest.mark.parametrize( "text, speakers", - [( # tests whitespace check -- don't start quotes with quotation marks if tok.whitespace == True - """'AMANDA, you stabbed me," a stunned Anna Ramirez told her identical twin early Saturday outside their Camden home. Edward said prosecutors had established "clear and convincing evidence" that Ramirez should be held.""", - ['Edward'] - ), - ( # tests that odd numbers of quotation marks can be parsed - """He approached the stable. "Where are the horses' carrots?" he asked.""", - ['he'] - ), - ( # tests that overlapping quotes are ignored - "The stranger was eating a burger. He told everyone, \"This 'hamburger with extra cheese and pickles' is good.\"", - ['He'] - ), - ( # tests ending quotes at linebreaks - "\'uneasy\' on Gilmer street\"\nPolice are investigating a shooting. They did not identify the 17-year-old because he is a minor.\n\"Detectives are looking into it,\" Richmond police said in a news release Tuesday morning.", - ['Richmond', 'police'] - ), - ( # tests second use of windower - 'He pounces on counters and jabs the gun at tellers, only 2 pounds of pressure preventing that gun from firing.\n"He\'s going to hurt somebody because he\'s carrying a revolver with the hammer cocked back. He\'s saying he\'s going to kill people," said Paul Martin, a Pinellas sheriff\'s detective who has been chasing the robber for two years. "He\'s pointed guns at people\'s heads."\nThey call him the crowbar robber because in ', - ['Paul', 'Martin', 'Paul', 'Martin'] + [ + ( # tests that odd numbers of quotation marks can be parsed + """He approached the stable. "Where are the horses' carrots?" he asked.""", + ['he'] + ), + ( # tests that overlapping quotes are ignored + "The stranger was eating a burger. He told everyone, \"This 'hamburger with extra cheese and pickles' is good.\"", + ['He'] + ), + ( # tests ending quotes at linebreaks + "\'uneasy\' on Gilmer street\"\nPolice are investigating a shooting. They did not identify the 17-year-old because he is a minor.\n\"Detectives are looking into it,\" Richmond police said in a news release Tuesday morning.", + ['Richmond', 'police'] + ), + ( # tests second use of windower + 'He pounces on counters and jabs the gun at tellers, only 2 pounds of pressure preventing that gun from firing.\n"He\'s going to hurt somebody because he\'s carrying a revolver with the hammer cocked back. He\'s saying he\'s going to kill people," said Paul Martin, a Pinellas sheriff\'s detective who has been chasing the robber for two years. "He\'s pointed guns at people\'s heads."\nThey call him the crowbar robber.', + ['Paul', 'Martin', 'Paul', 'Martin'] + ), + ( # checks that attributions don't leak over linebreaks (if they aren't supposed to) + "And despite perhaps sometimes seeming like superheroes, sandwich aritsts are just like everyone else in society. Conney said in order to be what the people need, the job requires them to \"do their best to deal with it and fix whatever problem there is at that moment.\"\nThe chief said being involved in violence is never easy for anyone, but added that a sandwich artists' work isn't necessarily done when they leave the scene.", + ['Conney'] + ), + ( # find quotes in last sentences + "Garnier said he also learned a lesson from the ordeal.\n\"Think before you act,\" the clown said. \"Your actions have repercussions. No matter how trivial and joking I thought it was, people took it seriously.\"", + ['clown', 'clown'] + ) + ] ) - ] -) def test_adjustment_for_quote_detection(lang_en, text, speakers): quotes = extract.direct_quotations(lang_en(text)) assert [speaker.text for quote in quotes for speaker in quote.speaker] == speakers From 6170a7b9d18c2f8d677cd188b7c0ba9b5bc7b476 Mon Sep 17 00:00:00 2001 From: Andy Friedman Date: Fri, 21 Jul 2023 15:28:00 -0400 Subject: [PATCH 7/7] files formatted with black, changed code to use `re` instead of `regex` package, min_quote_length is now a `direct_quotations` parameter (not a constant), added a better example for testing linebreaks that function as closing quotes --- src/textacy/constants.py | 73 +++++++++-------- src/textacy/extract/triples.py | 143 +++++++++++++++++++-------------- tests/extract/test_triples.py | 64 ++++++++------- 3 files changed, 153 insertions(+), 127 deletions(-) diff --git a/src/textacy/constants.py b/src/textacy/constants.py index 94f1b860..0ee360d1 100644 --- a/src/textacy/constants.py +++ b/src/textacy/constants.py @@ -21,50 +21,49 @@ OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"} AUX_DEPS: set[str] = {"aux", "auxpass", "neg"} -MIN_QUOTE_LENGTH: int=4 - QUOTATION_MARK_PAIRS = { # """ # Ordinal points of the token.is_quote characters, matched up by start and end. - # Some of these pairs are from weirdly formatted newspaper uploads, so could be some noise! - # source: - # switch = "\"\'" + # either = "\"\'" # start = "“‘```“‘«‹「『„‚" # end = "”’’’’”’»›」』”’" # """ - (34, 34), # " " - (39, 39), # ' ' - (96, 8217), # ` ’ - (171, 187), # « » - (8216, 8217), # ‘ ’ - (8218, 8217), # ‚ ’ - (8220, 8221), # “ ” - (8222, 8221), # „ ” - (8249, 8250), # ‹ › - (12300, 12301), #「 」 - (12302, 12303), #『 』 - (8220, 34), # “ " - (8216, 34), # ‘ " - (96, 34), # ` " - (8216, 34), # ‘ " - (171, 34), # « " - (8249, 34), # ‹ " - (12300, 34), #「 " - (12302, 34), #『 " - (8222, 34), # „ " - (8218, 34), # ‚ " - (34, 8221), # " ” - (34, 8217), # " ’ - (34, 10), - (39, 10), - (96, 10), - (171, 10), - (8216, 10), - (8218, 10), - (8249, 10) - } + (34, 34), # " " + (39, 39), # ' ' + (96, 8217), # ` ’ + (171, 187), # « » + (8216, 8217), # ‘ ’ + (8218, 8217), # ‚ ’ + (8220, 8221), # “ ” + (8222, 8221), # „ ” + (8249, 8250), # ‹ › + (12300, 12301), # 「 」 + (12302, 12303), # 『 』 + (8220, 34), # “ " + (8216, 34), # ‘ " + (96, 34), # ` " + (8216, 34), # ‘ " + (171, 34), # « " + (8249, 34), # ‹ " + (12300, 34), # 「 " + (12302, 34), # 『 " + (8222, 34), # „ " + (8218, 34), # ‚ " + (34, 8221), # " ” + (34, 8217), # " ’ + (34, 10), # " \n + (39, 10), # ' \n + (96, 10), # ` \n + (171, 10), # « \n + (8216, 10), # ‘ \n + (8218, 10), # ‚ \n + (8220, 10), # “ \n + (8249, 10), # ‹ \n + (12300, 10), # 「 \n + (12302, 10), # 『 \n +} REPORTING_VERBS: dict[str, set[str]] = { "en": { @@ -243,7 +242,7 @@ RE_ALNUM: Pattern = re.compile(r"[^\W_]+") # regexes for quote detection prep -ALL_QUOTES = '‹「`»」‘"„›”‚’\'』『«“' +ALL_QUOTES = "‹「`»」‘\"„›”‚’'』『«“" DOUBLE_QUOTES = '‹「」»"„『”‚』›«“' ANY_DOUBLE_QUOTE_REGEX = r"[{}]".format(DOUBLE_QUOTES) DOUBLE_QUOTES_NOSPACE_REGEX = r"(?<=\S)([{}])(?=\S)".format(DOUBLE_QUOTES) diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py index 5d13a5c0..5a4d8454 100644 --- a/src/textacy/extract/triples.py +++ b/src/textacy/extract/triples.py @@ -27,10 +27,10 @@ nsubjpass, obj, pobj, - xcomp + xcomp, ) from spacy.tokens import Doc, Span, Token -import regex as re +import re from .. import constants, types, utils from . import matches @@ -203,7 +203,7 @@ def semistructured_statements( ) -def direct_quotations(doc: Doc) -> Iterable[DQTriple]: +def direct_quotations(doc: Doc, min_quote_length=4) -> Iterable[DQTriple]: """ Extract direct quotations with an attributable speaker from a document using simple rules and patterns. Does not extract indirect or mixed quotations! @@ -229,60 +229,65 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]: # pairs up quotation-like characters based on acceptable start/end combos # see constants for more info qtoks = [tok for tok in doc if tok.is_quote or (re.match(r"\n", tok.text))] - qtok_idx_pairs = [(-1,-1)] + qtok_idx_pairs = [(-1, -1)] for n, q in enumerate(qtoks): if ( not bool(q.whitespace_) - and q.i not in [q_[1] for q_ in qtok_idx_pairs] + and q.i not in [q_[1] for q_ in qtok_idx_pairs] and q.i > qtok_idx_pairs[-1][1] - ): - for q_ in qtoks[n+1:]: + ): + for q_ in qtoks[n + 1 :]: if (ord(q.text), ord(q_.text)) in constants.QUOTATION_MARK_PAIRS: qtok_idx_pairs.append((q.i, q_.i)) - break + break qtok_idx_pairs = qtok_idx_pairs[1:] - + def filter_quote_tokens(tok): return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs) for qtok_start_idx, qtok_end_idx in qtok_idx_pairs: - content = doc[qtok_start_idx : qtok_end_idx] + content = doc[qtok_start_idx:qtok_end_idx] cue = None speaker = None if ( - len(content.text.split()) < constants.MIN_QUOTE_LENGTH + len(content.text.split()) < min_quote_length # filter out titles of books and such, if possible - or all( - tok.is_title - for tok in content - if not (tok.is_punct or tok.is_stop) - ) + or all(tok.is_title for tok in content if not (tok.is_punct or tok.is_stop)) ): continue - for window_sents in [windower(content, "overlap"), windower(content, "linebreaks")]: - # get candidate cue verbs in window + for window_sents in [ + windower(content, "overlap"), + windower(content, "linebreaks"), + ]: + # get candidate cue verbs in window cue_candidates = [ - tok - for sent in window_sents - for tok in sent - if tok.pos == VERB - and tok.lemma_ in _reporting_verbs - and not filter_quote_tokens(tok) - ] - cue_candidates = sorted(cue_candidates, - key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)) + tok + for sent in window_sents + for tok in sent + if tok.pos == VERB + and tok.lemma_ in _reporting_verbs + and not filter_quote_tokens(tok) + ] + cue_candidates = sorted( + cue_candidates, + key=lambda cc: min( + abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx) + ), ) for cue_cand in cue_candidates: if cue is not None: break speaker_cands = [ - speaker_cand for speaker_cand in cue_cand.children - if speaker_cand.pos!=PUNCT + speaker_cand + for speaker_cand in cue_cand.children + if speaker_cand.pos != PUNCT and not filter_quote_tokens(speaker_cand) - and ((speaker_cand.i >= qtok_end_idx) - or (speaker_cand.i <= qtok_start_idx )) + and ( + (speaker_cand.i >= qtok_end_idx) + or (speaker_cand.i <= qtok_start_idx) + ) ] for speaker_cand in speaker_cands: if speaker_cand.dep in _ACTIVE_SUBJ_DEPS: @@ -293,10 +298,11 @@ def filter_quote_tokens(tok): yield DQTriple( speaker=sorted(speaker, key=attrgetter("i")), cue=sorted(cue, key=attrgetter("i")), - content=doc[qtok_start_idx:qtok_end_idx+1], + content=doc[qtok_start_idx : qtok_end_idx + 1], ) break + def expand_noun(tok: Token) -> list[Token]: """Expand a noun token to include all associated conjunct and compound nouns.""" tok_and_conjuncts = [tok] + list(tok.conjuncts) @@ -308,14 +314,16 @@ def expand_noun(tok: Token) -> list[Token]: ] return tok_and_conjuncts + compounds + def expand_verb(tok: Token) -> list[Token]: """Expand a verb token to include all associated auxiliary and negation tokens.""" verb_modifiers = [ child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS ] return [tok] + verb_modifiers - -def windower(quote: Span, method: Literal["overlap", "linebreaks"]): + + +def windower(quote: Span, method: Literal["overlap", "linebreaks"]) -> Iterable[Span]: """ Finds the range of sentences in which to look for quote attribution. @@ -333,68 +341,77 @@ def windower(quote: Span, method: Literal["overlap", "linebreaks"]): """ if method == "overlap": return [ - sent for sent in quote.doc.sents - if (sent.start < quote.start < sent.end) - or (sent.start < quote.end < sent.end) - ] + sent + for sent in quote.doc.sents + if (sent.start < quote.start < sent.end) + or (sent.start < quote.end < sent.end) + ] else: sent_indexes = [ - n for n, s in enumerate(quote.doc.sents) - if (s.start <= quote.start <= s.end) - or (s.start <= quote.end <= s.end) - ] - + n + for n, s in enumerate(quote.doc.sents) + if (s.start <= quote.start <= s.end) or (s.start <= quote.end <= s.end) + ] + i_sent = sent_indexes[0] - 1 if sent_indexes[0] > 0 else 0 - j_sent = sent_indexes[-1]+2 + j_sent = sent_indexes[-1] + 2 sents = list(quote.doc.sents)[i_sent:j_sent] if method == "linebreaks": - linebreaks = [0] + [tok.i for tok in quote.doc if re.match(r"\n", tok.text)] + [quote.doc[-1].i] + linebreaks = ( + [0] + + [tok.i for tok in quote.doc if re.match(r"\n", tok.text)] + + [quote.doc[-1].i] + ) linebreak_limits = [ - lb for lb in linebreaks - if sents[0].start < lb <= quote.end + 1 - ] + lb for lb in linebreaks if sents[0].start < lb <= quote.end + 1 + ] if linebreak_limits: return [s for s in sents if s.end <= max(linebreak_limits)] - return sents - -def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool=True) -> str: + return sents + + +def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool = True) -> str: """ Sorts out some common issues that trip up the quote detector. Works best one paragraph at a time -- use prep_document_for_quote_detection for the whole doc. - replaces consecutive apostrophes with a double quote (no idea why this happens but it does) - adds spaces before or after double quotes that don't have them - - if enabled, fixes plural possessives by adding an "x", because the hanging apostrophe can trigger quote detection. + - if enabled, fixes plural possessives by adding an "x", because the hanging apostrophe can trigger quote detection. - adds a double quote to the end of paragraphs that are continuations of quotes and thus traditionally don't end with quotation marks Input: t (str) - text to be prepped, preferably one paragraph fix_plural_possessives (bool) - enables fix_plural_possessives - + Output: t (str) - text prepped for quote detection """ if not t: return - - t = t.replace("\'\'", "\"") + + t = t.replace("''", '"') if fix_plural_possessives: t = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t) while re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, p): match = re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, p) - if len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, p[:match.start()])) % 2 != 0: + if ( + len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, p[: match.start()])) % 2 + != 0 + ): replacer = '" ' else: replacer = ' "' - p = p[:match.start()] + replacer + p[match.end():] + p = p[: match.start()] + replacer + p[match.end() :] if ( - not (p[0] == "'" and p[-1] == "'") - and p[0] in constants.ALL_QUOTES + not (p[0] == "'" and p[-1] == "'") + and p[0] in constants.ALL_QUOTES and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, p[1:])) % 2 == 0 - ): + ): p += '"' return p.strip() -def prep_document_for_quote_detection(t: str, para_char: str="\n") -> str: + +def prep_document_for_quote_detection(t: str, para_char: str = "\n") -> str: """ Splits text into paragraphs (on para_char), runs prep_text_for_quote_detection on all paragraphs, then reassembles with para_char. @@ -405,4 +422,6 @@ def prep_document_for_quote_detection(t: str, para_char: str="\n") -> str: Output: document prepped for quote detection """ - return para_char.join([prep_text_for_quote_detection(t) for t in t.split(para_char) if t]) \ No newline at end of file + return para_char.join( + [prep_text_for_quote_detection(t) for t in t.split(para_char) if t] + ) diff --git a/tests/extract/test_triples.py b/tests/extract/test_triples.py index cef52d72..f9696632 100644 --- a/tests/extract/test_triples.py +++ b/tests/extract/test_triples.py @@ -92,10 +92,9 @@ def sss_doc(lang_en): (["She", "friend"], ["sell"], ["sea", "shells"]), (["She", "friend"], ["throw"], ["sea", "shells"]), ], - ) + ), ], ) - def test_subject_verb_object_triples(text, svos_exp, lang_en): doc = lang_en(text) svos = list(extract.subject_verb_object_triples(doc)) @@ -228,7 +227,9 @@ def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp ) def test_direct_quotations(lang_en, text, exp): obs = list(extract.direct_quotations(lang_en(text))) - assert all(hasattr(dq, attr) for dq in obs for attr in ["speaker", "cue", "content"]) + assert all( + hasattr(dq, attr) for dq in obs for attr in ["speaker", "cue", "content"] + ) obs_text = [ ([tok.text for tok in speaker], [tok.text for tok in cue], content.text) for speaker, cue, content in obs @@ -259,35 +260,42 @@ def test_direct_quotations_spanish(lang_es, text, exp): @pytest.mark.parametrize( - "text, speakers", - [ - ( # tests that odd numbers of quotation marks can be parsed - """He approached the stable. "Where are the horses' carrots?" he asked.""", - ['he'] + "text, speakers", + [ + ( # tests that odd numbers of quotation marks can be parsed + """He approached the stable. "Where are the horses' carrots?" he asked.""", + ["he"], ), - ( # tests that overlapping quotes are ignored - "The stranger was eating a burger. He told everyone, \"This 'hamburger with extra cheese and pickles' is good.\"", - ['He'] + ( # tests that overlapping quotes are ignored + "The stranger was eating a burger. He told everyone, \"This 'hamburger with extra cheese and pickles' is good.\"", + ["He"], ), - ( # tests ending quotes at linebreaks - "\'uneasy\' on Gilmer street\"\nPolice are investigating a shooting. They did not identify the 17-year-old because he is a minor.\n\"Detectives are looking into it,\" Richmond police said in a news release Tuesday morning.", - ['Richmond', 'police'] + ( # tests parsing of quotes where linebreaks function as closing quotation marks + """Payroll taxes remain fully deductible under the new federal tax law. Similarly, Cuomo's charitable entities would allow taxpayers to pay taxes to local governments and school districts as charitable donations, which remain fully deductible.\n"We will maintain our wait-and-see approach to the state's SALT-mitigation plan," said Heather C. Briccetti, president and CEO of The Business Council of New York State. "In our own discussions with employers, we did not receive positive feedback on the payroll tax proposal, although we do appreciate that the final language made it optional.\n"The effect of the charitable giving gambit is ultimately dependent on IRS determination as to its deductibility," Briccetti said.\nThe New Yorkers who may benefit most from the state's plan are big earners who are important to state finances.""", + ['Heather', 'C.', 'Briccetti', 'Heather', 'C.', 'Briccetti', 'Briccetti'] ), - ( # tests second use of windower - 'He pounces on counters and jabs the gun at tellers, only 2 pounds of pressure preventing that gun from firing.\n"He\'s going to hurt somebody because he\'s carrying a revolver with the hammer cocked back. He\'s saying he\'s going to kill people," said Paul Martin, a Pinellas sheriff\'s detective who has been chasing the robber for two years. "He\'s pointed guns at people\'s heads."\nThey call him the crowbar robber.', - ['Paul', 'Martin', 'Paul', 'Martin'] + ( # tests ending quotes at linebreaks + '\'uneasy\' on Gilmer street"\nPolice are investigating a shooting. They did not identify the 17-year-old because he is a minor.\n"Detectives are looking into it," Richmond police said in a news release Tuesday morning.', + ["Richmond", "police"], ), - ( # checks that attributions don't leak over linebreaks (if they aren't supposed to) - "And despite perhaps sometimes seeming like superheroes, sandwich aritsts are just like everyone else in society. Conney said in order to be what the people need, the job requires them to \"do their best to deal with it and fix whatever problem there is at that moment.\"\nThe chief said being involved in violence is never easy for anyone, but added that a sandwich artists' work isn't necessarily done when they leave the scene.", - ['Conney'] + ( # tests second use of windower + "He pounces on counters and jabs the gun at tellers, only 2 pounds of pressure preventing that gun from firing.\n\"He's going to hurt somebody because he's carrying a revolver with the hammer cocked back. He's saying he's going to kill people,\" said Paul Martin, a Pinellas sheriff's detective who has been chasing the robber for two years. \"He's pointed guns at people's heads.\"\nThey call him the crowbar robber.", + ["Paul", "Martin", "Paul", "Martin"], ), - ( # find quotes in last sentences - "Garnier said he also learned a lesson from the ordeal.\n\"Think before you act,\" the clown said. \"Your actions have repercussions. No matter how trivial and joking I thought it was, people took it seriously.\"", - ['clown', 'clown'] - ) - ] - ) - + ( # checks that attributions don't leak over linebreaks (if they aren't supposed to) + "And despite perhaps sometimes seeming like superheroes, sandwich aritsts are just like everyone else in society. Conney said in order to be what the people need, the job requires them to \"do their best to deal with it and fix whatever problem there is at that moment.\"\nThe chief said being involved in violence is never easy for anyone, but added that a sandwich artists' work isn't necessarily done when they leave the scene.", + ["Conney"], + ), + ( # find quotes in last sentences + 'Garnier said he also learned a lesson from the ordeal.\n"Think before you act," the clown said. "Your actions have repercussions. No matter how trivial and joking I thought it was, people took it seriously."', + ["clown", "clown"], + ), + ], +) def test_adjustment_for_quote_detection(lang_en, text, speakers): quotes = extract.direct_quotations(lang_en(text)) - assert [speaker.text for quote in quotes for speaker in quote.speaker] == speakers + assert [ + speaker.text + for quote in quotes + for speaker in quote.speaker + ] == speakers