diff --git a/src/textacy/constants.py b/src/textacy/constants.py
index 416c1362..0ee360d1 100644
--- a/src/textacy/constants.py
+++ b/src/textacy/constants.py
@@ -21,6 +21,50 @@
 OBJ_DEPS: set[str] = {"attr", "dobj", "dative", "oprd"}
 AUX_DEPS: set[str] = {"aux", "auxpass", "neg"}
 
+QUOTATION_MARK_PAIRS = {
+    # """
+    # Ordinal points of the token.is_quote characters, matched up by start and end.
+    # Some of these pairs are from weirdly formatted newspaper uploads, so could be some noise!
+    # source:
+    # either = "\"\'"
+    # start = "“‘```“‘«‹「『„‚"
+    # end = "”’’’’”’»›」』”’"
+    # """
+    (34, 34),  # " "
+    (39, 39),  # ' '
+    (96, 8217),  # ` ’
+    (171, 187),  # « »
+    (8216, 8217),  # ‘ ’
+    (8218, 8217),  # ‚ ’
+    (8220, 8221),  # “ ”
+    (8222, 8221),  # „ ”
+    (8249, 8250),  # ‹ ›
+    (12300, 12301),  # 「 」
+    (12302, 12303),  # 『 』
+    (8220, 34),  # “ "
+    (8216, 34),  # ‘ "
+    (96, 34),  # ` "
+    (8216, 34),  # ‘ "
+    (171, 34),  # « "
+    (8249, 34),  # ‹ "
+    (12300, 34),  # 「 "
+    (12302, 34),  # 『 "
+    (8222, 34),  # „ "
+    (8218, 34),  # ‚ "
+    (34, 8221),  # " ”
+    (34, 8217),  # " ’
+    (34, 10),  # " \n
+    (39, 10),  # ' \n
+    (96, 10),  # ` \n
+    (171, 10),  # « \n
+    (8216, 10),  # ‘ \n
+    (8218, 10),  # ‚ \n
+    (8220, 10),  # “ \n
+    (8249, 10),  # ‹ \n
+    (12300, 10),  # 「 \n
+    (12302, 10),  # 『 \n
+}
+
 REPORTING_VERBS: dict[str, set[str]] = {
     "en": {
         "according",
@@ -196,3 +240,9 @@
 )
 
 RE_ALNUM: Pattern = re.compile(r"[^\W_]+")
+
+# regexes for quote detection prep
+ALL_QUOTES = "‹「`»」‘\"„›”‚’'』『«“"
+DOUBLE_QUOTES = '‹「」»"„『”‚』›«“'
+ANY_DOUBLE_QUOTE_REGEX = r"[{}]".format(DOUBLE_QUOTES)
+DOUBLE_QUOTES_NOSPACE_REGEX = r"(?<=\S)([{}])(?=\S)".format(DOUBLE_QUOTES)
diff --git a/src/textacy/extract/triples.py b/src/textacy/extract/triples.py
index f004d31d..5a4d8454 100644
--- a/src/textacy/extract/triples.py
+++ b/src/textacy/extract/triples.py
@@ -9,12 +9,12 @@
 
 import collections
 from operator import attrgetter
-from typing import Iterable, Mapping, Optional, Pattern
+from typing import Iterable, Mapping, Optional, Pattern, Literal
 
-from cytoolz import itertoolz
 from spacy.symbols import (
     AUX,
     VERB,
+    PUNCT,
     agent,
     attr,
     aux,
@@ -30,6 +30,7 @@
     xcomp,
 )
 from spacy.tokens import Doc, Span, Token
+import re
 
 from .. import constants, types, utils
 from . import matches
@@ -202,13 +203,14 @@ def semistructured_statements(
                 )
 
 
-def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
+def direct_quotations(doc: Doc, min_quote_length: int = 4) -> Iterable[DQTriple]:
     """
     Extract direct quotations with an attributable speaker from a document
     using simple rules and patterns. Does not extract indirect or mixed quotations!
 
     Args:
         doc
+        min_quote_length - minimum number of whitespace-delimited words in the quoted text; shorter candidates are skipped.
 
     Yields:
         Next direct quotation in ``doc`` as a (speaker, cue, content) triple.
@@ -217,7 +219,6 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
         Loosely inspired by Krestel, Bergler, Witte. "Minding the Source: Automatic
         Tagging of Reported Speech in Newspaper Articles".
     """
-    # TODO: train a model to do this instead, maybe similar to entity recognition
     try:
         _reporting_verbs = constants.REPORTING_VERBS[doc.lang_]
     except KeyError:
@@ -225,77 +226,81 @@ def direct_quotations(doc: Doc) -> Iterable[DQTriple]:
             f"direct quotation extraction is not implemented for lang='{doc.lang_}', "
             f"only {sorted(constants.REPORTING_VERBS.keys())}"
         )
-    qtok_idxs = [tok.i for tok in doc if tok.is_quote]
-    if len(qtok_idxs) % 2 != 0:
-        raise ValueError(
-            f"{len(qtok_idxs)} quotation marks found, indicating an unclosed quotation; "
-            "given the limitations of this method, it's safest to bail out "
-            "rather than guess which quotation is unclosed"
-        )
-    qtok_pair_idxs = list(itertoolz.partition(2, qtok_idxs))
-    for qtok_start_idx, qtok_end_idx in qtok_pair_idxs:
-        content = doc[qtok_start_idx : qtok_end_idx + 1]
+    # pairs up quotation-like characters based on acceptable start/end combos
+    # see constants for more info
+    qtoks = [tok for tok in doc if tok.is_quote or (re.match(r"\n", tok.text))]
+    qtok_idx_pairs = [(-1, -1)]
+    for n, q in enumerate(qtoks):
+        if (
+            not bool(q.whitespace_)
+            and q.i not in [q_[1] for q_ in qtok_idx_pairs]
+            and q.i > qtok_idx_pairs[-1][1]
+        ):
+            for q_ in qtoks[n + 1 :]:
+                if (ord(q.text[0]), ord(q_.text[0])) in constants.QUOTATION_MARK_PAIRS:
+                    qtok_idx_pairs.append((q.i, q_.i))
+                    break
+    qtok_idx_pairs = qtok_idx_pairs[1:]
+
+    def filter_quote_tokens(tok):
+        return any(qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_idx_pairs)
+
+    for qtok_start_idx, qtok_end_idx in qtok_idx_pairs:
+        content = doc[qtok_start_idx:qtok_end_idx]
         cue = None
         speaker = None
-        # filter quotations by content
+
         if (
-            # quotations should have at least a couple tokens
-            # excluding the first/last quotation mark tokens
-            len(content) < 4
+            len(content.text.split()) < min_quote_length
             # filter out titles of books and such, if possible
-            or all(
-                tok.is_title
-                for tok in content
-                # if tok.pos in {NOUN, PROPN}
-                if not (tok.is_punct or tok.is_stop)
-            )
-            # TODO: require closing punctuation before the quotation mark?
-            # content[-2].is_punct is False
+            or all(tok.is_title for tok in content if not (tok.is_punct or tok.is_stop))
         ):
             continue
-        # get window of adjacent/overlapping sentences
-        window_sents = (
-            sent
-            for sent in doc.sents
-            # these boundary cases are a subtle bit of work...
-            if (
-                (sent.start < qtok_start_idx and sent.end >= qtok_start_idx - 1)
-                or (sent.start <= qtok_end_idx + 1 and sent.end > qtok_end_idx)
-            )
-        )
-        # get candidate cue verbs in window
-        cue_cands = [
-            tok
-            for sent in window_sents
-            for tok in sent
-            if (
-                tok.pos == VERB
+
+        for window_sents in [
+            windower(content, "overlap"),
+            windower(content, "linebreaks"),
+        ]:
+            # get candidate cue verbs in window
+            cue_candidates = [
+                tok
+                for sent in window_sents
+                for tok in sent
+                if tok.pos == VERB
                 and tok.lemma_ in _reporting_verbs
-                # cue verbs must occur *outside* any quotation content
-                and not any(
-                    qts_idx <= tok.i <= qte_idx for qts_idx, qte_idx in qtok_pair_idxs
-                )
+                and not filter_quote_tokens(tok)
+            ]
+            cue_candidates = sorted(
+                cue_candidates,
+                key=lambda cc: min(
+                    abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)
+                ),
             )
-        ]
-        # sort candidates by proximity to quote content
-        cue_cands = sorted(
-            cue_cands,
-            key=lambda cc: min(abs(cc.i - qtok_start_idx), abs(cc.i - qtok_end_idx)),
-        )
-        for cue_cand in cue_cands:
-            if cue is not None:
-                break
-            for speaker_cand in cue_cand.children:
-                if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
-                    cue = expand_verb(cue_cand)
-                    speaker = expand_noun(speaker_cand)
+            for cue_cand in cue_candidates:
+                if cue is not None:
                     break
-        if content and cue and speaker:
-            yield DQTriple(
-                speaker=sorted(speaker, key=attrgetter("i")),
-                cue=sorted(cue, key=attrgetter("i")),
-                content=content,
-            )
+                speaker_cands = [
+                    speaker_cand
+                    for speaker_cand in cue_cand.children
+                    if speaker_cand.pos != PUNCT
+                    and not filter_quote_tokens(speaker_cand)
+                    and (
+                        (speaker_cand.i >= qtok_end_idx)
+                        or (speaker_cand.i <= qtok_start_idx)
+                    )
+                ]
+                for speaker_cand in speaker_cands:
+                    if speaker_cand.dep in _ACTIVE_SUBJ_DEPS:
+                        cue = expand_verb(cue_cand)
+                        speaker = expand_noun(speaker_cand)
+                        break
+            if content and cue and speaker:
+                yield DQTriple(
+                    speaker=sorted(speaker, key=attrgetter("i")),
+                    cue=sorted(cue, key=attrgetter("i")),
+                    content=doc[qtok_start_idx : qtok_end_idx + 1],
+                )
+                break
 
 
 def expand_noun(tok: Token) -> list[Token]:
@@ -305,7 +310,6 @@ def expand_noun(tok: Token) -> list[Token]:
         child
         for tc in tok_and_conjuncts
         for child in tc.children
-        # TODO: why doesn't compound import from spacy.symbols?
         if child.dep_ == "compound"
     ]
     return tok_and_conjuncts + compounds
@@ -317,3 +321,107 @@ def expand_verb(tok: Token) -> list[Token]:
         child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
     ]
     return [tok] + verb_modifiers
+
+
+def windower(quote: Span, method: Literal["overlap", "linebreaks"]) -> Iterable[Span]:
+    """
+    Finds the range of sentences in which to look for quote attribution.
+
+    3 ways:
+    - "overlap": any sentences that overlap with the quote span
+    - "linebreaks": overlap sentences +/- one sentence, without crossing linebreaks after the quote
+    - anything else: overlap sentences +/- one sentence
+
+    Input:
+        quote (Span) - quote to be attributed
+        method (str) - how the sentence range will be determined
+
+    Output:
+        sents (list) - list of sentences
+    """
+    if method == "overlap":
+        return [
+            sent
+            for sent in quote.doc.sents
+            if (sent.start < quote.start < sent.end)
+            or (sent.start < quote.end < sent.end)
+        ]
+    else:
+        sent_indexes = [
+            n
+            for n, s in enumerate(quote.doc.sents)
+            if (s.start <= quote.start <= s.end) or (s.start <= quote.end <= s.end)
+        ]
+
+        i_sent = sent_indexes[0] - 1 if sent_indexes[0] > 0 else 0
+        j_sent = sent_indexes[-1] + 2
+        sents = list(quote.doc.sents)[i_sent:j_sent]
+        if method == "linebreaks":
+            linebreaks = (
+                [0]
+                + [tok.i for tok in quote.doc if re.match(r"\n", tok.text)]
+                + [quote.doc[-1].i]
+            )
+            linebreak_limits = [
+                lb for lb in linebreaks if sents[0].start < lb <= quote.end + 1
+            ]
+            if linebreak_limits:
+                return [s for s in sents if s.end <= max(linebreak_limits)]
+        return sents
+
+
+def prep_text_for_quote_detection(t: str, fix_plural_possessives: bool = True) -> str:
+    """
+    Sorts out some common issues that trip up the quote detector. Works best one paragraph at a time -- use prep_document_for_quote_detection for the whole doc.
+
+    - replaces consecutive apostrophes with a double quote (no idea why this happens but it does)
+    - adds spaces before or after double quotes that don't have them
+    - if enabled, fixes plural possessives by adding an "x", because the hanging apostrophe can trigger quote detection.
+    - adds a double quote to the end of paragraphs that are continuations of quotes and thus traditionally don't end with quotation marks
+
+    Input:
+        t (str) - text to be prepped, preferably one paragraph
+        fix_plural_possessives (bool) - enables fix_plural_possessives
+
+    Output:
+        t (str) - text prepped for quote detection
+    """
+    if not t:
+        return ""
+
+    t = t.replace("''", '"')
+    if fix_plural_possessives:
+        t = re.sub(r"(.{3,8}s\')(\s)", r"\1x\2", t)
+    while re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, t):
+        match = re.search(constants.DOUBLE_QUOTES_NOSPACE_REGEX, t)
+        if (
+            len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[: match.start()])) % 2
+            != 0
+        ):
+            replacer = '" '
+        else:
+            replacer = ' "'
+        t = t[: match.start()] + replacer + t[match.end() :]
+    if (
+        not (t[0] == "'" and t[-1] == "'")
+        and t[0] in constants.ALL_QUOTES
+        and len(re.findall(constants.ANY_DOUBLE_QUOTE_REGEX, t[1:])) % 2 == 0
+    ):
+        t += '"'
+    return t.strip()
+
+
+def prep_document_for_quote_detection(t: str, para_char: str = "\n") -> str:
+    """
+    Splits text into paragraphs (on para_char), runs prep_text_for_quote_detection on all paragraphs, then reassembles with para_char.
+
+    Input:
+        t (str) - document to prep for quote detection
+        para_char (str) - paragraph boundary in t
+
+    Output:
+        document prepped for quote detection
+    """
+    return para_char.join(
+        [prep_text_for_quote_detection(t) for t in t.split(para_char) if t]
+    )
diff --git a/tests/extract/test_triples.py b/tests/extract/test_triples.py
index 85c3f2d6..f9696632 100644
--- a/tests/extract/test_triples.py
+++ b/tests/extract/test_triples.py
@@ -191,8 +191,8 @@ def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp
     "text, exp",
     [
         (
-            'Burton said, "I love cats!"',
-            [(["Burton"], ["said"], '"I love cats!"')],
+            'Burton said, "I love those cats!"',
+            [(["Burton"], ["said"], '"I love those cats!"')],
         ),
         # NOTE: this case is failing as of spacy v3.2
         # let's hide it for now so that tests pass overall
@@ -201,14 +201,14 @@ def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp
         #     [(["Burton", "Nick"], ["reply"], '"We love cats!"')],
         # ),
         (
-            'Burton explained from a podium. "I love cats," he said.',
-            [(["he"], ["said"], '"I love cats,"')],
+            'Burton explained from a podium. "I love those cats," he said.',
+            [(["he"], ["said"], '"I love those cats,"')],
         ),
         (
-            '"I love cats!" insists Burton. "I absolutely do."',
+            '"I love those cats!" insists Burton. "Yeah, I absolutely do."',
             [
-                (["Burton"], ["insists"], '"I love cats!"'),
-                (["Burton"], ["insists"], '"I absolutely do."'),
+                (["Burton"], ["insists"], '"I love those cats!"'),
+                (["Burton"], ["insists"], '"Yeah, I absolutely do."'),
             ],
         ),
         (
@@ -227,7 +227,9 @@ def test_semistructured_statements(sss_doc, entity, cue, fragment_len_range, exp
 )
 def test_direct_quotations(lang_en, text, exp):
     obs = list(extract.direct_quotations(lang_en(text)))
-    assert all(hasattr(dq, attr) for dq in obs for attr in ["speaker", "cue", "content"])
+    assert all(
+        hasattr(dq, attr) for dq in obs for attr in ["speaker", "cue", "content"]
+    )
     obs_text = [
         ([tok.text for tok in speaker], [tok.text for tok in cue], content.text)
         for speaker, cue, content in obs
@@ -255,3 +257,45 @@ def test_direct_quotations_spanish(lang_es, text, exp):
         for speaker, cue, content in obs
     ]
     assert obs_text == exp
+
+
+@pytest.mark.parametrize(
+    "text, speakers",
+    [
+        (  # tests that odd numbers of quotation marks can be parsed
+            """He approached the stable. "Where are the horses' carrots?" he asked.""",
+            ["he"],
+        ),
+        (  # tests that overlapping quotes are ignored
+            "The stranger was eating a burger. He told everyone, \"This 'hamburger with extra cheese and pickles' is good.\"",
+            ["He"],
+        ),
+        (  # tests parsing of quotes where linebreaks function as closing quotation marks
+            """Payroll taxes remain fully deductible under the new federal tax law. Similarly, Cuomo's charitable entities would allow taxpayers to pay taxes to local governments and school districts as charitable donations, which remain fully deductible.\n"We will maintain our wait-and-see approach to the state's SALT-mitigation plan," said Heather C. Briccetti, president and CEO of The Business Council of New York State. "In our own discussions with employers, we did not receive positive feedback on the payroll tax proposal, although we do appreciate that the final language made it optional.\n"The effect of the charitable giving gambit is ultimately dependent on IRS determination as to its deductibility," Briccetti said.\nThe New Yorkers who may benefit most from the state's plan are big earners who are important to state finances.""",
+            ['Heather', 'C.', 'Briccetti', 'Heather', 'C.', 'Briccetti', 'Briccetti']
+        ),
+        (  # tests ending quotes at linebreaks
+            '\'uneasy\' on Gilmer street"\nPolice are investigating a shooting. They did not identify the 17-year-old because he is a minor.\n"Detectives are looking into it," Richmond police said in a news release Tuesday morning.',
+            ["Richmond", "police"],
+        ),
+        (  # tests second use of windower
+            "He pounces on counters and jabs the gun at tellers, only 2 pounds of pressure preventing that gun from firing.\n\"He's going to hurt somebody because he's carrying a revolver with the hammer cocked back. He's saying he's going to kill people,\" said Paul Martin, a Pinellas sheriff's detective who has been chasing the robber for two years. \"He's pointed guns at people's heads.\"\nThey call him the crowbar robber.",
+            ["Paul", "Martin", "Paul", "Martin"],
+        ),
+        (  # checks that attributions don't leak over linebreaks (if they aren't supposed to)
+            "And despite perhaps sometimes seeming like superheroes, sandwich aritsts are just like everyone else in society. Conney said in order to be what the people need, the job requires them to \"do their best to deal with it and fix whatever problem there is at that moment.\"\nThe chief said being involved in violence is never easy for anyone, but added that a sandwich artists' work isn't necessarily done when they leave the scene.",
+            ["Conney"],
+        ),
+        (  # find quotes in last sentences
+            'Garnier said he also learned a lesson from the ordeal.\n"Think before you act," the clown said. "Your actions have repercussions. No matter how trivial and joking I thought it was, people took it seriously."',
+            ["clown", "clown"],
+        ),
+    ],
+)
+def test_adjustment_for_quote_detection(lang_en, text, speakers):
+    quotes = extract.direct_quotations(lang_en(text))
+    assert [
+        speaker.text
+        for quote in quotes
+        for speaker in quote.speaker
+    ] == speakers