From 79fa9998cb1cb3a8129a412aa810811f68772c19 Mon Sep 17 00:00:00 2001 From: Nico Scandolo Date: Tue, 2 Jun 2026 12:34:30 -0300 Subject: [PATCH 1/2] Fix missing space between block and inline sibling elements in JSoupTextExtractor Fixes #1925 --- .../parse/JSoupTextExtractor.java | 2 +- .../parse/JSoupTextExtractorTest.java | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java index 4e8241353..a58d09900 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java @@ -146,7 +146,7 @@ public void tail(Node node, int depth) { excluded = null; } if (element.isBlock() - && (node.nextSibling() instanceof TextNode) + && node.nextSibling() != null && !lastCharIsWhitespace(accum)) { accum.append(' '); } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java index eb0f0c44e..75ef899e4 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java @@ -64,6 +64,36 @@ void testExclusionCase() throws IOException { assertEquals("the content of the page", text); } + @Test + void testBlockFollowedByInlineElement() { + Config conf = new Config(); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); + // Real-world case from https://www.acai-island.com/contact where + // "Email" and the address were concatenated as "Emailacaiisland1300@gmail.com" + String content = + "
" + + "

Email

" + + "acaiisland1300@gmail.com" + + "
"; + Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com"); + String text = extractor.text(jsoupDoc.body()); + assertEquals("Email acaiisland1300@gmail.com", text); + } + + @Test + void testBlockFollowedByInlineSpan() { + Config conf = new Config(); + JSoupTextExtractor extractor = new JSoupTextExtractor(conf); + String content = + "
" + + "
Phone
" + + "(631) 656-0088" + + "
"; + Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com"); + String text = extractor.text(jsoupDoc.body()); + assertEquals("Phone (631) 656-0088", text); + } + @Test void testTrimContent() throws IOException { Config conf = new Config(); From 2b3d08f6a0f8c18f72b749d7efd49d57a0ee3a68 Mon Sep 17 00:00:00 2001 From: Nico Scandolo Date: Tue, 2 Jun 2026 14:56:27 -0300 Subject: [PATCH 2/2] Address review: anonymize test data and update tail() comment --- .../stormcrawler/parse/JSoupTextExtractor.java | 2 +- .../stormcrawler/parse/JSoupTextExtractorTest.java | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java index a58d09900..dd5ab6d1e 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java @@ -139,7 +139,7 @@ public void head(Node node, int depth) { public void tail(Node node, int depth) { // make sure there is a space between block tags and immediately - // following text nodes
<div>One</div>
Two should be "One Two". + // following siblings
<div>One</div>
Two should be "One Two". if (node instanceof Element) { Element element = (Element) node; if (element == excluded) { diff --git a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java index 75ef899e4..4f2cef1e3 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java @@ -68,16 +68,15 @@ void testExclusionCase() throws IOException { void testBlockFollowedByInlineElement() { Config conf = new Config(); JSoupTextExtractor extractor = new JSoupTextExtractor(conf); - // Real-world case from https://www.acai-island.com/contact where - // "Email" and the address were concatenated as "Emailacaiisland1300@gmail.com" + // block element followed by an inline anchor — see #1925 String content = "
" - + "

Email

" - + "acaiisland1300@gmail.com" + + "

Contact

" + + "info@example.com" + "
"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com"); String text = extractor.text(jsoupDoc.body()); - assertEquals("Email acaiisland1300@gmail.com", text); + assertEquals("Contact info@example.com", text); } @Test @@ -87,11 +86,11 @@ void testBlockFollowedByInlineSpan() { String content = "
" + "
Phone
" - + "(631) 656-0088" + + "555-0100" + "
"; Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com"); String text = extractor.text(jsoupDoc.body()); - assertEquals("Phone (631) 656-0088", text); + assertEquals("Phone 555-0100", text); } @Test