diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java
index 4e8241353..dd5ab6d1e 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java
@@ -139,14 +139,14 @@ public void head(Node node, int depth) {
 
         public void tail(Node node, int depth) {
             // make sure there is a space between block tags and immediately
-            // following text nodes <div>One</div>Two should be "One Two".
+            // following siblings <div>One</div>Two should be "One Two".
             if (node instanceof Element) {
                 Element element = (Element) node;
                 if (element == excluded) {
                     excluded = null;
                 }
                 if (element.isBlock()
-                        && (node.nextSibling() instanceof TextNode)
+                        && node.nextSibling() != null
                         && !lastCharIsWhitespace(accum)) {
                     accum.append(' ');
                 }
diff --git a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java
index eb0f0c44e..4f2cef1e3 100644
--- a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java
@@ -64,6 +64,35 @@ void testExclusionCase() throws IOException {
         assertEquals("the content of the page", text);
     }
 
+    @Test
+    void testBlockFollowedByInlineElement() {
+        Config conf = new Config();
+        JSoupTextExtractor extractor = new JSoupTextExtractor(conf);
+        // block element followed by an inline anchor — see #1925
+        String content =
+                "<div>"
+                        + "<div><p>Contact</p></div>"
+                        + "<a href=\"mailto:info@example.com\">info@example.com</a>"
+                        + "</div>";
+        Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com");
+        String text = extractor.text(jsoupDoc.body());
+        assertEquals("Contact info@example.com", text);
+    }
+
+    @Test
+    void testBlockFollowedByInlineSpan() {
+        Config conf = new Config();
+        JSoupTextExtractor extractor = new JSoupTextExtractor(conf);
+        String content =
+                "<div>"
+                        + "<div>Phone</div>"
+                        + "<span>555-0100</span>"
+                        + "</div>";
+        Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com");
+        String text = extractor.text(jsoupDoc.body());
+        assertEquals("Phone 555-0100", text);
+    }
+
     @Test
     void testTrimContent() throws IOException {
         Config conf = new Config();