diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java
index 4e8241353..dd5ab6d1e 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupTextExtractor.java
@@ -139,14 +139,14 @@ public void head(Node node, int depth) {
public void tail(Node node, int depth) {
// make sure there is a space between block tags and immediately
- // following text nodes <div>One</div>Two should be "One Two".
+ // following siblings <div>One</div><a>Two</a> should be "One Two".
if (node instanceof Element) {
Element element = (Element) node;
if (element == excluded) {
excluded = null;
}
if (element.isBlock()
- && (node.nextSibling() instanceof TextNode)
+ && node.nextSibling() != null
&& !lastCharIsWhitespace(accum)) {
accum.append(' ');
}
diff --git a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java
index eb0f0c44e..4f2cef1e3 100644
--- a/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/parse/JSoupTextExtractorTest.java
@@ -64,6 +64,35 @@ void testExclusionCase() throws IOException {
assertEquals("the content of the page", text);
}
+    /**
+     * Checks that text from a block element and the text of an inline anchor that
+     * immediately follows it are separated by a single space in the extracted text.
+     */
+    @Test
+    void testBlockFollowedByInlineElement() {
+        Config conf = new Config();
+        JSoupTextExtractor extractor = new JSoupTextExtractor(conf);
+        // block element followed by an inline anchor — see #1925
+        // The <div> has an element (not a text node) as its next sibling, so the
+        // separator must come from the "next sibling is not null" check.
+        String content =
+                "<html><body><div>Contact</div>"
+                        + "<a href=\"mailto:info@example.com\">info@example.com</a>"
+                        + "</body></html>";
+        Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com");
+        String text = extractor.text(jsoupDoc.body());
+        assertEquals("Contact info@example.com", text);
+    }
+
+    /**
+     * Checks that text from a block element and the text of an inline span that
+     * immediately follows it are separated by a single space in the extracted text.
+     */
+    @Test
+    void testBlockFollowedByInlineSpan() {
+        Config conf = new Config();
+        JSoupTextExtractor extractor = new JSoupTextExtractor(conf);
+        // No whitespace between the closing </div> and the opening <span>: any
+        // space in the expected output has to be inserted by the extractor itself.
+        String content =
+                "<html><body>"
+                        + "<div>Phone</div>"
+                        + "<span>555-0100</span>"
+                        + "</body></html>";
+        Document jsoupDoc = Parser.htmlParser().parseInput(content, "http://example.com");
+        String text = extractor.text(jsoupDoc.body());
+        assertEquals("Phone 555-0100", text);
+    }
+
@Test
void testTrimContent() throws IOException {
Config conf = new Config();