Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,13 @@
import org.apache.storm.utils.Utils;
import org.apache.stormcrawler.persistence.Status;
import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class ConfigurableTopology {

private static final Logger LOG = LoggerFactory.getLogger(ConfigurableTopology.class);

protected Config conf = new Config();

public static void start(ConfigurableTopology topology, String[] args) {
Expand Down Expand Up @@ -70,7 +74,7 @@ protected int submit(String name, Config conf, TopologyBuilder builder) {
try {
StormSubmitter.submitTopology(name, conf, builder.createTopology());
} catch (Exception e) {
e.printStackTrace();
LOG.error("Failed to submit topology: {}", name, e);
return -1;
}
return 0;
Expand Down
27 changes: 24 additions & 3 deletions core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java
Original file line number Diff line number Diff line change
Expand Up @@ -364,12 +364,26 @@ public synchronized FetchItemQueue getFetchItemQueue(String id, Metadata metadat
// custom crawl delay from metadata?
String v = metadata.getFirstValue(CRAWL_DELAY_KEY_NAME);
if (v != null) {
delay = Long.parseLong(v);
try {
delay = Long.parseLong(v);
} catch (NumberFormatException e) {
LOG.warn(
"Invalid crawl delay value '{}' in metadata for queue '{}', using default.",
v,
id);
}
}
// custom min crawl delay from metadata?
v = metadata.getFirstValue(CRAWL_MIN_DELAY_KEY_NAME);
if (v != null) {
minDelay = Long.parseLong(v);
try {
minDelay = Long.parseLong(v);
} catch (NumberFormatException e) {
LOG.warn(
"Invalid min crawl delay value '{}' in metadata for queue '{}', using default.",
v,
id);
}
}
}

Expand All @@ -388,7 +402,14 @@ public synchronized FetchItemQueue getFetchItemQueue(String id, Metadata metadat
if (metadata != null) {
final String val = metadata.getFirstValue(CRAWL_MAX_THREAD_KEY_NAME);
if (val != null) {
threadVal = Integer.parseInt(val);
try {
threadVal = Integer.parseInt(val);
} catch (NumberFormatException e) {
LOG.warn(
"Invalid max threads value '{}' in metadata for queue '{}', using default.",
val,
id);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ public static void main(String[] args) throws ParseException {
LOG.error("URL filtering threw exception", e);
}
} catch (IOException e) {
e.printStackTrace();
LOG.error("Failed to initialize URLFilters", e);
System.exit(-1);
}
System.exit(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,15 @@
package org.apache.stormcrawler.aws.bolt;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;

public class CloudSearchUtils {

private static MessageDigest digester;

private static final Pattern INVALID_XML_CHARS =
Pattern.compile("[^\\t\\n\\r -\\uD7FF\\uE000-\\uFFFD]");

static {
try {
digester = MessageDigest.getInstance("SHA-512");
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
}

private CloudSearchUtils() {}

/** Returns a normalised doc ID based on the URL of a document. */
Expand All @@ -51,8 +39,7 @@ public static String getID(String url) {
// letter or number and the following characters: _ - = # ; : / ? @
// &. Document IDs must be at least 1 and no more than 128
// characters long.
byte[] dig = digester.digest(url.getBytes(StandardCharsets.UTF_8));
String ID = Hex.encodeHexString(dig);
String ID = DigestUtils.sha512Hex(url.getBytes(StandardCharsets.UTF_8));
// defensive check only: a SHA-512 digest is 64 bytes, i.e. always exactly 128 hex chars
if (ID.length() > 128) {
throw new RuntimeException("ID larger than max 128 chars");
Expand Down Expand Up @@ -81,7 +68,7 @@ public static String cleanFieldName(String name) {
throw new RuntimeException("Field name must be between 3 and 64 chars : " + lowercase);
}
if (lowercase.equals("score")) {
throw new RuntimeException("Field name must be score");
throw new RuntimeException("Field name must NOT be score");
}
return lowercase;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.S3Object;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;
import org.apache.commons.io.IOUtils;
Expand Down Expand Up @@ -65,12 +64,7 @@ public void execute(Tuple tuple) {
Metadata metadata = (Metadata) tuple.getValueByField("metadata");

// normalises URL
String key = "";
try {
key = URLEncoder.encode(url, "UTF-8");
} catch (UnsupportedEncodingException e) {
// ignore it - we know UTF-8 is valid
}
String key = URLEncoder.encode(url, java.nio.charset.StandardCharsets.UTF_8);
// check size of the key
if (key.length() >= 1024) {
LOG.info("Key too large : {}", key);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import com.amazonaws.services.s3.model.PutObjectResult;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;
import org.apache.storm.task.OutputCollector;
Expand Down Expand Up @@ -94,12 +93,7 @@ public void execute(Tuple tuple) {
}

// normalises URL
String key = "";
try {
key = URLEncoder.encode(url, "UTF-8");
} catch (UnsupportedEncodingException e) {
// ignore it - we know UTF-8 is valid
}
String key = URLEncoder.encode(url, java.nio.charset.StandardCharsets.UTF_8);
// check size of the key
if (key.length() >= 1024) {
LOG.info("Key too large : {}", key);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.stormcrawler.parse.ParseFilter;
import org.apache.stormcrawler.parse.ParseResult;
import org.apache.stormcrawler.protocol.playwright.HttpProtocol;
import org.apache.stormcrawler.util.CharsetIdentification;
import org.jetbrains.annotations.NotNull;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
Expand Down Expand Up @@ -221,7 +222,15 @@ public void filter(
}

private String detectReason(final String url, final byte[] content, final ParseResult parse) {
final String html = new String(content, StandardCharsets.UTF_8);
final Metadata md = parse.get(url).getMetadata();
final String charsetName = CharsetIdentification.getCharsetFast(md, content, -1);
java.nio.charset.Charset cs;
try {
cs = java.nio.charset.Charset.forName(charsetName);
} catch (Exception e) {
cs = StandardCharsets.UTF_8;
}
final String html = new String(content, cs);

// 1. SPA framework fingerprints
for (final String fp : fingerprints) {
Expand Down