diff --git a/core/src/main/java/org/apache/stormcrawler/ConfigurableTopology.java b/core/src/main/java/org/apache/stormcrawler/ConfigurableTopology.java index 539b391bd..1305be659 100644 --- a/core/src/main/java/org/apache/stormcrawler/ConfigurableTopology.java +++ b/core/src/main/java/org/apache/stormcrawler/ConfigurableTopology.java @@ -30,9 +30,13 @@ import org.apache.storm.utils.Utils; import org.apache.stormcrawler.persistence.Status; import org.apache.stormcrawler.util.ConfUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public abstract class ConfigurableTopology { + private static final Logger LOG = LoggerFactory.getLogger(ConfigurableTopology.class); + protected Config conf = new Config(); public static void start(ConfigurableTopology topology, String[] args) { @@ -70,7 +74,7 @@ protected int submit(String name, Config conf, TopologyBuilder builder) { try { StormSubmitter.submitTopology(name, conf, builder.createTopology()); } catch (Exception e) { - e.printStackTrace(); + LOG.error("Failed to submit topology: {}", name, e); return -1; } return 0; diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java index c534e14f2..2439bd76c 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java @@ -364,12 +364,26 @@ public synchronized FetchItemQueue getFetchItemQueue(String id, Metadata metadat // custom crawl delay from metadata? String v = metadata.getFirstValue(CRAWL_DELAY_KEY_NAME); if (v != null) { - delay = Long.parseLong(v); + try { + delay = Long.parseLong(v); + } catch (NumberFormatException e) { + LOG.warn( + "Invalid crawl delay value '{}' in metadata for queue '{}', using default.", + v, + id); + } } // custom min crawl delay from metadata? v = metadata.getFirstValue(CRAWL_MIN_DELAY_KEY_NAME); if (v != null) { - minDelay = Long.parseLong(v); + try { + minDelay = Long.parseLong(v); + } catch (NumberFormatException e) { + LOG.warn( + "Invalid min crawl delay value '{}' in metadata for queue '{}', using default.", + v, + id); + } } } @@ -388,7 +402,14 @@ public synchronized FetchItemQueue getFetchItemQueue(String id, Metadata metadat if (metadata != null) { final String val = metadata.getFirstValue(CRAWL_MAX_THREAD_KEY_NAME); if (val != null) { - threadVal = Integer.parseInt(val); + try { + threadVal = Integer.parseInt(val); + } catch (NumberFormatException e) { + LOG.warn( + "Invalid max threads value '{}' in metadata for queue '{}', using default.", + val, + id); + } } } diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java b/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java index 7453c9bf8..36e39c300 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/URLFilters.java @@ -210,7 +210,7 @@ public static void main(String[] args) throws ParseException { LOG.error("URL filtering threw exception", e); } } catch (IOException e) { - e.printStackTrace(); + LOG.error("Failed to initialize URLFilters", e); System.exit(-1); } System.exit(0); diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLFilterBase.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLFilterBase.java index c5edd34fc..4e6618879 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLFilterBase.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLFilterBase.java @@ -23,7 +23,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.Reader; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -80,24 +79,27 @@ private List readRules(ArrayNode rulesList) { private List readRules(String rulesFile) { List rules = new ArrayList<>(); - try { - InputStream regexStream = getClass().getClassLoader().getResourceAsStream(rulesFile); - Reader reader = new InputStreamReader(regexStream, StandardCharsets.UTF_8); - BufferedReader in = new BufferedReader(reader); - String line; - - while ((line = in.readLine()) != null) { - if (line.length() == 0) { - continue; - } - RegexRule rule = createRule(line); - if (rule != null) { - rules.add(rule); + try (InputStream regexStream = getClass().getClassLoader().getResourceAsStream(rulesFile)) { + if (regexStream == null) { + LOG.error("Regex filter file '{}' not found in classpath", rulesFile); + return rules; + } + try (BufferedReader in = + new BufferedReader( + new InputStreamReader(regexStream, StandardCharsets.UTF_8))) { + String line; + while ((line = in.readLine()) != null) { + if (line.length() == 0) { + continue; + } + RegexRule rule = createRule(line); + if (rule != null) { + rules.add(rule); + } } } } catch (IOException e) { - LOG.error("There was an error reading the default-regex-filters file"); - e.printStackTrace(); + LOG.error("There was an error reading the default-regex-filters file", e); } return rules; } diff --git a/core/src/main/java/org/apache/stormcrawler/parse/filter/DebugParseFilter.java b/core/src/main/java/org/apache/stormcrawler/parse/filter/DebugParseFilter.java index ae7e54d1f..b48047a30 100644 --- a/core/src/main/java/org/apache/stormcrawler/parse/filter/DebugParseFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/parse/filter/DebugParseFilter.java @@ -28,11 +28,15 @@ import org.apache.stormcrawler.parse.ParseResult; import org.apache.xml.serialize.XMLSerializer; import org.jetbrains.annotations.NotNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; /** Dumps the DOM representation of a document into a file. */ public class DebugParseFilter extends ParseFilter { + private static final Logger LOG = LoggerFactory.getLogger(DebugParseFilter.class); + private OutputStream os; @Override @@ -43,7 +47,7 @@ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult serializer.serialize(doc); os.flush(); } catch (IOException e) { - e.printStackTrace(); + LOG.error("Exception while serializing DOM", e); } } @@ -53,7 +57,7 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode File outFile = Files.createTempFile("DOMDump", ".xml").toFile(); os = FileUtils.openOutputStream(outFile); } catch (IOException e) { - e.printStackTrace(); + LOG.error("Exception while configuring DebugParseFilter", e); } } @@ -61,4 +65,15 @@ public void configure(@NotNull Map stormConf, @NotNull JsonNode public boolean needsDOM() { return true; } + + @Override + public void cleanup() { + if (os != null) { + try { + os.close(); + } catch (IOException e) { + LOG.error("Exception while closing output stream in DebugParseFilter", e); + } + } + } } diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java index 80ffc00d4..47504a649 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java @@ -24,8 +24,6 @@ import java.net.URL; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; -import java.text.SimpleDateFormat; -import java.util.Date; import java.util.Locale; import org.apache.commons.io.IOUtils; import org.apache.http.HttpHeaders; @@ -37,8 +35,9 @@ public class FileResponse { - static final SimpleDateFormat dateFormat = - new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); + static final java.time.format.DateTimeFormatter DATE_FORMATTER = + java.time.format.DateTimeFormatter.ofPattern("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US) + .withZone(java.time.ZoneId.systemDefault()); static final org.slf4j.Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -101,8 +100,8 @@ private void getFileAsHttpResponse(File file) { return; } - try { - content = IOUtils.toByteArray(new FileInputStream(file), size); + try (FileInputStream fis = new FileInputStream(file)) { + content = IOUtils.toByteArray(fis, size); } catch (IOException | IllegalArgumentException e) { LOG.error("Exception while fetching file response {} ", file.getPath(), e); statusCode = HttpStatus.SC_METHOD_FAILURE; @@ -122,7 +121,7 @@ private void getDirAsHttpResponse(File file) { } private static String formatDate(long date) { - return dateFormat.format(new Date(date)); + return DATE_FORMATTER.format(java.time.Instant.ofEpochMilli(date)); } private byte[] generateSitemap(File dir) { diff --git a/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java b/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java index c4b14c7bb..5592fd508 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java +++ b/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java @@ -18,8 +18,6 @@ package org.apache.stormcrawler.util; import java.net.URL; -import java.text.ParseException; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -30,8 +28,8 @@ /** Helper to extract cookies from cookies string. */ public class CookieConverter { - private static final SimpleDateFormat DATE_FORMAT = - new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.ENGLISH); + private static final org.slf4j.Logger LOG = + org.slf4j.LoggerFactory.getLogger(CookieConverter.class); /** * Get a list of cookies based on the cookies string taken from response header and the target @@ -110,17 +108,17 @@ public static List getCookies(String[] cookiesStrings, URL targetURL) { // check expiration if (expires != null) { try { - Date expirationDate = DATE_FORMAT.parse(expires); - cookie.setExpiryDate(expirationDate); - - // check that it hasn't expired? - if (cookie.isExpired(new Date())) { - continue; + Date expirationDate = org.apache.http.client.utils.DateUtils.parseDate(expires); + if (expirationDate != null) { + cookie.setExpiryDate(expirationDate); + + // check that it hasn't expired? + if (cookie.isExpired(new Date())) { + continue; + } } - - cookie.setExpiryDate(expirationDate); - } catch (ParseException e) { - // ignore exceptions + } catch (Exception e) { + LOG.debug("Could not parse cookie expiry date: {}", expires, e); } } diff --git a/core/src/main/java/org/apache/stormcrawler/util/RefreshTag.java b/core/src/main/java/org/apache/stormcrawler/util/RefreshTag.java index b4532d7fc..65953d72e 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/RefreshTag.java +++ b/core/src/main/java/org/apache/stormcrawler/util/RefreshTag.java @@ -28,8 +28,8 @@ // Utility class used to extract refresh tags from HTML pages public abstract class RefreshTag { - private static final Matcher MATCHER = - Pattern.compile("^.*;\\s*URL='?(.+?)'?$", Pattern.CASE_INSENSITIVE).matcher(""); + private static final Pattern PATTERN = + Pattern.compile("^.*;\\s*URL='?(.+?)'?$", Pattern.CASE_INSENSITIVE); private static final Evaluator EVALUATOR = QueryParser.parse("meta[http-equiv~=(?i)refresh][content]"); @@ -42,8 +42,9 @@ public static String extractRefreshURL(String value) { // 0;URL=http://www.apollocolors.com/site try { - if (MATCHER.reset(value).matches()) { - return MATCHER.group(1); + Matcher matcher = PATTERN.matcher(value); + if (matcher.matches()) { + return matcher.group(1); } } catch (Exception e) { } diff --git a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java index 06fb26d62..2a2fb442c 100644 --- a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java +++ b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java @@ -18,27 +18,15 @@ package org.apache.stormcrawler.aws.bolt; import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.Locale; import java.util.regex.Pattern; -import org.apache.commons.codec.binary.Hex; +import org.apache.commons.codec.digest.DigestUtils; public class CloudSearchUtils { - private static MessageDigest digester; - private static final Pattern INVALID_XML_CHARS = Pattern.compile("[^\\t\\n\\r -\\uD7FF\\uE000-\\uFFFD]"); - static { - try { - digester = MessageDigest.getInstance("SHA-512"); - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException(e); - } - } - private CloudSearchUtils() {} /** Returns a normalised doc ID based on the URL of a document * */ @@ -51,8 +39,7 @@ public static String getID(String url) { // letter or number and the following characters: _ - = # ; : / ? @ // &. Document IDs must be at least 1 and no more than 128 // characters long. - byte[] dig = digester.digest(url.getBytes(StandardCharsets.UTF_8)); - String ID = Hex.encodeHexString(dig); + String ID = DigestUtils.sha512Hex(url.getBytes(StandardCharsets.UTF_8)); // is that even possible? if (ID.length() > 128) { throw new RuntimeException("ID larger than max 128 chars"); @@ -81,7 +68,7 @@ public static String cleanFieldName(String name) { throw new RuntimeException("Field name must be between 3 and 64 chars : " + lowercase); } if (lowercase.equals("score")) { - throw new RuntimeException("Field name must be score"); + throw new RuntimeException("Field name must NOT be score"); } return lowercase; } diff --git a/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3CacheChecker.java b/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3CacheChecker.java index c560dc00c..5b7d996c3 100644 --- a/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3CacheChecker.java +++ b/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3CacheChecker.java @@ -19,7 +19,6 @@ import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.S3Object; -import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Map; import org.apache.commons.io.IOUtils; @@ -65,12 +64,7 @@ public void execute(Tuple tuple) { Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // normalises URL - String key = ""; - try { - key = URLEncoder.encode(url, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // ignore it - we know UTF-8 is valid - } + String key = URLEncoder.encode(url, java.nio.charset.StandardCharsets.UTF_8); // check size of the key if (key.length() >= 1024) { LOG.info("Key too large : {}", key); diff --git a/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3Cacher.java b/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3Cacher.java index ce1203949..8619ed3b6 100644 --- a/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3Cacher.java +++ b/external/aws/src/main/java/org/apache/stormcrawler/aws/s3/S3Cacher.java @@ -22,7 +22,6 @@ import com.amazonaws.services.s3.model.PutObjectResult; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Map; import org.apache.storm.task.OutputCollector; @@ -94,12 +93,7 @@ public void execute(Tuple tuple) { } // normalises URL - String key = ""; - try { - key = URLEncoder.encode(url, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // ignore it - we know UTF-8 is valid - } + String key = URLEncoder.encode(url, java.nio.charset.StandardCharsets.UTF_8); // check size of the key if (key.length() >= 1024) { LOG.info("Key too large : {}", key); diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java index 420b9a0bc..93d530029 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java @@ -184,9 +184,8 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except // https://github.com/microsoft/playwright-java#is-playwright-thread-safe synchronized (this) { - - // tracing// Start tracing before creating / navigating a page. - if (md.containsKey(MD_TRACE)) { + boolean isTracing = md.containsKey(MD_TRACE); + if (isTracing) { context.tracing() .start( new Tracing.StartOptions() @@ -201,109 +200,120 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except final MutableInt status = new MutableInt(-1); byte[] content = new byte[0]; - try (Page page = context.newPage()) { - - page.onResponse( - response -> { - // make sure that this applies to the main page - if (response.url().equals(url)) { - // redirection? - if (Status.REDIRECTION.equals( - Status.fromHTTPCode(response.status()))) { - status.set(response.status()); - response.allHeaders() - .forEach( - (k, v) -> { - responseMetaData.addValue(k, v); - }); + try { + try (Page page = context.newPage()) { + + page.onResponse( + response -> { + // make sure that this applies to the main page + if (response.url().equals(url)) { + // redirection? + if (Status.REDIRECTION.equals( + Status.fromHTTPCode(response.status()))) { + status.set(response.status()); + response.allHeaders() + .forEach( + (k, v) -> { + responseMetaData.addValue(k, v); + }); + } } - } - }); - - page.onPageError( - handler -> { - // this applies to any resource - not just the main page - LOG.debug("Error when loading {} {}", url, handler); - }); - - // NOTE: The handler will only be called for the first url if the - // response is a redirect. - page.route( - lambdaUrl -> true, - route -> { - // abort if we know the main page is a redirection - if (status.get() != -1) { - LOG.debug("Aborting request for {}", route.request().url()); - route.abort(); - } else if (resourceTypesToSkip.contains( - route.request().resourceType())) { - route.abort(); - } else { - route.resume(); - } - }); - - // let playwright do the content loading - com.microsoft.playwright.Response response = - page.navigate( - url, - new Page.NavigateOptions() - .setTimeout(timeout) - .setWaitUntil(loadEvent)); - - // the status is not set unless - // a redirection - if (status.get() == -1) { - response.allHeaders() - .forEach( - (k, v) -> { - responseMetaData.addValue(k, v); - }); - - int httpStatus = response.status(); - boolean fetched = Status.FETCHED == Status.fromHTTPCode(httpStatus); - boolean contentCaptured = false; - - if (fetched || captureContentOnError) { - // run any configured post-navigate actions before capturing content - pageActions.apply(page, url, md, responseMetaData); - // retrieve the rendered content - content = page.content().getBytes(StandardCharsets.UTF_8); - contentCaptured = true; - } + }); + + page.onPageError( + handler -> { + // this applies to any resource - not just the main page + LOG.debug("Error when loading {} {}", url, handler); + }); + + // NOTE: The handler will only be called for the first url if the + // response is a redirect. + page.route( + lambdaUrl -> true, + route -> { + // abort if we know the main page is a redirection + if (status.get() != -1) { + LOG.debug("Aborting request for {}", route.request().url()); + route.abort(); + } else if (resourceTypesToSkip.contains( + route.request().resourceType())) { + route.abort(); + } else { + route.resume(); + } + }); + + // let playwright do the content loading + com.microsoft.playwright.Response response = + page.navigate( + url, + new Page.NavigateOptions() + .setTimeout(timeout) + .setWaitUntil(loadEvent)); + + // the status is not set unless + // a redirection + if (status.get() == -1) { + response.allHeaders() + .forEach( + (k, v) -> { + responseMetaData.addValue(k, v); + }); + + int httpStatus = response.status(); + boolean fetched = Status.FETCHED == Status.fromHTTPCode(httpStatus); + boolean contentCaptured = false; + + if (fetched || captureContentOnError) { + // run any configured post-navigate actions before capturing content + pageActions.apply(page, url, md, responseMetaData); + // retrieve the rendered content + content = page.content().getBytes(StandardCharsets.UTF_8); + contentCaptured = true; + } - if (!fetched && contentCaptured && overrideStatusOnContent) { - // expose the original origin status for diagnostics - responseMetaData.setValue( - "playwright.origin.status", Integer.toString(httpStatus)); - status.set(200); - } else { - status.set(httpStatus); - } + if (!fetched && contentCaptured && overrideStatusOnContent) { + // expose the original origin status for diagnostics + responseMetaData.setValue( + "playwright.origin.status", Integer.toString(httpStatus)); + status.set(200); + } else { + status.set(httpStatus); + } - // evaluate an expression and store the results - // in the metadata using the same string as key - for (String expression : evaluations) { - Object performance = page.evaluate(expression); - if (performance != null) { - String json = - mapper.writerWithDefaultPrettyPrinter() - .writeValueAsString(performance); - responseMetaData.setValue(expression, json); + // evaluate an expression and store the results + // in the metadata using the same string as key + for (String expression : evaluations) { + Object performance = page.evaluate(expression); + if (performance != null) { + String json = + mapper.writerWithDefaultPrettyPrinter() + .writeValueAsString(performance); + responseMetaData.setValue(expression, json); + } } } } - } - if (md.containsKey(MD_TRACE)) { - Path tmp = Files.createTempFile("trace-", ".zip", new FileAttribute[0]); - context.tracing().stop(new Tracing.StopOptions().setPath(tmp)); - responseMetaData.setValue(MD_TRACE, tmp.toString()); - } + if (isTracing) { + Path tmp = Files.createTempFile("trace-", ".zip", new FileAttribute[0]); + context.tracing().stop(new Tracing.StopOptions().setPath(tmp)); + responseMetaData.setValue(MD_TRACE, tmp.toString()); + } - responseMetaData.addValue(MD_KEY_END, Instant.now().toString()); + responseMetaData.addValue(MD_KEY_END, Instant.now().toString()); - return new ProtocolResponse(content, status.get(), responseMetaData); + return new ProtocolResponse(content, status.get(), responseMetaData); + + } finally { + if (isTracing && responseMetaData.getFirstValue(MD_TRACE) == null) { + try { + context.tracing().stop(new Tracing.StopOptions()); + } catch (Exception e) { + LOG.warn("Exception while stopping tracing on error", e); + } + } + } } } diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/parsefilter/JsRenderingDetector.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/parsefilter/JsRenderingDetector.java index 8be245e81..23c1a267d 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/parsefilter/JsRenderingDetector.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/parsefilter/JsRenderingDetector.java @@ -28,6 +28,7 @@ import org.apache.stormcrawler.parse.ParseFilter; import org.apache.stormcrawler.parse.ParseResult; import org.apache.stormcrawler.protocol.playwright.HttpProtocol; +import org.apache.stormcrawler.util.CharsetIdentification; import org.jetbrains.annotations.NotNull; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; @@ -221,7 +222,15 @@ public void filter( } private String detectReason(final String url, final byte[] content, final ParseResult parse) { - final String html = new String(content, StandardCharsets.UTF_8); + final Metadata md = parse.get(url).getMetadata(); + final String charsetName = CharsetIdentification.getCharsetFast(md, content, -1); + java.nio.charset.Charset cs; + try { + cs = java.nio.charset.Charset.forName(charsetName); + } catch (Exception e) { + cs = StandardCharsets.UTF_8; + } + final String html = new String(content, cs); // 1. SPA framework fingerprints for (final String fp : fingerprints) {