Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
Expand Down Expand Up @@ -80,24 +79,27 @@ private List<RegexRule> readRules(ArrayNode rulesList) {
private List<RegexRule> readRules(String rulesFile) {
List<RegexRule> rules = new ArrayList<>();

try {
InputStream regexStream = getClass().getClassLoader().getResourceAsStream(rulesFile);
Reader reader = new InputStreamReader(regexStream, StandardCharsets.UTF_8);
BufferedReader in = new BufferedReader(reader);
String line;

while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
}
RegexRule rule = createRule(line);
if (rule != null) {
rules.add(rule);
try (InputStream regexStream = getClass().getClassLoader().getResourceAsStream(rulesFile)) {
if (regexStream == null) {
LOG.error("Regex filter file '{}' not found in classpath", rulesFile);
return rules;
}
try (BufferedReader in =
new BufferedReader(
new InputStreamReader(regexStream, StandardCharsets.UTF_8))) {
String line;
while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
}
RegexRule rule = createRule(line);
if (rule != null) {
rules.add(rule);
}
}
}
} catch (IOException e) {
LOG.error("There was an error reading the default-regex-filters file");
e.printStackTrace();
LOG.error("There was an error reading the default-regex-filters file", e);
}
return rules;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@
import org.apache.stormcrawler.parse.ParseResult;
import org.apache.xml.serialize.XMLSerializer;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;

/** Dumps the DOM representation of a document into a file. */
public class DebugParseFilter extends ParseFilter {

private static final Logger LOG = LoggerFactory.getLogger(DebugParseFilter.class);

private OutputStream os;

@Override
Expand All @@ -43,7 +47,7 @@ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult
serializer.serialize(doc);
os.flush();
} catch (IOException e) {
e.printStackTrace();
LOG.error("Exception while serializing DOM", e);
}
}

Expand All @@ -53,12 +57,23 @@ public void configure(@NotNull Map<String, Object> stormConf, @NotNull JsonNode
File outFile = Files.createTempFile("DOMDump", ".xml").toFile();
os = FileUtils.openOutputStream(outFile);
} catch (IOException e) {
e.printStackTrace();
LOG.error("Exception while configuring DebugParseFilter", e);
}
}

/**
 * Always returns {@code true}.
 *
 * <p>NOTE(review): presumably this tells the caller that the filter needs the parsed DOM, since
 * {@code filter} serializes the {@code DocumentFragment} it is given; confirm against the
 * {@code ParseFilter} contract.
 *
 * @return {@code true}, unconditionally
 */
@Override
public boolean needsDOM() {
return true;
}

/**
 * Closes the output stream held in {@code os}, if one was opened.
 *
 * <p>Does nothing when {@code os} is {@code null}. An {@link IOException} raised by the close is
 * logged at error level and not rethrown.
 */
@Override
public void cleanup() {
    // Nothing to release when no stream was ever opened.
    if (os == null) {
        return;
    }
    try {
        os.close();
    } catch (IOException closeFailure) {
        LOG.error("Exception while closing output stream in DebugParseFilter", closeFailure);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,8 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except

// https://github.com/microsoft/playwright-java#is-playwright-thread-safe
synchronized (this) {

// tracing// Start tracing before creating / navigating a page.
if (md.containsKey(MD_TRACE)) {
boolean isTracing = md.containsKey(MD_TRACE);
if (isTracing) {
context.tracing()
.start(
new Tracing.StartOptions()
Expand All @@ -201,109 +200,120 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except
final MutableInt status = new MutableInt(-1);
byte[] content = new byte[0];

try (Page page = context.newPage()) {

page.onResponse(
response -> {
// make sure that this applies to the main page
if (response.url().equals(url)) {
// redirection?
if (Status.REDIRECTION.equals(
Status.fromHTTPCode(response.status()))) {
status.set(response.status());
response.allHeaders()
.forEach(
(k, v) -> {
responseMetaData.addValue(k, v);
});
try {
try (Page page = context.newPage()) {

page.onResponse(
response -> {
// make sure that this applies to the main page
if (response.url().equals(url)) {
// redirection?
if (Status.REDIRECTION.equals(
Status.fromHTTPCode(response.status()))) {
status.set(response.status());
response.allHeaders()
.forEach(
(k, v) -> {
responseMetaData.addValue(k, v);
});
}
}
}
});

page.onPageError(
handler -> {
// this applies to any resource - not just the main page
LOG.debug("Error when loading {} {}", url, handler);
});

// NOTE: The handler will only be called for the first url if the
// response is a redirect.
page.route(
lambdaUrl -> true,
route -> {
// abort if we know the main page is a redirection
if (status.get() != -1) {
LOG.debug("Aborting request for {}", route.request().url());
route.abort();
} else if (resourceTypesToSkip.contains(
route.request().resourceType())) {
route.abort();
} else {
route.resume();
}
});

// let playwright do the content loading
com.microsoft.playwright.Response response =
page.navigate(
url,
new Page.NavigateOptions()
.setTimeout(timeout)
.setWaitUntil(loadEvent));

// the status is not set unless
// a redirection
if (status.get() == -1) {
response.allHeaders()
.forEach(
(k, v) -> {
responseMetaData.addValue(k, v);
});

int httpStatus = response.status();
boolean fetched = Status.FETCHED == Status.fromHTTPCode(httpStatus);
boolean contentCaptured = false;

if (fetched || captureContentOnError) {
// run any configured post-navigate actions before capturing content
pageActions.apply(page, url, md, responseMetaData);
// retrieve the rendered content
content = page.content().getBytes(StandardCharsets.UTF_8);
contentCaptured = true;
}
});

page.onPageError(
handler -> {
// this applies to any resource - not just the main page
LOG.debug("Error when loading {} {}", url, handler);
});

// NOTE: The handler will only be called for the first url if the
// response is a redirect.
page.route(
lambdaUrl -> true,
route -> {
// abort if we know the main page is a redirection
if (status.get() != -1) {
LOG.debug("Aborting request for {}", route.request().url());
route.abort();
} else if (resourceTypesToSkip.contains(
route.request().resourceType())) {
route.abort();
} else {
route.resume();
}
});

// let playwright do the content loading
com.microsoft.playwright.Response response =
page.navigate(
url,
new Page.NavigateOptions()
.setTimeout(timeout)
.setWaitUntil(loadEvent));

// the status is not set unless
// a redirection
if (status.get() == -1) {
response.allHeaders()
.forEach(
(k, v) -> {
responseMetaData.addValue(k, v);
});

int httpStatus = response.status();
boolean fetched = Status.FETCHED == Status.fromHTTPCode(httpStatus);
boolean contentCaptured = false;

if (fetched || captureContentOnError) {
// run any configured post-navigate actions before capturing content
pageActions.apply(page, url, md, responseMetaData);
// retrieve the rendered content
content = page.content().getBytes(StandardCharsets.UTF_8);
contentCaptured = true;
}

if (!fetched && contentCaptured && overrideStatusOnContent) {
// expose the original origin status for diagnostics
responseMetaData.setValue(
"playwright.origin.status", Integer.toString(httpStatus));
status.set(200);
} else {
status.set(httpStatus);
}
if (!fetched && contentCaptured && overrideStatusOnContent) {
// expose the original origin status for diagnostics
responseMetaData.setValue(
"playwright.origin.status", Integer.toString(httpStatus));
status.set(200);
} else {
status.set(httpStatus);
}

// evaluate an expression and store the results
// in the metadata using the same string as key
for (String expression : evaluations) {
Object performance = page.evaluate(expression);
if (performance != null) {
String json =
mapper.writerWithDefaultPrettyPrinter()
.writeValueAsString(performance);
responseMetaData.setValue(expression, json);
// evaluate an expression and store the results
// in the metadata using the same string as key
for (String expression : evaluations) {
Object performance = page.evaluate(expression);
if (performance != null) {
String json =
mapper.writerWithDefaultPrettyPrinter()
.writeValueAsString(performance);
responseMetaData.setValue(expression, json);
}
}
}
}
}

if (md.containsKey(MD_TRACE)) {
Path tmp = Files.createTempFile("trace-", ".zip", new FileAttribute[0]);
context.tracing().stop(new Tracing.StopOptions().setPath(tmp));
responseMetaData.setValue(MD_TRACE, tmp.toString());
}
if (isTracing) {
Path tmp = Files.createTempFile("trace-", ".zip", new FileAttribute[0]);
context.tracing().stop(new Tracing.StopOptions().setPath(tmp));
responseMetaData.setValue(MD_TRACE, tmp.toString());
}

responseMetaData.addValue(MD_KEY_END, Instant.now().toString());
responseMetaData.addValue(MD_KEY_END, Instant.now().toString());

return new ProtocolResponse(content, status.get(), responseMetaData);
return new ProtocolResponse(content, status.get(), responseMetaData);

} finally {
if (isTracing && responseMetaData.getFirstValue(MD_TRACE) == null) {
try {
context.tracing().stop(new Tracing.StopOptions());
} catch (Exception e) {
LOG.warn("Exception while stopping tracing on error", e);
}
}
}
}
}

Expand Down