Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,13 @@
import org.apache.storm.utils.Utils;
import org.apache.stormcrawler.persistence.Status;
import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public abstract class ConfigurableTopology {

private static final Logger LOG = LoggerFactory.getLogger(ConfigurableTopology.class);

protected Config conf = new Config();

public static void start(ConfigurableTopology topology, String[] args) {
Expand Down Expand Up @@ -70,7 +74,7 @@ protected int submit(String name, Config conf, TopologyBuilder builder) {
try {
StormSubmitter.submitTopology(name, conf, builder.createTopology());
} catch (Exception e) {
e.printStackTrace();
LOG.error("Failed to submit topology: {}", name, e);
return -1;
}
return 0;
Expand Down
27 changes: 24 additions & 3 deletions core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java
Original file line number Diff line number Diff line change
Expand Up @@ -364,12 +364,26 @@ public synchronized FetchItemQueue getFetchItemQueue(String id, Metadata metadat
// custom crawl delay from metadata?
String v = metadata.getFirstValue(CRAWL_DELAY_KEY_NAME);
if (v != null) {
delay = Long.parseLong(v);
try {
delay = Long.parseLong(v);
} catch (NumberFormatException e) {
LOG.warn(
"Invalid crawl delay value '{}' in metadata for queue '{}', using default.",
v,
id);
}
}
// custom min crawl delay from metadata?
v = metadata.getFirstValue(CRAWL_MIN_DELAY_KEY_NAME);
if (v != null) {
minDelay = Long.parseLong(v);
try {
minDelay = Long.parseLong(v);
} catch (NumberFormatException e) {
LOG.warn(
"Invalid min crawl delay value '{}' in metadata for queue '{}', using default.",
v,
id);
}
}
}

Expand All @@ -388,7 +402,14 @@ public synchronized FetchItemQueue getFetchItemQueue(String id, Metadata metadat
if (metadata != null) {
final String val = metadata.getFirstValue(CRAWL_MAX_THREAD_KEY_NAME);
if (val != null) {
threadVal = Integer.parseInt(val);
try {
threadVal = Integer.parseInt(val);
} catch (NumberFormatException e) {
LOG.warn(
"Invalid max threads value '{}' in metadata for queue '{}', using default.",
val,
id);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ public static void main(String[] args) throws ParseException {
LOG.error("URL filtering threw exception", e);
}
} catch (IOException e) {
e.printStackTrace();
LOG.error("Failed to initialize URLFilters", e);
System.exit(-1);
}
System.exit(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
Expand Down Expand Up @@ -80,24 +79,27 @@ private List<RegexRule> readRules(ArrayNode rulesList) {
private List<RegexRule> readRules(String rulesFile) {
List<RegexRule> rules = new ArrayList<>();

try {
InputStream regexStream = getClass().getClassLoader().getResourceAsStream(rulesFile);
Reader reader = new InputStreamReader(regexStream, StandardCharsets.UTF_8);
BufferedReader in = new BufferedReader(reader);
String line;

while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
}
RegexRule rule = createRule(line);
if (rule != null) {
rules.add(rule);
try (InputStream regexStream = getClass().getClassLoader().getResourceAsStream(rulesFile)) {
if (regexStream == null) {
LOG.error("Regex filter file '{}' not found in classpath", rulesFile);
return rules;
}
try (BufferedReader in =
new BufferedReader(
new InputStreamReader(regexStream, StandardCharsets.UTF_8))) {
String line;
while ((line = in.readLine()) != null) {
if (line.length() == 0) {
continue;
}
RegexRule rule = createRule(line);
if (rule != null) {
rules.add(rule);
}
}
}
} catch (IOException e) {
LOG.error("There was an error reading the default-regex-filters file");
e.printStackTrace();
LOG.error("There was an error reading the default-regex-filters file", e);
}
return rules;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,15 @@
import org.apache.stormcrawler.parse.ParseResult;
import org.apache.xml.serialize.XMLSerializer;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;

/** Dumps the DOM representation of a document into a file. */
public class DebugParseFilter extends ParseFilter {

private static final Logger LOG = LoggerFactory.getLogger(DebugParseFilter.class);

private OutputStream os;

@Override
Expand All @@ -43,7 +47,7 @@ public void filter(String url, byte[] content, DocumentFragment doc, ParseResult
serializer.serialize(doc);
os.flush();
} catch (IOException e) {
e.printStackTrace();
LOG.error("Exception while serializing DOM", e);
}
}

Expand All @@ -53,12 +57,23 @@ public void configure(@NotNull Map<String, Object> stormConf, @NotNull JsonNode
File outFile = Files.createTempFile("DOMDump", ".xml").toFile();
os = FileUtils.openOutputStream(outFile);
} catch (IOException e) {
e.printStackTrace();
LOG.error("Exception while configuring DebugParseFilter", e);
}
}

/**
 * Always returns {@code true}.
 *
 * <p>NOTE(review): presumably signals to the caller that a DOM must be supplied, since {@code
 * filter} serializes the {@code DocumentFragment} it is given — confirm against {@code
 * ParseFilter}.
 */
@Override
public boolean needsDOM() {
return true;
}

/**
 * Closes the output stream opened in {@code configure}, if there is one.
 *
 * <p>A failure to close is logged and not rethrown.
 */
@Override
public void cleanup() {
    // Nothing to release when no stream was ever opened.
    if (os == null) {
        return;
    }
    try {
        os.close();
    } catch (IOException e) {
        LOG.error("Exception while closing output stream in DebugParseFilter", e);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpHeaders;
Expand All @@ -37,8 +35,9 @@

public class FileResponse {

static final SimpleDateFormat dateFormat =
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
static final java.time.format.DateTimeFormatter DATE_FORMATTER =
java.time.format.DateTimeFormatter.ofPattern("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US)
.withZone(java.time.ZoneId.systemDefault());
static final org.slf4j.Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

Expand Down Expand Up @@ -101,8 +100,8 @@ private void getFileAsHttpResponse(File file) {
return;
}

try {
content = IOUtils.toByteArray(new FileInputStream(file), size);
try (FileInputStream fis = new FileInputStream(file)) {
content = IOUtils.toByteArray(fis, size);
} catch (IOException | IllegalArgumentException e) {
LOG.error("Exception while fetching file response {} ", file.getPath(), e);
statusCode = HttpStatus.SC_METHOD_FAILURE;
Expand All @@ -122,7 +121,7 @@ private void getDirAsHttpResponse(File file) {
}

private static String formatDate(long date) {
return dateFormat.format(new Date(date));
return DATE_FORMATTER.format(java.time.Instant.ofEpochMilli(date));
}

private byte[] generateSitemap(File dir) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
package org.apache.stormcrawler.util;

import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
Expand All @@ -30,8 +28,8 @@
/** Helper to extract cookies from cookies string. */
public class CookieConverter {

private static final SimpleDateFormat DATE_FORMAT =
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.ENGLISH);
private static final org.slf4j.Logger LOG =
org.slf4j.LoggerFactory.getLogger(CookieConverter.class);

/**
* Get a list of cookies based on the cookies string taken from response header and the target
Expand Down Expand Up @@ -110,17 +108,17 @@ public static List<Cookie> getCookies(String[] cookiesStrings, URL targetURL) {
// check expiration
if (expires != null) {
try {
Date expirationDate = DATE_FORMAT.parse(expires);
cookie.setExpiryDate(expirationDate);

// check that it hasn't expired?
if (cookie.isExpired(new Date())) {
continue;
Date expirationDate = org.apache.http.client.utils.DateUtils.parseDate(expires);
if (expirationDate != null) {
cookie.setExpiryDate(expirationDate);

// check that it hasn't expired?
if (cookie.isExpired(new Date())) {
continue;
}
}

cookie.setExpiryDate(expirationDate);
} catch (ParseException e) {
// ignore exceptions
} catch (Exception e) {
LOG.debug("Could not parse cookie expiry date: {}", expires, e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
// Utility class used to extract refresh tags from HTML pages
public abstract class RefreshTag {

private static final Matcher MATCHER =
Pattern.compile("^.*;\\s*URL='?(.+?)'?$", Pattern.CASE_INSENSITIVE).matcher("");
private static final Pattern PATTERN =
Pattern.compile("^.*;\\s*URL='?(.+?)'?$", Pattern.CASE_INSENSITIVE);

private static final Evaluator EVALUATOR =
QueryParser.parse("meta[http-equiv~=(?i)refresh][content]");
Expand All @@ -42,8 +42,9 @@ public static String extractRefreshURL(String value) {

// 0;URL=http://www.apollocolors.com/site
try {
if (MATCHER.reset(value).matches()) {
return MATCHER.group(1);
Matcher matcher = PATTERN.matcher(value);
if (matcher.matches()) {
return matcher.group(1);
}
} catch (Exception e) {
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,15 @@
package org.apache.stormcrawler.aws.bolt;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;

public class CloudSearchUtils {

private static MessageDigest digester;

private static final Pattern INVALID_XML_CHARS =
Pattern.compile("[^\\t\\n\\r -\\uD7FF\\uE000-\\uFFFD]");

static {
try {
digester = MessageDigest.getInstance("SHA-512");
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
}

// Private constructor: prevents instantiation from outside this class.
private CloudSearchUtils() {}

/** Returns a normalised doc ID based on the URL of a document. */
Expand All @@ -51,8 +39,7 @@ public static String getID(String url) {
// letter or number and the following characters: _ - = # ; : / ? @
// &. Document IDs must be at least 1 and no more than 128
// characters long.
byte[] dig = digester.digest(url.getBytes(StandardCharsets.UTF_8));
String ID = Hex.encodeHexString(dig);
String ID = DigestUtils.sha512Hex(url.getBytes(StandardCharsets.UTF_8));
// is that even possible?
if (ID.length() > 128) {
throw new RuntimeException("ID larger than max 128 chars");
Expand Down Expand Up @@ -81,7 +68,7 @@ public static String cleanFieldName(String name) {
throw new RuntimeException("Field name must be between 3 and 64 chars : " + lowercase);
}
if (lowercase.equals("score")) {
throw new RuntimeException("Field name must be score");
throw new RuntimeException("Field name must NOT be score");
}
return lowercase;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.S3Object;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;
import org.apache.commons.io.IOUtils;
Expand Down Expand Up @@ -65,12 +64,7 @@ public void execute(Tuple tuple) {
Metadata metadata = (Metadata) tuple.getValueByField("metadata");

// normalises URL
String key = "";
try {
key = URLEncoder.encode(url, "UTF-8");
} catch (UnsupportedEncodingException e) {
// ignore it - we know UTF-8 is valid
}
String key = URLEncoder.encode(url, java.nio.charset.StandardCharsets.UTF_8);
// check size of the key
if (key.length() >= 1024) {
LOG.info("Key too large : {}", key);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import com.amazonaws.services.s3.model.PutObjectResult;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;
import org.apache.storm.task.OutputCollector;
Expand Down Expand Up @@ -94,12 +93,7 @@ public void execute(Tuple tuple) {
}

// normalises URL
String key = "";
try {
key = URLEncoder.encode(url, "UTF-8");
} catch (UnsupportedEncodingException e) {
// ignore it - we know UTF-8 is valid
}
String key = URLEncoder.encode(url, java.nio.charset.StandardCharsets.UTF_8);
// check size of the key
if (key.length() >= 1024) {
LOG.info("Key too large : {}", key);
Expand Down
Loading