[
https://issues.apache.org/jira/browse/NUTCH-3161?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18073831#comment-18073831
]
ASF GitHub Bot commented on NUTCH-3161:
---------------------------------------
sebastian-nagel commented on code in PR #904:
URL: https://github.com/apache/nutch/pull/904#discussion_r3089526476
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
* <code>byte[]</code> representation of an html file
*/
+ /**
+ * Extracts charset value from a string like "charset=utf-8" or "charset =
utf-8".
+ * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain
only [a-z0-9_-].
+ */
+ private static String extractCharsetValue(String s, int fromIndex) {
+ int idx = s.indexOf(CHARSET_EQ, fromIndex);
+ if (idx < 0) {
+ return null;
+ }
+ int start = idx + CHARSET_EQ.length();
+ while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) ==
'\t')) {
+ start++;
+ }
+ if (start >= s.length()) {
+ return null;
+ }
+ char first = s.charAt(start);
+ if (first != '"' && first != '\'' && (first < 'a' || first > 'z') &&
(first < 'A' || first > 'Z')) {
+ return null;
+ }
+ if (first == '"' || first == '\'') {
+ start++;
+ }
+ int end = start;
+ while (end < s.length()) {
+ char c = s.charAt(end);
+ if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c ==
'>') {
+ break;
+ }
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c
<= '9') || c == '_' || c == '-') {
+ end++;
+ } else {
+ break;
+ }
+ }
+ return end > start ? s.substring(start, end) : null;
+ }
+
+ /**
+ * Finds charset from HTML string using linear scans only (no backtracking
regex).
+ * Checks meta http-equiv Content-Type then HTML5 meta charset.
+ * Package-private for unit testing.
+ */
+ static String extractCharsetFromMeta(String str) {
+ String lower = str.toLowerCase();
Review Comment:
Should add `Locale.ROOT`.
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[]
content) {
// {U+0041, U+0082, U+00B7}.
String str = new String(content, 0, length, StandardCharsets.US_ASCII);
Review Comment:
The content bytes are converted into a String assuming ASCII encoding. If a
"hand-crafted" scanner is used, could directly operate on the bytes avoiding
the conversion to a string. Alternatively, could use a CharSequence wrapping
the bytes, cf.
[ByteArrayCharSequence](https://github.com/commoncrawl/nutch/blob/cc/src/java/org/commoncrawl/util/ByteArrayCharSequence.java).
But this can be an improvement for later.
##########
src/java/org/apache/nutch/parse/ParseOutputFormat.java:
##########
@@ -177,8 +194,8 @@ public RecordWriter<Text, Parse>
getRecordWriter(TaskAttemptContext context)
Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
- final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "")
Review Comment:
It's a string from the configuration, it's controlled by the user and should
be short. It's parsed once, so this is for sure not critical.
Apart from that should simply rely on
[getTrimmedStrings](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/conf/Configuration.html#getTrimmedStrings(java.lang.String,java.lang.String...)).
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -64,15 +62,10 @@ public class HtmlParser implements Parser {
// NUTCH-2042 (cf. TIKA-357): increased to 8 kB
private static final int CHUNK_SIZE = 8192;
- // NUTCH-1006 Meta equiv with single quotes not accepted
- private static Pattern metaPattern = Pattern.compile(
- "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
- Pattern.CASE_INSENSITIVE);
- private static Pattern charsetPattern = Pattern.compile(
- "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
- private static Pattern charsetPatternHTML5 = Pattern.compile(
- "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
- Pattern.CASE_INSENSITIVE);
+ private static final String META_TAG_START = "<meta";
+ private static final String CHARSET_EQ = "charset=";
Review Comment:
The regex allows white space between "charset" and "=".
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
* <code>byte[]</code> representation of an html file
*/
+ /**
+ * Extracts charset value from a string like "charset=utf-8" or "charset =
utf-8".
+ * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain
only [a-z0-9_-].
+ */
+ private static String extractCharsetValue(String s, int fromIndex) {
+ int idx = s.indexOf(CHARSET_EQ, fromIndex);
+ if (idx < 0) {
+ return null;
+ }
+ int start = idx + CHARSET_EQ.length();
+ while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) ==
'\t')) {
+ start++;
+ }
+ if (start >= s.length()) {
+ return null;
+ }
+ char first = s.charAt(start);
+ if (first != '"' && first != '\'' && (first < 'a' || first > 'z') &&
(first < 'A' || first > 'Z')) {
+ return null;
+ }
+ if (first == '"' || first == '\'') {
+ start++;
+ }
+ int end = start;
+ while (end < s.length()) {
+ char c = s.charAt(end);
+ if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c ==
'>') {
+ break;
+ }
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c
<= '9') || c == '_' || c == '-') {
+ end++;
+ } else {
+ break;
+ }
+ }
+ return end > start ? s.substring(start, end) : null;
+ }
+
+ /**
+ * Finds charset from HTML string using linear scans only (no backtracking
regex).
+ * Checks meta http-equiv Content-Type then HTML5 meta charset.
+ * Package-private for unit testing.
+ */
+ static String extractCharsetFromMeta(String str) {
+ String lower = str.toLowerCase();
+ int pos = 0;
+ while (true) {
+ int metaStart = lower.indexOf(META_TAG_START, pos);
+ if (metaStart < 0) {
+ break;
+ }
+ int tagEnd = str.indexOf('>', metaStart);
+ if (tagEnd < 0) {
+ break;
+ }
+ String tagContent = str.substring(metaStart, tagEnd);
Review Comment:
Of course, if we strictly ensure that the method is used on String only
including ASCII characters and using always the root locale, this should be
safe.
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
* <code>byte[]</code> representation of an html file
*/
+ /**
+ * Extracts charset value from a string like "charset=utf-8" or "charset =
utf-8".
+ * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain
only [a-z0-9_-].
+ */
+ private static String extractCharsetValue(String s, int fromIndex) {
+ int idx = s.indexOf(CHARSET_EQ, fromIndex);
+ if (idx < 0) {
+ return null;
+ }
+ int start = idx + CHARSET_EQ.length();
+ while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) ==
'\t')) {
+ start++;
+ }
+ if (start >= s.length()) {
+ return null;
+ }
+ char first = s.charAt(start);
+ if (first != '"' && first != '\'' && (first < 'a' || first > 'z') &&
(first < 'A' || first > 'Z')) {
+ return null;
+ }
+ if (first == '"' || first == '\'') {
+ start++;
+ }
+ int end = start;
+ while (end < s.length()) {
+ char c = s.charAt(end);
+ if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c ==
'>') {
+ break;
+ }
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c
<= '9') || c == '_' || c == '-') {
+ end++;
+ } else {
+ break;
+ }
+ }
+ return end > start ? s.substring(start, end) : null;
+ }
+
+ /**
+ * Finds charset from HTML string using linear scans only (no backtracking
regex).
+ * Checks meta http-equiv Content-Type then HTML5 meta charset.
+ * Package-private for unit testing.
+ */
+ static String extractCharsetFromMeta(String str) {
+ String lower = str.toLowerCase();
+ int pos = 0;
+ while (true) {
+ int metaStart = lower.indexOf(META_TAG_START, pos);
+ if (metaStart < 0) {
+ break;
+ }
+ int tagEnd = str.indexOf('>', metaStart);
+ if (tagEnd < 0) {
+ break;
+ }
+ String tagContent = str.substring(metaStart, tagEnd);
Review Comment:
Assuming that indexes in the lower-case string and the original one are the
same might be dangerous, for example, in German there is the pair "ß" <> "SS"
(would held for uppercasing the string). See [Unicode TR #21 Case
Mappings](https://www.unicode.org/L2/L1999/99190.htm): "Case mappings may
produce strings of different length than the original."
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
* <code>byte[]</code> representation of an html file
*/
+ /**
+ * Extracts charset value from a string like "charset=utf-8" or "charset =
utf-8".
+ * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain
only [a-z0-9_-].
+ */
+ private static String extractCharsetValue(String s, int fromIndex) {
+ int idx = s.indexOf(CHARSET_EQ, fromIndex);
+ if (idx < 0) {
+ return null;
+ }
+ int start = idx + CHARSET_EQ.length();
+ while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) ==
'\t')) {
Review Comment:
`\\s` matches more characters than blank (U+0020) and the tab character.
##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
* <code>byte[]</code> representation of an html file
*/
+ /**
+ * Extracts charset value from a string like "charset=utf-8" or "charset =
utf-8".
+ * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain
only [a-z0-9_-].
+ */
+ private static String extractCharsetValue(String s, int fromIndex) {
+ int idx = s.indexOf(CHARSET_EQ, fromIndex);
+ if (idx < 0) {
+ return null;
+ }
+ int start = idx + CHARSET_EQ.length();
+ while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) ==
'\t')) {
+ start++;
+ }
+ if (start >= s.length()) {
+ return null;
+ }
+ char first = s.charAt(start);
+ if (first != '"' && first != '\'' && (first < 'a' || first > 'z') &&
(first < 'A' || first > 'Z')) {
+ return null;
+ }
+ if (first == '"' || first == '\'') {
+ start++;
+ }
+ int end = start;
+ while (end < s.length()) {
+ char c = s.charAt(end);
+ if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c ==
'>') {
+ break;
+ }
+ if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c
<= '9') || c == '_' || c == '-') {
+ end++;
+ } else {
+ break;
+ }
+ }
+ return end > start ? s.substring(start, end) : null;
+ }
+
+ /**
+ * Finds charset from HTML string using linear scans only (no backtracking
regex).
+ * Checks meta http-equiv Content-Type then HTML5 meta charset.
+ * Package-private for unit testing.
+ */
+ static String extractCharsetFromMeta(String str) {
+ String lower = str.toLowerCase();
+ int pos = 0;
+ while (true) {
+ int metaStart = lower.indexOf(META_TAG_START, pos);
+ if (metaStart < 0) {
+ break;
+ }
+ int tagEnd = str.indexOf('>', metaStart);
+ if (tagEnd < 0) {
+ break;
+ }
+ String tagContent = str.substring(metaStart, tagEnd);
+ String tagLower = tagContent.toLowerCase();
Review Comment:
Same: must use `Locale.ROOT`.
> Address Sonarcloud High and Medium Security Hotspots
> ----------------------------------------------------
>
> Key: NUTCH-3161
> URL: https://issues.apache.org/jira/browse/NUTCH-3161
> Project: Nutch
> Issue Type: Task
> Components: ci/cd, security
> Affects Versions: 1.22
> Reporter: Lewis John McGibbney
> Assignee: Lewis John McGibbney
> Priority: Critical
> Fix For: 1.23
>
>
> Sonarcloud Security Hotspots can be viewed at
> [https://sonarcloud.io/project/security_hotspots?id=apache_nutch]
--
This message was sent by Atlassian Jira
(v8.20.10#820010)