sebastian-nagel commented on code in PR #904:
URL: https://github.com/apache/nutch/pull/904#discussion_r3089526476


##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();

Review Comment:
   Should add `Locale.ROOT`.



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[] 
content) {
     // {U+0041, U+0082, U+00B7}.
     String str = new String(content, 0, length, StandardCharsets.US_ASCII);

Review Comment:
   The content bytes are converted into a String assuming ASCII encoding. If a 
"hand-crafted" scanner is used, could directly operate on the bytes avoiding 
the conversion to a string. Alternatively, could use a CharSequence wrapping 
the bytes, cf. 
[ByteArrayCharSequence](https://github.com/commoncrawl/nutch/blob/cc/src/java/org/commoncrawl/util/ByteArrayCharSequence.java).
 But this can be an improvement for later.



##########
src/java/org/apache/nutch/parse/ParseOutputFormat.java:
##########
@@ -177,8 +194,8 @@ public RecordWriter<Text, Parse> 
getRecordWriter(TaskAttemptContext context)
     Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
     Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
 
-    final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "")

Review Comment:
   It's a string from the configuration, it's controlled by the user and should 
be short. It's parsed once, so this is for sure not critical.
   
   Apart from that should simply rely on 
[getTrimmedStrings](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/conf/Configuration.html#getTrimmedStrings(java.lang.String,java.lang.String...)).



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -64,15 +62,10 @@ public class HtmlParser implements Parser {
   // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
   private static final int CHUNK_SIZE = 8192;
 
-  // NUTCH-1006 Meta equiv with single quotes not accepted
-  private static Pattern metaPattern = Pattern.compile(
-      "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
-      Pattern.CASE_INSENSITIVE);
-  private static Pattern charsetPattern = Pattern.compile(
-      "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
-  private static Pattern charsetPatternHTML5 = Pattern.compile(
-      "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
-      Pattern.CASE_INSENSITIVE);
+  private static final String META_TAG_START = "<meta";
+  private static final String CHARSET_EQ = "charset=";

Review Comment:
   The regex allows white space between "charset" and "=".



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        break;
+      }
+      int tagEnd = str.indexOf('>', metaStart);
+      if (tagEnd < 0) {
+        break;
+      }
+      String tagContent = str.substring(metaStart, tagEnd);

Review Comment:
   Of course, if we strictly ensure that the method is used on String only 
including ASCII characters and using always the root locale, this should be 
safe.



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        break;
+      }
+      int tagEnd = str.indexOf('>', metaStart);
+      if (tagEnd < 0) {
+        break;
+      }
+      String tagContent = str.substring(metaStart, tagEnd);

Review Comment:
   Assuming that indexes in the lower-case string and the original one are the 
same might be dangerous, for example, in German there is the pair "ß" <> "SS" 
(would held for uppercasing the string). See [Unicode TR #21 Case 
Mappings](https://www.unicode.org/L2/L1999/99190.htm): "Case mappings may 
produce strings of different length than the original."



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {

Review Comment:
   `\\s` matches more characters than blank (U+0020) and the tab character. 



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        break;
+      }
+      int tagEnd = str.indexOf('>', metaStart);
+      if (tagEnd < 0) {
+        break;
+      }
+      String tagContent = str.substring(metaStart, tagEnd);
+      String tagLower = tagContent.toLowerCase();

Review Comment:
   Same: must use `Locale.ROOT`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to