[ 
https://issues.apache.org/jira/browse/NUTCH-3161?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18073831#comment-18073831
 ] 

ASF GitHub Bot commented on NUTCH-3161:
---------------------------------------

sebastian-nagel commented on code in PR #904:
URL: https://github.com/apache/nutch/pull/904#discussion_r3089526476


##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();

Review Comment:
   Should add `Locale.ROOT`.



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[] 
content) {
     // {U+0041, U+0082, U+00B7}.
     String str = new String(content, 0, length, StandardCharsets.US_ASCII);

Review Comment:
   The content bytes are converted into a String assuming ASCII encoding. If a 
"hand-crafted" scanner is used, could directly operate on the bytes avoiding 
the conversion to a string. Alternatively, could use a CharSequence wrapping 
the bytes, cf. 
[ByteArrayCharSequence](https://github.com/commoncrawl/nutch/blob/cc/src/java/org/commoncrawl/util/ByteArrayCharSequence.java).
 But this can be an improvement for later.



##########
src/java/org/apache/nutch/parse/ParseOutputFormat.java:
##########
@@ -177,8 +194,8 @@ public RecordWriter<Text, Parse> 
getRecordWriter(TaskAttemptContext context)
     Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
     Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
 
-    final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "")

Review Comment:
   It's a string from the configuration, it's controlled by the user and should 
be short. It's parsed once, so this is for sure not critical.
   
   Apart from that should simply rely on 
[getTrimmedStrings](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/conf/Configuration.html#getTrimmedStrings(java.lang.String,java.lang.String...)).



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -64,15 +62,10 @@ public class HtmlParser implements Parser {
   // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
   private static final int CHUNK_SIZE = 8192;
 
-  // NUTCH-1006 Meta equiv with single quotes not accepted
-  private static Pattern metaPattern = Pattern.compile(
-      "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
-      Pattern.CASE_INSENSITIVE);
-  private static Pattern charsetPattern = Pattern.compile(
-      "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
-  private static Pattern charsetPatternHTML5 = Pattern.compile(
-      "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
-      Pattern.CASE_INSENSITIVE);
+  private static final String META_TAG_START = "<meta";
+  private static final String CHARSET_EQ = "charset=";

Review Comment:
   The regex allows white space between "charset" and "=".



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        break;
+      }
+      int tagEnd = str.indexOf('>', metaStart);
+      if (tagEnd < 0) {
+        break;
+      }
+      String tagContent = str.substring(metaStart, tagEnd);

Review Comment:
   Of course, if we strictly ensure that the method is used on String only 
including ASCII characters and using always the root locale, this should be 
safe.



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        break;
+      }
+      int tagEnd = str.indexOf('>', metaStart);
+      if (tagEnd < 0) {
+        break;
+      }
+      String tagContent = str.substring(metaStart, tagEnd);

Review Comment:
   Assuming that indexes in the lower-case string and the original one are the 
same might be dangerous, for example, in German there is the pair "ß" <> "SS" 
(would held for uppercasing the string). See [Unicode TR #21 Case 
Mappings](https://www.unicode.org/L2/L1999/99190.htm): "Case mappings may 
produce strings of different length than the original."



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {

Review Comment:
   `\\s` matches more characters than blank (U+0020) and the tab character. 



##########
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java:
##########
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts charset value from a string like "charset=utf-8" or "charset = 
utf-8".
+   * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain 
only [a-z0-9_-].
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    int idx = s.indexOf(CHARSET_EQ, fromIndex);
+    if (idx < 0) {
+      return null;
+    }
+    int start = idx + CHARSET_EQ.length();
+    while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == 
'\t')) {
+      start++;
+    }
+    if (start >= s.length()) {
+      return null;
+    }
+    char first = s.charAt(start);
+    if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && 
(first < 'A' || first > 'Z')) {
+      return null;
+    }
+    if (first == '"' || first == '\'') {
+      start++;
+    }
+    int end = start;
+    while (end < s.length()) {
+      char c = s.charAt(end);
+      if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == 
'>') {
+        break;
+      }
+      if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c 
<= '9') || c == '_' || c == '-') {
+        end++;
+      } else {
+        break;
+      }
+    }
+    return end > start ? s.substring(start, end) : null;
+  }
+
+  /**
+   * Finds charset from HTML string using linear scans only (no backtracking 
regex).
+   * Checks meta http-equiv Content-Type then HTML5 meta charset.
+   * Package-private for unit testing.
+   */
+  static String extractCharsetFromMeta(String str) {
+    String lower = str.toLowerCase();
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        break;
+      }
+      int tagEnd = str.indexOf('>', metaStart);
+      if (tagEnd < 0) {
+        break;
+      }
+      String tagContent = str.substring(metaStart, tagEnd);
+      String tagLower = tagContent.toLowerCase();

Review Comment:
   Same: must use `Locale.ROOT`.





> Address Sonarcloud High and Medium Security Hotspots
> ----------------------------------------------------
>
>                 Key: NUTCH-3161
>                 URL: https://issues.apache.org/jira/browse/NUTCH-3161
>             Project: Nutch
>          Issue Type: Task
>          Components: ci/cd, security
>    Affects Versions: 1.22
>            Reporter: Lewis John McGibbney
>            Assignee: Lewis John McGibbney
>            Priority: Critical
>             Fix For: 1.23
>
>
> Sonarcloud Security Hotspots can be viewed at 
> [https://sonarcloud.io/project/security_hotspots?id=apache_nutch]



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to