This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 8d7659061 tighten up regex in StandardsText
8d7659061 is described below

commit 8d765906183296906466afa4e61ebcad059a813c
Author: tallison <[email protected]>
AuthorDate: Mon May 23 09:10:45 2022 -0400

    tighten up regex in StandardsText
---
 tika-core/src/main/java/org/apache/tika/sax/StandardsText.java | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index bdba930a7..b4109d9dc 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -60,7 +60,8 @@ public class StandardsText {
        private static final String REGEX_APPLICABLE_DOCUMENTS = 
"(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
 
        // Regular expression to match the alphanumeric identifier of the standard
-       private static final String REGEX_IDENTIFIER = 
"(?<identifier>([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)";
+       private static final String REGEX_IDENTIFIER = 
"(?<identifier>([0-9]{3,20}|([A-Z]+(-|_|\\.)" +
+                       "?[0-9]{2,20}))((-|_|\\.)?[A-Z0-9]+){0,10})";
 
        // Regular expression to match the standard organization
        private static final String REGEX_ORGANIZATION = 
StandardOrganizations.getOrganzationsRegex();
@@ -71,8 +72,9 @@ public class StandardsText {
 
        // Regular expression to match a string that is supposed to be a standard
        // reference
-       private static final String REGEX_FALLBACK = "\\(?" + 
"(?<mainOrganization>[A-Z]\\w+)"
-                       + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w+\\s)*\\(?" + 
"(?<secondOrganization>[A-Z]\\w+)" + "\\)?)?"
+       private static final String REGEX_FALLBACK = "\\(?" + 
"(?<mainOrganization>[A-Z]\\w{1,100})"
+                       + 
"\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,100}\\s)*\\(?" + 
"(?<secondOrganization>[A" +
+                       "-Z]\\w{1,100})" + "\\)?)?"
                        + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + 
REGEX_IDENTIFIER;
 
        // Regular expression to match the standard organization within a string

Reply via email to