This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 8d7659061 tighten up regex in StandardsText
8d7659061 is described below
commit 8d765906183296906466afa4e61ebcad059a813c
Author: tallison <[email protected]>
AuthorDate: Mon May 23 09:10:45 2022 -0400
tighten up regex in StandardsText
---
tika-core/src/main/java/org/apache/tika/sax/StandardsText.java | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index bdba930a7..b4109d9dc 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -60,7 +60,8 @@ public class StandardsText {
private static final String REGEX_APPLICABLE_DOCUMENTS =
"(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
// Regular expression to match the alphanumeric identifier of the standard
- private static final String REGEX_IDENTIFIER =
"(?<identifier>([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)";
+ private static final String REGEX_IDENTIFIER =
"(?<identifier>([0-9]{3,20}|([A-Z]+(-|_|\\.)" +
+ "?[0-9]{2,20}))((-|_|\\.)?[A-Z0-9]+){0,10})";
// Regular expression to match the standard organization
private static final String REGEX_ORGANIZATION =
StandardOrganizations.getOrganzationsRegex();
@@ -71,8 +72,9 @@ public class StandardsText {
// Regular expression to match a string that is supposed to be a standard
// reference
- private static final String REGEX_FALLBACK = "\\(?" +
"(?<mainOrganization>[A-Z]\\w+)"
- + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w+\\s)*\\(?" +
"(?<secondOrganization>[A-Z]\\w+)" + "\\)?)?"
+ private static final String REGEX_FALLBACK = "\\(?" +
"(?<mainOrganization>[A-Z]\\w{1,100})"
+ +
"\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,100}\\s)*\\(?" +
"(?<secondOrganization>[A" +
+ "-Z]\\w{1,100})" + "\\)?)?"
+ REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" +
REGEX_IDENTIFIER;
// Regular expression to match the standard organization within a string