Repository: commons-text Updated Branches: refs/heads/SANDBOX-498-OPTIONS [created] 331f80bfc
SANDBOX-498 Add parser options and initialise regular expressions once Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/331f80bf Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/331f80bf Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/331f80bf Branch: refs/heads/SANDBOX-498-OPTIONS Commit: 331f80bfcf0380fcc35a6d18a327aef4a9e844e4 Parents: bf8bfb0 Author: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Authored: Mon Apr 20 15:41:05 2015 +1200 Committer: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Committed: Mon Apr 20 15:41:09 2015 +1200 ---------------------------------------------------------------------- .../commons/text/names/HumanNameParser.java | 73 ++++++++++++-------- .../commons/text/names/ParserOptions.java | 59 ++++++++++++++++ 2 files changed, 102 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/331f80bf/src/main/java/org/apache/commons/text/names/HumanNameParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/names/HumanNameParser.java b/src/main/java/org/apache/commons/text/names/HumanNameParser.java index 5407d15..e7a3927 100644 --- a/src/main/java/org/apache/commons/text/names/HumanNameParser.java +++ b/src/main/java/org/apache/commons/text/names/HumanNameParser.java @@ -17,8 +17,6 @@ */ package org.apache.commons.text.names; -import java.util.Arrays; -import java.util.List; import java.util.Objects; import org.apache.commons.lang3.StringUtils; @@ -100,22 +98,51 @@ import org.apache.commons.lang3.StringUtils; */ public final class HumanNameParser { - private final List<String> suffixes; - private final List<String> prefixes; + /** + * The options used by the parser. + */ + private final ParserOptions options; + + /* + * Regular expressions used by the parser. + */ + // The regex use is a bit tricky. *Everything* matched by the regex will be replaced, + // but you can select a particular parenthesized submatch to be returned. + // Also, note that each regex requres that the preceding ones have been run, and matches chopped out. + // names that starts or end w/ an apostrophe break this + private final static String NICKNAMES_REGEX = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) "; + // note the lookahead, which isn't returned or replaced + private final static String LEADING_INIT_REGEX = "(?i)(^(.\\.*)(?= \\p{L}{2}))"; + private final static String FIRST_NAME_REGEX = "(?i)^([^ ]+)"; + private final String suffixRegex; + private final String lastRegex; + /** * Creates a new parser. */ public HumanNameParser() { - // TODO make this configurable - this.suffixes = Arrays.asList( - "esq", "esquire", "jr", - "sr", "2", "ii", "iii", "iv"); - this.prefixes = Arrays.asList( - "bar", "ben", "bin", "da", "dal", - "de la", "de", "del", "der", "di", "ibn", "la", "le", - "san", "st", "ste", "van", "van der", "van den", "vel", - "von" ); + this(ParserOptions.DEFAULT_OPTIONS); + } + + /** + * Creates a new parser by providing options. + */ + public HumanNameParser(ParserOptions options) { + this.options = options; + final String suffixes = StringUtils.join(options.getSuffixes(), "\\.*|") + "\\.*"; + final String prefixes = StringUtils.join(options.getPrefixes(), " |") + " "; + suffixRegex = "(?i),* *((" + suffixes + ")$)"; + lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$"; + } + + /** + * Gets the parser options. + * + * @return parser options + */ + public ParserOptions getOptions() { + return options; } /** @@ -129,23 +156,9 @@ public final class HumanNameParser { Objects.requireNonNull(name, "Parameter 'name' must not be null."); NameString nameString = new NameString(name); - // TODO compile regexes only once when the parser is created - String suffixes = StringUtils.join(this.suffixes, "\\.*|") + "\\.*"; - String prefixes = StringUtils.join(this.prefixes, " |") + " "; - - // The regex use is a bit tricky. *Everything* matched by the regex will be replaced, - // but you can select a particular parenthesized submatch to be returned. - // Also, note that each regex requres that the preceding ones have been run, and matches chopped out. - // names that starts or end w/ an apostrophe break this - String nicknamesRegex = "(?i) ('|\\\"|\\(\\\"*'*)(.+?)('|\\\"|\\\"*'*\\)) "; - String suffixRegex = "(?i),* *((" + suffixes + ")$)"; - String lastRegex = "(?i)(?!^)\\b([^ ]+ y |" + prefixes + ")*[^ ]+$"; - // note the lookahead, which isn't returned or replaced - String leadingInitRegex = "(?i)(^(.\\.*)(?= \\p{L}{2}))"; - String firstRegex = "(?i)^([^ ]+)"; // get nickname, if there is one - String nickname = nameString.chopWithRegex(nicknamesRegex, 2); + String nickname = nameString.chopWithRegex(NICKNAMES_REGEX, 2); // get suffix, if there is one String suffix = nameString.chopWithRegex(suffixRegex, 1); @@ -157,10 +170,10 @@ public final class HumanNameParser { String last = nameString.chopWithRegex(lastRegex, 0); // get the first initial, if there is one - String leadingInit = nameString.chopWithRegex(leadingInitRegex, 1); + String leadingInit = nameString.chopWithRegex(LEADING_INIT_REGEX, 1); // get the first name - String first = nameString.chopWithRegex(firstRegex, 0); + String first = nameString.chopWithRegex(FIRST_NAME_REGEX, 0); if (StringUtils.isBlank(first)) { throw new NameParseException("Couldn't find a first name in '{" + nameString.getWrappedString() + "}'"); } http://git-wip-us.apache.org/repos/asf/commons-text/blob/331f80bf/src/main/java/org/apache/commons/text/names/ParserOptions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/names/ParserOptions.java b/src/main/java/org/apache/commons/text/names/ParserOptions.java new file mode 100644 index 0000000..6bca771 --- /dev/null +++ b/src/main/java/org/apache/commons/text/names/ParserOptions.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.names; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * Options for the {@link HumanNameParser} parser. + */ +public final class ParserOptions { + + public static final ParserOptions DEFAULT_OPTIONS = new ParserOptions(); + + private final Set<String> suffixes; + + private final Set<String> prefixes; + + public ParserOptions() { + this.suffixes = new HashSet<String>(Arrays.asList( + "esq", "esquire", "jr", + "sr", "2", "ii", "iii", "iv")); + this.prefixes = new HashSet<String>(Arrays.asList( + "bar", "ben", "bin", "da", "dal", + "de la", "de", "del", "der", "di", "ibn", "la", "le", + "san", "st", "ste", "van", "van der", "van den", "vel", + "von")); + } + + /** + * @return the suffixes + */ + public Set<String> getSuffixes() { + return suffixes; + } + + /** + * @return the prefixes + */ + public Set<String> getPrefixes() { + return prefixes; + } + +}