Move classes from the internal package into the package where they are used and make them package-private.
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/df681238 Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/df681238 Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/df681238 Branch: refs/heads/master Commit: df681238bf5bcb2fece950b644a7d00a712d0cc8 Parents: 75db6de Author: Benedikt Ritter <brit...@apache.org> Authored: Sun Apr 19 10:32:13 2015 +0200 Committer: Benedikt Ritter <brit...@apache.org> Committed: Sun Apr 19 10:37:50 2015 +0200 ---------------------------------------------------------------------- .../commons/text/similarity/CosineDistance.java | 6 -- .../apache/commons/text/similarity/Counter.java | 60 ++++++++++++++++++++ .../commons/text/similarity/RegexTokenizer.java | 50 ++++++++++++++++ .../commons/text/similarity/Tokenizer.java | 34 +++++++++++ .../text/similarity/internal/Counter.java | 60 -------------------- .../similarity/internal/RegexTokenizer.java | 50 ---------------- .../text/similarity/internal/Tokenizer.java | 34 ----------- .../text/similarity/internal/package-info.java | 23 -------- .../commons/text/similarity/package-info.java | 2 +- 9 files changed, 145 insertions(+), 174 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/CosineDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java index 2fa4515..c5e8853 100644 --- a/src/main/java/org/apache/commons/text/similarity/CosineDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/CosineDistance.java @@ -18,17 +18,11 @@ package org.apache.commons.text.similarity; import java.util.Map; -import 
org.apache.commons.text.similarity.internal.Counter; -import org.apache.commons.text.similarity.internal.RegexTokenizer; -import org.apache.commons.text.similarity.internal.Tokenizer; - /** * Measures the cosine distance between two character sequences. * * <p>It utilizes the CosineSimilarity to compute the distance. Character sequences * are converted into vectors through a simple tokenizer that works with </p> - * - * @see org.apache.commons.text.similarity.internal.RegexTokenizer */ public class CosineDistance implements EditDistance<Double> { /** http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/Counter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/Counter.java b/src/main/java/org/apache/commons/text/similarity/Counter.java new file mode 100644 index 0000000..5eefc51 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/Counter.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import java.util.HashMap; +import java.util.Map; + +/** + * Java implementation of Python's collections Counter module. + * + * <p>It counts how many times each element provided occurred in an array and + * returns a dict with the element as key and the count as value.</p> + * + * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter"> + * https://docs.python.org/dev/library/collections.html#collections.Counter</a> + */ +final class Counter { + + /** + * Hidden constructor. + */ + private Counter() { + super(); + } + + /** + * It counts how many times each element provided occurred in an array and + * returns a dict with the element as key and the count as value. + * + * @param tokens array of tokens + * @return dict, where the elements are key, and the count the value + */ + public static Map<CharSequence, Integer> of(CharSequence[] tokens) { + final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>(); + for (CharSequence token : tokens) { + if (innerCounter.containsKey(token)) { + int value = innerCounter.get(token); + innerCounter.put(token, ++value); + } else { + innerCounter.put(token, 1); + } + } + return innerCounter; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java new file mode 100644 index 0000000..5a6c5d3 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/RegexTokenizer.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * A simple word tokenizer that utilizes regex to find words. It applies a regex + * {@code}(\w)+{@code} over the input text to extract words from a given character + * sequence. + */ +class RegexTokenizer implements Tokenizer<CharSequence> { + + /** + * {@inheritDoc} + * + * @throws IllegalArgumentException if the input text is blank + */ + @Override + public CharSequence[] tokenize(CharSequence text) { + if (text == null || text.toString().trim().equals("")) { + throw new IllegalArgumentException("Invalid text"); + } + Pattern pattern = Pattern.compile("(\\w)+"); + Matcher matcher = pattern.matcher(text.toString()); + List<String> tokens = new ArrayList<String>(); + while (matcher.find()) { + tokens.add(matcher.group(0)); + } + return tokens.toArray(new String[0]); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/Tokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java new file mode 100644 index 0000000..0a69d24 --- /dev/null +++ 
b/src/main/java/org/apache/commons/text/similarity/Tokenizer.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * A tokenizer. Can produce arrays of tokens from a given type. + * + * @param <T> given type + */ +interface Tokenizer<T> { + + /** + * Returns an array of tokens. + * + * @param text input text + * @return array of tokens + */ + T[] tokenize(CharSequence text); + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/Counter.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java b/src/main/java/org/apache/commons/text/similarity/internal/Counter.java deleted file mode 100644 index c0dd2e6..0000000 --- a/src/main/java/org/apache/commons/text/similarity/internal/Counter.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity.internal; - -import java.util.HashMap; -import java.util.Map; - -/** - * Java implementation of Python's collections Counter module. - * - * <p>It counts how many times each element provided occurred in an array and - * returns a dict with the element as key and the count as value.</p> - * - * @see <a href="https://docs.python.org/dev/library/collections.html#collections.Counter"> - * https://docs.python.org/dev/library/collections.html#collections.Counter</a> - */ -public final class Counter { - - /** - * Hidden constructor. - */ - private Counter() { - super(); - } - - /** - * It counts how many times each element provided occurred in an array and - * returns a dict with the element as key and the count as value. 
- * - * @param tokens array of tokens - * @return dict, where the elements are key, and the count the value - */ - public static Map<CharSequence, Integer> of(CharSequence[] tokens) { - final Map<CharSequence, Integer> innerCounter = new HashMap<CharSequence, Integer>(); - for (CharSequence token : tokens) { - if (innerCounter.containsKey(token)) { - int value = innerCounter.get(token); - innerCounter.put(token, ++value); - } else { - innerCounter.put(token, 1); - } - } - return innerCounter; - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java deleted file mode 100644 index 082ac05..0000000 --- a/src/main/java/org/apache/commons/text/similarity/internal/RegexTokenizer.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.text.similarity.internal; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * A simple word tokenizer that utilizes regex to find words. It applies a regex - * {@code}(\w)+{@code} over the input text to extract words from a given character - * sequence. - */ -public class RegexTokenizer implements Tokenizer<CharSequence> { - - /** - * {@inheritDoc} - * - * @throws IllegalArgumentException if the input text is blank - */ - @Override - public CharSequence[] tokenize(CharSequence text) { - if (text == null || text.toString().trim().equals("")) { - throw new IllegalArgumentException("Invalid text"); - } - Pattern pattern = Pattern.compile("(\\w)+"); - Matcher matcher = pattern.matcher(text.toString()); - List<String> tokens = new ArrayList<String>(); - while (matcher.find()) { - tokens.add(matcher.group(0)); - } - return tokens.toArray(new String[0]); - } - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java b/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java deleted file mode 100644 index 9dc63e4..0000000 --- a/src/main/java/org/apache/commons/text/similarity/internal/Tokenizer.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity.internal; - -/** - * A tokenizer. Can produce arrays of tokens from a given type. - * - * @param <T> given type - */ -public interface Tokenizer<T> { - - /** - * Returns an array of tokens. - * - * @param text input text - * @return array of tokens - */ - T[] tokenize(CharSequence text); - -} http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/internal/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java b/src/main/java/org/apache/commons/text/similarity/internal/package-info.java deleted file mode 100644 index 548e2b7..0000000 --- a/src/main/java/org/apache/commons/text/similarity/internal/package-info.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * Classes used internally by similarity algorithms. Internal use only, backward compatibility - * not guaranteed. - * - * @since 0.1 - */ -package org.apache.commons.text.similarity.internal; http://git-wip-us.apache.org/repos/asf/commons-text/blob/df681238/src/main/java/org/apache/commons/text/similarity/package-info.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/package-info.java b/src/main/java/org/apache/commons/text/similarity/package-info.java index bd1e400..d2a19a6 100644 --- a/src/main/java/org/apache/commons/text/similarity/package-info.java +++ b/src/main/java/org/apache/commons/text/similarity/package-info.java @@ -33,7 +33,7 @@ * </ul> * * <p>The {@link org.apache.commons.text.similarity.CosineDistance Cosine Distance} - * utilises a {@link org.apache.commons.text.similarity.internal.RegexTokenizer regular expression tokenizer (\w+)}. + * utilises a {@link org.apache.commons.text.similarity.RegexTokenizer regular expression tokenizer (\w+)}. * And the {@link org.apache.commons.text.similarity.LevenshteinDistance Levenshtein Distance}'s * behaviour can be changed to take into consideration a maximum throughput.</p> *