[text] TEXT-19 Add alphabet converter

kinow Sun, 02 Oct 2016 00:23:18 -0700

Repository: commons-text
Updated Branches:
  refs/heads/master 07f4dd9a8 -> c05e0251a



TEXT-19 Add alphabet converter

closes #5


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/c05e0251
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/c05e0251
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/c05e0251

Branch: refs/heads/master
Commit: c05e0251a0178f8544696e3d1451826b4d66cc40
Parents: 07f4dd9
Author: eallweil <eallw...@paypal.com>
Authored: Mon Sep 19 09:51:00 2016 +0300
Committer: Bruno P. Kinoshita <brunodepau...@yahoo.com.br>
Committed: Sun Oct 2 20:14:13 2016 +1300

----------------------------------------------------------------------
 .../apache/commons/text/AlphabetConverter.java  | 436 +++++++++++++++++++
 .../commons/text/AlphabetConverterTest.java     | 204 +++++++++
 2 files changed, 640 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/c05e0251/src/main/java/org/apache/commons/text/AlphabetConverter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/AlphabetConverter.java 
b/src/main/java/org/apache/commons/text/AlphabetConverter.java
new file mode 100644
index 0000000..5fc3528
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/AlphabetConverter.java
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * <p>
+ * Convert from one alphabet to another, with the possibility of leaving 
certain characters unencoded.
+ * </p>
+ *
+ * <p>
+ * The target and 'do not encode' languages must be in the Unicode BMP, but the 
source language need not be.
+ * </p>
+ *
+ * <p>
+ * The encodings will all be of a fixed length, except for the 'do not encode' 
chars, which will be of length 1.
+ * </p>
+ *
+ * <h3>Sample usage</h3>
+ *
+ * <pre>
+ * Character[] originals; // a, b, c, d
+ * Character[] encoding; // 0, 1, d
+ * Character[] doNotEncode; // d
+ *
+ * AlphabetConverter ac = 
AlphabetConverter.createConverterFromChars(originals, encoding, doNotEncode);
+ *
+ * ac.encode("a"); // 00
+ * ac.encode("b"); // 01
+ * ac.encode("c"); // 0d
+ * ac.encode("d"); // d
+ * ac.encode("abcd"); // 00010dd
+ * </pre>
+ *
+ * <p>
+ * #ThreadSafe# AlphabetConverter class methods are threadsafe as they do not 
change internal state.
+ * </p>
+ *
+ * @since 0.1
+ */
+public class AlphabetConverter {
+
+    private final Map<Integer, String> originalToEncoded;
+    private final Map<String, String> encodedToOriginal;
+
+    private final int encodedLetterLength;
+
+    private static final String ARROW = " -> ";
+    private static final String LINE_SEPARATOR = 
System.getProperty("line.separator");
+
+    /**
+     * Hidden constructor for alphabet converter. Used by static helper 
methods.
+     *
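+     * @param originalToEncoded map from each original code point to its encoded string
+     * @param encodedToOriginal map from each encoded string back to the original letter
+     * @param doNotEncodeMap map of the 'do not encode' code points (accepted but not stored by this constructor)
+     * @param encodedLetterLength length of the encoded letter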
+     */
+    private AlphabetConverter(Map<Integer, String> originalToEncoded, 
Map<String, String> encodedToOriginal,
+            Map<Integer, String> doNotEncodeMap, int encodedLetterLength) {
+
+        this.originalToEncoded = originalToEncoded;
+        this.encodedToOriginal = encodedToOriginal;
+        this.encodedLetterLength = encodedLetterLength;
+    }
+
+    /**
+     * Encode a given string.
+     *
+     * @param original the string to be encoded
+     * @return the encoded string, {@code null} if the given string is null
+     * @throws UnsupportedEncodingException if chars that are not supported 
are encountered
+     */
+    public String encode(String original) throws UnsupportedEncodingException {
+        if (original == null) {
+            return null;
+        }
+
+        StringBuilder sb = new StringBuilder();
+
+        for (int i = 0; i < original.length();) {
+            int codepoint = original.codePointAt(i);
+
+            String nextLetter = originalToEncoded.get(codepoint);
+
+            if (nextLetter == null) {
+                throw new UnsupportedEncodingException(
+                        "Couldn't find encoding for '" + 
codePointToString(codepoint) + "' in " + original);
+            }
+
+            sb.append(nextLetter);
+
+            i += Character.charCount(codepoint);
+        }
+
+        return sb.toString();
+    }
+
+    /**
+     * Decode a given string.
+     *
+     * @param encoded a string that has been encoded using this 
AlphabetConverter
+     * @return the decoded string, {@code null} if the given string is null
+     * @throws UnsupportedEncodingException if unexpected characters that 
cannot be handled are encountered
+     */
+    public String decode(String encoded) throws UnsupportedEncodingException {
+        if (encoded == null) {
+            return null;
+        }
+
+        StringBuilder result = new StringBuilder();
+
+        for (int j = 0; j < encoded.length();) {
+            Integer i = encoded.codePointAt(j);
+            String s = codePointToString(i);
+
+            if (s.equals(originalToEncoded.get(i))) {
+                result.append(s);
+                j++; // because we do not encode in Unicode extended the 
length of each encoded char is 1
+            } else {
+                if (j + encodedLetterLength > encoded.length()) {
+                    throw new UnsupportedEncodingException("Unexpected end of 
string while decoding " + encoded);
+                } else {
+                    String nextGroup = encoded.substring(j, j + 
encodedLetterLength);
+                    String next = encodedToOriginal.get(nextGroup);
+                    if (next == null) {
+                        throw new UnsupportedEncodingException(
+                                "Unexpected string without decoding (" + 
nextGroup + ") in " + encoded);
+                    } else {
+                        result.append(next);
+                        j += encodedLetterLength;
+                    }
+                }
+            }
+        }
+
+        return result.toString();
+    }
+
+    /**
+     * Get the length of characters in the encoded alphabet that are necessary 
for each character in the original
+     * alphabet.
+     *
+     * @return the length of the encoded char
+     */
+    public int getEncodedCharLength() {
+        return encodedLetterLength;
+    }
+
+    /**
+     * Get the mapping from each integer code point of the source language to its encoded 
string. Use it to reconstruct a converter from a
+     * serialized map.
+     *
+     * @return the original map
+     */
+    public Map<Integer, String> getOriginalToEncoded() {
+        return Collections.unmodifiableMap(originalToEncoded);
+    }
+
+    /**
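+     * Recursive method used when creating encoder/decoder. Each level above 0 appends one encoding letter to
+     * {@code currentEncoding}; at level 0 the finished encoding is assigned to the next original letter, after
+     * any 'do not encode' letters reached first have been mapped to themselves.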
+     */
+    private void addSingleEncoding(int level, String currentEncoding, 
Collection<Integer> encoding,
+            Iterator<Integer> originals, Map<Integer, String> doNotEncodeMap) {
+
+        if (level > 0) {
+            for (int encodingLetter : encoding) {
+                if (originals.hasNext()) {
+
+                    // this skips the doNotEncode chars if they are in the
+                    // leftmost place
+                    if (level != encodedLetterLength || 
!doNotEncodeMap.containsKey(encodingLetter)) {
+                        addSingleEncoding(level - 1, currentEncoding + 
codePointToString(encodingLetter), encoding,
+                                originals, doNotEncodeMap);
+                    }
+                } else {
+                    return; // done encoding all the original alphabet
+                }
+            }
+        } else {
+            Integer next = originals.next();
+
+            while (doNotEncodeMap.containsKey(next)) {
+                String originalLetterAsString = codePointToString(next);
+
+                originalToEncoded.put(next, originalLetterAsString);
+                encodedToOriginal.put(originalLetterAsString, 
originalLetterAsString);
+
+                if (!originals.hasNext()) {
+                    return;
+                }
+
+                next = originals.next();
+            }
+
+            String originalLetterAsString = codePointToString(next);
+
+            originalToEncoded.put(next, currentEncoding);
+            encodedToOriginal.put(currentEncoding, originalLetterAsString);
+        }
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+
+        for (Entry<Integer, String> entry : originalToEncoded.entrySet()) {
+            
sb.append(codePointToString(entry.getKey())).append(ARROW).append(entry.getValue()).append(LINE_SEPARATOR);
+        }
+
+        return sb.toString();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (obj == null) {
+            return false;
+        }
+        if (obj == this) {
+            return true;
+        }
+        if (obj instanceof AlphabetConverter == false) {
+            return false;
+        }
+        final AlphabetConverter other = (AlphabetConverter) obj;
+        return originalToEncoded.equals(other.originalToEncoded) && 
encodedToOriginal.equals(other.encodedToOriginal)
+                && encodedLetterLength == other.encodedLetterLength;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(originalToEncoded, encodedToOriginal, 
encodedLetterLength);
+    }
+
+    // -- static methods
+
+    /**
+     * Create a new converter from a map.
+     *
+     * @param originalToEncoded a map returned from getOriginalToEncoded()
+     * @return the reconstructed AlphabetConverter
+     * @see AlphabetConverter#getOriginalToEncoded()
+     */
+    public static AlphabetConverter createConverterFromMap(Map<Integer, 
String> originalToEncoded) {
+        final Map<Integer, String> unmodifiableOriginalToEncoded = 
Collections.unmodifiableMap(originalToEncoded);
+        Map<String, String> encodedToOriginal = new LinkedHashMap<>();
+        Map<Integer, String> doNotEncodeMap = new HashMap<>();
+
+        int encodedLetterLength = 1;
+
+        for (Entry<Integer, String> e : 
unmodifiableOriginalToEncoded.entrySet()) {
+            String originalAsString = codePointToString(e.getKey());
+            encodedToOriginal.put(e.getValue(), originalAsString);
+
+            if (e.getValue().equals(originalAsString)) {
+                doNotEncodeMap.put(e.getKey(), e.getValue());
+            }
+
+            if (e.getValue().length() > encodedLetterLength) {
+                encodedLetterLength = e.getValue().length();
+            }
+        }
+
+        return new AlphabetConverter(unmodifiableOriginalToEncoded, 
encodedToOriginal, doNotEncodeMap,
+                encodedLetterLength);
+    }
+
+    /**
+     * Create an alphabet converter, for converting from the original 
alphabet, to the encoded alphabet, while leaving
+     * the characters in <em>doNotEncode</em> as they are (if possible).
+     * 
+     * Duplicate letters in either original or encoding will be ignored. 
+     *
+     * @param original an array of chars representing the original alphabet
+     * @param encoding an array of chars representing the alphabet to be used 
for encoding
+     * @param doNotEncode an array of chars to be encoded using the original 
alphabet - every char here must appear in
+     *            both the previous params
+     * @return the AlphabetConverter
+     * @throws IllegalArgumentException if an AlphabetConverter cannot be 
constructed
+     */
+    public static AlphabetConverter createConverterFromChars(Character[] 
original, Character[] encoding,
+            Character[] doNotEncode) {
+        return 
AlphabetConverter.createConverter(convertCharsToIntegers(original), 
convertCharsToIntegers(encoding),
+                convertCharsToIntegers(doNotEncode));
+    }
+
+    private static Integer[] convertCharsToIntegers(Character[] chars) {
+        if (chars == null || chars.length == 0) {
+            return new Integer[0];
+        }
+        Integer[] integers = new Integer[chars.length];
+        for (int i = 0; i < chars.length; i++) {
+            integers[i] = (int) chars[i];
+        }
+        return integers;
+    }
+
+    /**
+     * Create an alphabet converter, for converting from the original 
alphabet, to the encoded alphabet, while leaving
+     * the characters in <em>doNotEncode</em> as they are (if possible).
+     * 
+     * Duplicate letters in either original or encoding will be ignored.
+     *
+     * @param original an array of ints representing the original alphabet in 
codepoints
+     * @param encoding an array of ints representing the alphabet to be used 
for encoding, in codepoints
+     * @param doNotEncode an array of ints representing the chars to be 
encoded using the original alphabet - every char
+     *            here must appear in both the previous params
+     * @return the AlphabetConverter
+     * @throws IllegalArgumentException if an AlphabetConverter cannot be 
constructed
+     */
+    public static AlphabetConverter createConverter(Integer[] original, 
Integer[] encoding, Integer[] doNotEncode) {
+
+        Set<Integer> originalCopy = new LinkedHashSet<>(Arrays.<Integer> 
asList(original));
+        Set<Integer> encodingCopy = new LinkedHashSet<>(Arrays.<Integer> 
asList(encoding));
+        Set<Integer> doNotEncodeCopy = new LinkedHashSet<>(Arrays.<Integer> 
asList(doNotEncode));
+
+        final Map<Integer, String> originalToEncoded = new LinkedHashMap<>();
+        final Map<String, String> encodedToOriginal = new LinkedHashMap<>();
+        final Map<Integer, String> doNotEncodeMap = new HashMap<>();
+
+        int encodedLetterLength;
+
+        for (int i : doNotEncodeCopy) {
+            if (!originalCopy.contains(i)) {
+                throw new IllegalArgumentException(
+                        "Can not use 'do not encode' list because original 
alphabet does not contain '"
+                                + codePointToString(i) + "'");
+            }
+
+            if (!encodingCopy.contains(i)) {
+                throw new IllegalArgumentException(
+                        "Can not use 'do not encode' list because encoding 
alphabet does not contain '"
+                                + codePointToString(i) + "'");
+            }
+
+            doNotEncodeMap.put(i, codePointToString(i));
+        }
+
+        if (encodingCopy.size() >= originalCopy.size()) {
+            encodedLetterLength = 1;
+
+            Iterator<Integer> it = encodingCopy.iterator();
+
+            for (int originalLetter : originalCopy) {
+                String originalLetterAsString = 
codePointToString(originalLetter);
+
+                if (doNotEncodeMap.containsKey(originalLetter)) {
+                    originalToEncoded.put(originalLetter, 
originalLetterAsString);
+                    encodedToOriginal.put(originalLetterAsString, 
originalLetterAsString);
+                } else {
+                    Integer next = it.next();
+
+                    while (doNotEncodeCopy.contains(next)) {
+                        next = it.next();
+                    }
+
+                    String encodedLetter = codePointToString(next);
+
+                    originalToEncoded.put(originalLetter, encodedLetter);
+                    encodedToOriginal.put(encodedLetter, 
originalLetterAsString);
+                }
+            }
+
+            return new AlphabetConverter(originalToEncoded, encodedToOriginal, 
doNotEncodeMap, encodedLetterLength);
+
+        } else if (encodingCopy.size() - doNotEncodeCopy.size() < 2) {
+            throw new IllegalArgumentException(
+                    "Must have at least two encoding characters (not counting 
those in the 'do not encode' list), but has "
+                            + (encodingCopy.size() - doNotEncodeCopy.size()));
+        } else {
+            // we start with one which is our minimum, and because we do the
+            // first division outside the loop
+            int lettersSoFar = 1;
+
+            // the first division takes into account that the doNotEncode
+            // letters can't be in the leftmost place
+            int lettersLeft = (originalCopy.size() - doNotEncodeCopy.size())
+                    / (encodingCopy.size() - doNotEncodeCopy.size());
+
+            while (lettersLeft / encodingCopy.size() >= 1) {
+                lettersLeft = lettersLeft / encodingCopy.size();
+                lettersSoFar++;
+            }
+
+            encodedLetterLength = lettersSoFar + 1;
+
+            AlphabetConverter ac = new AlphabetConverter(originalToEncoded, 
encodedToOriginal, doNotEncodeMap,
+                    encodedLetterLength);
+
+            ac.addSingleEncoding(encodedLetterLength, "", encodingCopy, 
originalCopy.iterator(), doNotEncodeMap);
+
+            return ac;
+        }
+    }
+
+    /**
+     * Create new String that contains just the given code point.
+     *
+     * @param i code point
+     * @return a new string with the new code point
+     * @see "http://www.oracle.com/us/technologies/java/supplementary-142654.html"
+     */
+    private static String codePointToString(int i) {
+        if (Character.charCount(i) == 1) {
+            return String.valueOf((char) i);
+        } else {
+            return new String(Character.toChars(i));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/c05e0251/src/test/java/org/apache/commons/text/AlphabetConverterTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/AlphabetConverterTest.java 
b/src/test/java/org/apache/commons/text/AlphabetConverterTest.java
new file mode 100644
index 0000000..94c4a30
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/AlphabetConverterTest.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.AlphabetConverter}.
+ */
+public class AlphabetConverterTest {
+
+    private static Character[] lower_case_english = {' 
','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'};
+    private static Character[] english_and_numbers = 
{'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','
 ' };
+    private static Character[] lower_case_english_and_numbers = 
{'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','
 ' };
+    private static Character[] numbers = 
{'0','1','2','3','4','5','6','7','8','9'};
+    private static Character[] binary = {'0','1'};
+    private static Character[] hebrew = {'_', ' ', 
'\u05e7','\u05e8','\u05d0','\u05d8','\u05d5','\u05df','\u05dd','\u05e4','\u05e9','\u05d3','\u05d2','\u05db','\u05e2','\u05d9','\u05d7','\u05dc','\u05da','\u05e3','\u05d6','\u05e1','\u05d1','\u05d4','\u05e0','\u05de','\u05e6','\u05ea','\u05e5'};
+    private static Character[] empty = {};
+
+    private static Integer[] unicode = 
{32,35395,35397,36302,36291,35203,35201,35215,35219,35268,97,98,99,100,101,102,103,104,105,106,107,108,109,110,1001,1002,1003,1004,1005};
+    private static Integer[] lower_case_english_codepoints = 
{32,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122};
+    private static Integer[] doNotEncodePoints = {32,97,98,99}; // space, a, 
b, c
+    
+    @Rule
+    public ExpectedException thrown = ExpectedException.none();
+    
+    @Test
+    public void encodeFailureTest() throws UnsupportedEncodingException {
+        thrown.expect(UnsupportedEncodingException.class);
+        thrown.expectMessage("Couldn't find encoding for '3'");
+        test(binary, numbers, empty, "3");
+    }
+
+    @Test
+    public void binaryTest() throws UnsupportedEncodingException {
+        test(binary, numbers, empty, "0", "1", "10", "11");
+        test(numbers, binary, empty, "12345", "0");
+        test(lower_case_english, binary, empty, "abc", "a");
+    }
+
+    @Test
+    public void hebrewTest() throws UnsupportedEncodingException {
+        test(hebrew, binary, empty, "\u05d0", "\u05e2", 
"\u05d0\u05dc\u05e3_\u05d0\u05d5\u05d4\u05d1\u05dc_\u05d1\u05d9\u05ea_\u05d6\u05d4_\u05d1\u05d9\u05ea_\u05d2\u05d9\u05de\u05dc_\u05d6\u05d4_\u05db\u05de\u05dc_\u05d2\u05d3\u05d5\u05dc");
+        test(hebrew, numbers, empty, "\u05d0", "\u05e2", 
"\u05d0\u05dc\u05e3_\u05d0\u05d5\u05d4\u05d1\u05dc_\u05d1\u05d9\u05ea_\u05d6\u05d4_\u05d1\u05d9\u05ea_\u05d2\u05d9\u05de\u05dc_\u05d6\u05d4_\u05db\u05de\u05dc_\u05d2\u05d3\u05d5\u05dc");
+        test(numbers, hebrew, empty, "123456789", "1", "5");
+        test(lower_case_english, hebrew, empty, "this is a test");
+    }
+
+    @Test
+    public void doNotEncodeTest() throws UnsupportedEncodingException {
+        test(english_and_numbers, lower_case_english_and_numbers, 
lower_case_english, "1", "456", "abc", "ABC", "this will not be converted but 
THIS WILL");
+        test(english_and_numbers, lower_case_english_and_numbers, numbers, 
"1", "456", "abc", "ABC", "this will be converted but 12345 and this will be");
+    }
+
+    private AlphabetConverter createJavadocExample() {
+        Character[] original = {'a','b','c','d'};
+        Character[] encoding = {'0','1','d'};
+        Character[] doNotEncode = {'d'};
+        
+        return AlphabetConverter.createConverterFromChars(original, encoding, 
doNotEncode);
+    }
+    
+    /*
+     * Test example in javadocs for consistency
+     */
+    @Test
+    public void javadocExampleTest() throws UnsupportedEncodingException {
+        AlphabetConverter ac = createJavadocExample();
+        
+        Assert.assertEquals("00", ac.encode("a"));
+        Assert.assertEquals("01", ac.encode("b"));
+        Assert.assertEquals("0d", ac.encode("c"));
+        Assert.assertEquals("d", ac.encode("d"));
+        Assert.assertEquals("00010dd", ac.encode("abcd"));
+    }
+
+    @Test
+    public void unexpectedEndwhileDecodingTest() throws 
UnsupportedEncodingException {
+        String toDecode = "00d01d0";
+        
+        thrown.expect(UnsupportedEncodingException.class);
+        thrown.expectMessage("Unexpected end of string while decoding " + 
toDecode);
+
+        AlphabetConverter ac = createJavadocExample();
+        ac.decode(toDecode);
+    }
+
+    @Test
+    public void unexpectedStringWhileDecodingTest() throws 
UnsupportedEncodingException {
+        String toDecode = "00XX";
+        
+        thrown.expect(UnsupportedEncodingException.class);
+        thrown.expectMessage("Unexpected string without decoding (XX) in " + 
toDecode);
+
+        AlphabetConverter ac = createJavadocExample();
+        ac.decode(toDecode);
+    }
+
+    /*
+     * Test constructor from code points
+     */
+    @Test
+    public void unicodeTest() throws UnsupportedEncodingException {
+        AlphabetConverter ac = AlphabetConverter.createConverter(unicode, 
lower_case_english_codepoints, doNotEncodePoints);
+        
+        Assert.assertEquals(2, ac.getEncodedCharLength());
+        
+        String original = "\u8a43\u8a45 \u8dce ab \u8dc3 c \u8983";
+        String encoded = ac.encode(original);
+        String decoded = ac.decode(encoded);
+        
+        Assert.assertEquals("Encoded '" + original + "' into '" + encoded + 
"', but decoded into '" + decoded + "'", original, decoded);
+    }
+
+    @Test
+    public void noEncodingLettersTest() {
+        thrown.expect(IllegalArgumentException.class);
+        thrown.expectMessage("Must have at least two encoding characters (not 
counting those in the 'do not encode' list), but has 0");
+
+        AlphabetConverter.createConverterFromChars(english_and_numbers, 
numbers, numbers);
+    }
+
+    @Test
+    public void onlyOneEncodingLettersTest() {
+        thrown.expect(IllegalArgumentException.class);
+        thrown.expectMessage("Must have at least two encoding characters (not 
counting those in the 'do not encode' list), but has 1");
+
+        Character[] numbersPlusUnderscore = Arrays.copyOf(numbers, 
numbers.length + 1);
+        numbersPlusUnderscore[numbersPlusUnderscore.length -1] = '_';
+
+        AlphabetConverter.createConverterFromChars(english_and_numbers, 
numbersPlusUnderscore, numbers);
+    }
+
+    @Test
+    public void missingDoNotEncodeLettersFromEncodingTest() {
+        thrown.expect(IllegalArgumentException.class);
+        thrown.expectMessage("Can not use 'do not encode' list because 
encoding alphabet does not contain");
+
+        AlphabetConverter.createConverterFromChars(english_and_numbers, 
lower_case_english, numbers);
+    }
+
+    @Test
+    public void missingDoNotEncodeLettersFromOriginalTest() {
+        thrown.expect(IllegalArgumentException.class);
+        thrown.expectMessage("Can not use 'do not encode' list because 
original alphabet does not contain");
+
+        AlphabetConverter.createConverterFromChars(lower_case_english, 
english_and_numbers, numbers);
+    }
+
+    private void test(Character[] originalChars, Character[] encodingChars, 
Character[] doNotEncodeChars, String... strings) throws 
UnsupportedEncodingException {
+        
+        AlphabetConverter ac = 
AlphabetConverter.createConverterFromChars(originalChars, encodingChars, 
doNotEncodeChars);
+        
+        AlphabetConverter reconstructedAlphabetConverter = 
AlphabetConverter.createConverterFromMap(ac.getOriginalToEncoded());
+        
+        Assert.assertEquals(ac, reconstructedAlphabetConverter);
+        Assert.assertEquals(ac.hashCode(), 
reconstructedAlphabetConverter.hashCode());
+        Assert.assertEquals(ac.toString(), 
reconstructedAlphabetConverter.toString());
+        Assert.assertEquals(null, ac.encode(null)); // test null conversions
+        Assert.assertEquals("", ac.encode("")); // test empty conversion
+
+        // test all the trial strings
+        for (String s : strings) {
+            String encoded = ac.encode(s);
+
+            // test that only encoding chars are used
+            List<Character> originalEncodingChars = 
Arrays.asList(encodingChars);
+            for (int i = 0; i < encoded.length(); i++) {
+                
Assert.assertTrue(originalEncodingChars.contains(encoded.charAt(i)));
+            }
+
+            String decoded = ac.decode(encoded);
+
+            // test that only the original alphabet is used after decoding
+            List<Character> originalCharsList = Arrays.asList(originalChars);
+            for (int i = 0; i < decoded.length(); i++) {
+                
Assert.assertTrue(originalCharsList.contains(decoded.charAt(i)));
+            }
+            
+            Assert.assertEquals("Encoded '" + s + "' into '" + encoded + "', 
but decoded into '" + decoded + "'", s, decoded);
+        }
+    }
+}

[text] TEXT-19 Add alphabet converter

Reply via email to