fhueske commented on a change in pull request #6823: [FLINK-10134] UTF-16 support for TextInputFormat bug refixed URL: https://github.com/apache/flink/pull/6823#discussion_r225577009
########## File path: flink-java/src/test/java/org/apache/flink/api/java/io/TextInputFormatTest.java ########## @@ -207,12 +207,212 @@ private void testRemovingTrailingCR(String lineBreaker, String delimiter) { assertEquals(content, result); } + } catch (Throwable t) { + System.err.println("test failed with exception: " + t.getMessage()); + t.printStackTrace(System.err); + fail("Test erroneous"); } - catch (Throwable t) { + } + + /** + * Test different file encodings,for example: UTF-8, UTF-8 with bom, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE. + */ + @Test + public void testFileCharset() { + String first = "First line"; + + // Test special different languages + for (final String data : new String[]{"Hello", "ハロー", "привет", "Bonjour", "Сайн байна уу", "안녕하세요."}) { + testAllFileCharsetNoDelimiter(data); + } + + // Test special symbol + for (final String delimiterStr : new String[]{"\\", "^", "|", "[", ".", "*"}) { + first = "Fir" + delimiterStr + "st li" + delimiterStr + "ne"; + testAllFileCharsetWithDelimiter(first, delimiterStr); + } + } + + private void testAllFileCharsetNoDelimiter(String first) { + testAllFileCharsetWithDelimiter(first, ""); + } + + private void testAllFileCharsetWithDelimiter(String first, String delimiter) { + try { + final byte[] noBom = new byte[]{}; + final byte[] utf8Bom = new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}; + final byte[] utf16LEBom = new byte[]{(byte) 0xFF, (byte) 0xFE}; + final byte[] utf16BEBom = new byte[]{(byte) 0xFE, (byte) 0xFF}; + final byte[] utf32LEBom = new byte[]{(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00}; + final byte[] utf32BEBom = new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF}; + + // test UTF-8 have bom + testFileCharset(first, "UTF-8", "UTF-32", 1, utf8Bom, delimiter.getBytes("UTF-8"));

Review comment: We should have the following tests (fileCharset, hasBom, specifiedCharset):

* UTF-8, no, UTF-8
* UTF-8, yes, UTF-8
* UTF-16BE, no, UTF-16
* UTF-16BE, yes, UTF-16
* UTF-16LE, yes, UTF-16
* UTF-16LE, no, UTF-16LE
* UTF-16BE, no, UTF-16BE
* UTF-16BE, yes, UTF-16LE
* UTF-16LE, yes, UTF-16BE
* UTF-32BE, no, UTF-32
* UTF-32BE, yes, UTF-32
* UTF-32LE, yes, UTF-32
* UTF-32LE, no, UTF-32LE
* UTF-32BE, no, UTF-32BE
* UTF-32BE, yes, UTF-32LE
* UTF-32LE, yes, UTF-32BE

----------------------------------------------------------------
This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org

With regards,
Apache Git Services