Repository: beam Updated Branches: refs/heads/master c5cf90c70 -> 9ff22a4ef
[BEAM-2060] Add charset support in XmlSink Project: http://git-wip-us.apache.org/repos/asf/beam/repo Commit: http://git-wip-us.apache.org/repos/asf/beam/commit/43647471 Tree: http://git-wip-us.apache.org/repos/asf/beam/tree/43647471 Diff: http://git-wip-us.apache.org/repos/asf/beam/diff/43647471 Branch: refs/heads/master Commit: 43647471f9f86b168c30958ef83a0947b5d4eb56 Parents: c5cf90c Author: Jean-Baptiste Onofré <jbono...@apache.org> Authored: Wed Apr 26 16:09:54 2017 +0200 Committer: Luke Cwik <lc...@google.com> Committed: Wed Apr 26 10:54:58 2017 -0700 ---------------------------------------------------------------------- .../java/org/apache/beam/sdk/io/xml/XmlIO.java | 27 +++++++++++++++-- .../org/apache/beam/sdk/io/xml/XmlSink.java | 2 +- .../org/apache/beam/sdk/io/xml/XmlSinkTest.java | 31 +++++++++++++++++--- 3 files changed, 53 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java ---------------------------------------------------------------------- diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java index f8a9edc..ce36abe 100644 --- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java +++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlIO.java @@ -211,10 +211,20 @@ public class XmlIO { * ... * </words> * }</pre> + * + * <p>By default the UTF-8 charset is used. This can be overridden, for example: + * + * <pre>{@code + * p.apply(XmlIO.<Type>write() + * .withRecordClass(Type.class) + * .withRootElement(root_element) + * .withCharset(StandardCharsets.ISO_8859_1) + * .toFilenamePrefix(output_filename)); + * }</pre> */ // CHECKSTYLE.ON: JavadocStyle public static <T> Write<T> write() { - return new AutoValue_XmlIO_Write.Builder<T>().build(); + return new AutoValue_XmlIO_Write.Builder<T>().setCharset("UTF-8").build(); } /** Implementation of {@link #read}. */ @@ -432,6 +442,9 @@ public class XmlIO { @Nullable abstract String getRootElement(); + @Nullable + abstract String getCharset(); + abstract Builder<T> toBuilder(); @AutoValue.Builder @@ -442,6 +455,8 @@ public class XmlIO { abstract Builder<T> setRootElement(String rootElement); + abstract Builder<T> setCharset(String charset); + abstract Write<T> build(); } @@ -469,11 +484,17 @@ public class XmlIO { return toBuilder().setRootElement(rootElement).build(); } + /** Sets the charset used to write the file. */ + public Write<T> withCharset(Charset charset) { + return toBuilder().setCharset(charset.name()).build(); + } + @Override public void validate(PCollection<T> input) { checkNotNull(getRecordClass(), "Missing a class to bind to a JAXB context."); checkNotNull(getRootElement(), "Missing a root element name."); checkNotNull(getFilenamePrefix(), "Missing a filename to write to."); + checkNotNull(getCharset(), "Missing charset"); try { JAXBContext.newInstance(getRecordClass()); } catch (JAXBException e) { @@ -498,7 +519,9 @@ public class XmlIO { .addIfNotNull( DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element")) .addIfNotNull( - DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class")); + DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class")) + .addIfNotNull( + DisplayData.item("charset", getCharset()).withLabel("Charset")); } } } http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java ---------------------------------------------------------------------- diff --git a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java index 2e7dba1..a1ebf6c 100644 --- a/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java +++ b/sdks/java/io/xml/src/main/java/org/apache/beam/sdk/io/xml/XmlSink.java @@ -85,7 +85,7 @@ class XmlSink<T> extends FileBasedSink<T> { marshaller = context.createMarshaller(); marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE); marshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE); - marshaller.setProperty(Marshaller.JAXB_ENCODING, "UTF-8"); + marshaller.setProperty(Marshaller.JAXB_ENCODING, getSink().spec.getCharset()); return new XmlWriter<>(this, marshaller); } http://git-wip-us.apache.org/repos/asf/beam/blob/43647471/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java ---------------------------------------------------------------------- diff --git a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java index a6e1b87..bf15cfe 100644 --- a/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java +++ b/sdks/java/io/xml/src/test/java/org/apache/beam/sdk/io/xml/XmlSinkTest.java @@ -26,9 +26,11 @@ import static org.junit.Assert.assertNotNull; import com.google.common.collect.Lists; import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.FileReader; +import java.io.InputStreamReader; import java.nio.channels.WritableByteChannel; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; @@ -82,7 +84,26 @@ public class XmlSinkTest { List<String> lines = Arrays.asList("<birds>", "<bird>", "<species>robin</species>", "<adjective>bemused</adjective>", "</bird>", "<bird>", "<species>goose</species>", "<adjective>evasive</adjective>", "</bird>", "</birds>"); - runTestWrite(writer, bundle, lines); + runTestWrite(writer, bundle, lines, StandardCharsets.UTF_8.name()); + } + + @Test + public void testXmlWriterCharset() throws Exception { + PipelineOptions options = PipelineOptionsFactory.create(); + XmlWriteOperation<Bird> writeOp = + XmlIO.<Bird>write() + .toFilenamePrefix(testFilePrefix) + .withRecordClass(Bird.class) + .withRootElement("birds") + .withCharset(StandardCharsets.ISO_8859_1) + .createSink() + .createWriteOperation(options); + XmlWriter<Bird> writer = writeOp.createWriter(options); + + List<Bird> bundle = Lists.newArrayList(new Bird("bréche", "pinçon")); + List<String> lines = Arrays.asList("<birds>", "<bird>", "<species>pinçon</species>", + "<adjective>bréche</adjective>", "</bird>", "</birds>"); + runTestWrite(writer, bundle, lines, StandardCharsets.ISO_8859_1.name()); } /** @@ -181,14 +202,16 @@ public class XmlSinkTest { /** * Write a bundle with an XmlWriter and verify the output is expected. */ - private <T> void runTestWrite(XmlWriter<T> writer, List<T> bundle, List<String> expected) + private <T> void runTestWrite(XmlWriter<T> writer, List<T> bundle, List<String> expected, + String charset) throws Exception { File tmpFile = tmpFolder.newFile("foo.txt"); try (FileOutputStream fileOutputStream = new FileOutputStream(tmpFile)) { writeBundle(writer, bundle, fileOutputStream.getChannel()); } List<String> lines = new ArrayList<>(); - try (BufferedReader reader = new BufferedReader(new FileReader(tmpFile))) { + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(new FileInputStream(tmpFile), charset))) { for (;;) { String line = reader.readLine(); if (line == null) {