[ https://issues.apache.org/jira/browse/BEAM-3429?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Matt Darwin updated BEAM-3429: ------------------------------ Description: I think I have found a bug in TextIO, in the way it handles URIs. I am told that the TextIO.Write.to(String) method should take a URI string, but this doesn't seem to work for me. Test case inline: {code:java} import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.IOException; import java.net.URI; import java.nio.file.*; import java.util.*; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.values.PCollection; import org.junit.Rule; import org.junit.Test; public class Beam3429Test { @Rule public final transient TestPipeline pipeline = TestPipeline.create(); @Test public void testBeamTextIO() throws IOException { List<String> words = Arrays.asList("tom", "huck", "polly"); Create.Values<String> source = Create.of(words); PCollection<String> coll = this.pipeline.apply(source); String fileName = "test" + System.currentTimeMillis() + ".txt"; String fileURI = new File(fileName).toPath().toUri().toString(); // ie file:///full/unix/path/to/test.file coll.apply(TextIO.write().to(fileURI)); //test passes if we use the line below instead of the above // coll.apply(TextIO.write().to(fileName)); pipeline.run(); //read all sharded files: Path dir = Paths.get(URI.create(fileURI)).getParent(); List<Path> files = new ArrayList<>(); Files.newDirectoryStream(dir, new DirectoryStream.Filter<Path>() { @Override public boolean accept(Path entry) throws IOException { return entry.toString().contains(fileName); } }).forEach(path -> files.add(path)); assertFalse("no files produced!", files.isEmpty()); List<String> fileContents = new ArrayList<>(); for (Path f : files) { fileContents.addAll(Files.readAllLines(f)); } assertTrue(fileContents.contains("tom")); assertTrue(fileContents.contains("polly")); } } {code} was: I think I have found a bug in TextIO, in the way it handles URIs. I am told that the TextIO.Write.to(String) method should take a URI string, but this doesn't seem to work for me. Test case inline: {code:java} @Test public void testBeamTextIO() throws IOException { List<String> words = Arrays.asList("tom", "huck", "polly"); Create.Values<String> source = Create.of(words); PCollection<String> coll = this.pipeline.apply(source); String fileName = "test" + System.currentTimeMillis() + ".txt"; String fileURI = new File(fileName).toPath().toUri().toString(); // ie file:///full/unix/path/to/test.file coll.apply(TextIO.write().to(fileURI)); //test passes if we use the line below instead of the above // coll.apply(TextIO.write().to(fileName)); pipeline.run(); //read all sharded files: Path dir = Paths.get(URI.create(fileURI)).getParent(); List<Path> files = new ArrayList<>(); Files.newDirectoryStream(dir, new DirectoryStream.Filter<Path>() { @Override public boolean accept(Path entry) throws IOException { return entry.toString().contains(fileName); } }).forEach(path -> files.add(path)); assertFalse("no files produced!", files.isEmpty()); List<String> fileContents = new ArrayList<>(); for (Path f : files) { fileContents.addAll(Files.readAllLines(f)); } assertTrue(fileContents.contains("tom")); assertTrue(fileContents.contains("polly")); } {code} > TextIO.Write not handling URI properly > -------------------------------------- > > Key: BEAM-3429 > URL: https://issues.apache.org/jira/browse/BEAM-3429 > Project: Beam > Issue Type: Bug > Components: beam-model > Affects Versions: 2.2.0 > Environment: Mac OS X > Reporter: Matt Darwin > Assignee: Kenneth Knowles > > I think I have found a bug in TextIO, in the way it handles URIs. I am told > that the TextIO.Write.to(String) method should take a URI string, but this > doesn't seem to work for me. > Test case inline: > {code:java} > import static org.junit.Assert.assertFalse; > import static org.junit.Assert.assertTrue; > import java.io.File; > import java.io.IOException; > import java.net.URI; > import java.nio.file.*; > import java.util.*; > import org.apache.beam.sdk.io.TextIO; > import org.apache.beam.sdk.testing.TestPipeline; > import org.apache.beam.sdk.transforms.Create; > import org.apache.beam.sdk.values.PCollection; > import org.junit.Rule; > import org.junit.Test; > public class Beam3429Test { > @Rule > public final transient TestPipeline pipeline = TestPipeline.create(); > @Test > public void testBeamTextIO() throws IOException { > List<String> words = Arrays.asList("tom", "huck", "polly"); > Create.Values<String> source = Create.of(words); > PCollection<String> coll = this.pipeline.apply(source); > String fileName = "test" + System.currentTimeMillis() + ".txt"; > String fileURI = new File(fileName).toPath().toUri().toString(); > // ie file:///full/unix/path/to/test.file > coll.apply(TextIO.write().to(fileURI)); > //test passes if we use the line below instead of the above > // coll.apply(TextIO.write().to(fileName)); > pipeline.run(); > //read all sharded files: > Path dir = Paths.get(URI.create(fileURI)).getParent(); > List<Path> files = new ArrayList<>(); > Files.newDirectoryStream(dir, new DirectoryStream.Filter<Path>() { > @Override public boolean accept(Path entry) throws IOException { > return entry.toString().contains(fileName); > } > }).forEach(path -> files.add(path)); > assertFalse("no files produced!", files.isEmpty()); > List<String> fileContents = new ArrayList<>(); > for (Path f : files) { > fileContents.addAll(Files.readAllLines(f)); > } > assertTrue(fileContents.contains("tom")); > assertTrue(fileContents.contains("polly")); > } > } > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029)