[ 
https://issues.apache.org/jira/browse/BEAM-3429?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Matt Darwin updated BEAM-3429:
------------------------------
    Description: 
I think I have found a bug in TextIO, in the way it handles URIs.  I am told 
that the TextIO.Write.to(String) method should take a URI string, but this 
doesn't seem to work for me.

Test case inline:


{code:java}


import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.*;
import java.util.*;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;

/**
 * Reproduction case for BEAM-3429: passing a {@code file:} URI string to
 * {@code TextIO.write().to(String)} instead of a plain file name.
 */
public class Beam3429Test {

    @Rule
    public final transient TestPipeline pipeline = TestPipeline.create();

    /**
     * Writes three words through {@code TextIO.write()} using a {@code file:} URI as the
     * output location, then scans the URI's parent directory for entries whose path contains
     * the generated file name and checks that they hold the written words.
     *
     * @throws IOException if the output directory cannot be listed or an output file cannot
     *     be read
     */
    @Test
    public void testBeamTextIO() throws IOException {
        List<String> words = Arrays.asList("tom", "huck", "polly");
        Create.Values<String> source = Create.of(words);
        PCollection<String> coll = this.pipeline.apply(source);
        // Timestamp suffix keeps the output name unique across runs in the working directory.
        String fileName = "test" + System.currentTimeMillis() + ".txt";
        String fileURI = new File(fileName).toPath().toUri().toString();
        // i.e. file:///full/unix/path/to/test.file

        coll.apply(TextIO.write().to(fileURI));
        //test passes if we use the line below instead of the above
//        coll.apply(TextIO.write().to(fileName));
        pipeline.run();

        //read all sharded files:
        Path dir = Paths.get(URI.create(fileURI)).getParent();

        // Match on the full path string so any suffix added to the file name is still picked up.
        DirectoryStream.Filter<Path> matchesOutput =
                entry -> entry.toString().contains(fileName);

        List<Path> files = new ArrayList<>();
        // A DirectoryStream holds an open directory handle; close it once listing is done.
        try (DirectoryStream<Path> outputs = Files.newDirectoryStream(dir, matchesOutput)) {
            for (Path output : outputs) {
                files.add(output);
            }
        }

        assertFalse("no files produced!", files.isEmpty());
        List<String> fileContents = new ArrayList<>();
        for (Path f : files) {
            fileContents.addAll(Files.readAllLines(f));
        }
        assertTrue(fileContents.contains("tom"));
        assertTrue(fileContents.contains("polly"));

    }
}
{code}




  was:
I think I have found a bug in TextIO, in the way it handles URIs.  I am told 
that the TextIO.Write.to(String) method should take a URI string, but this 
doesn't seem to work for me.

Test case inline:


{code:java}
    /**
     * Writes three words through {@code TextIO.write()} using a {@code file:} URI as the
     * output location, then scans the URI's parent directory for entries whose path contains
     * the generated file name and checks that they hold the written words.
     *
     * @throws IOException if the output directory cannot be listed or an output file cannot
     *     be read
     */
    @Test
    public void testBeamTextIO() throws IOException {
        List<String> words = Arrays.asList("tom", "huck", "polly");
        Create.Values<String> source = Create.of(words);
        PCollection<String> coll = this.pipeline.apply(source);
        // Timestamp suffix keeps the output name unique across runs in the working directory.
        String fileName = "test" + System.currentTimeMillis() + ".txt";
        String fileURI = new File(fileName).toPath().toUri().toString();
        // i.e. file:///full/unix/path/to/test.file

        coll.apply(TextIO.write().to(fileURI));
        //test passes if we use the line below instead of the above
//        coll.apply(TextIO.write().to(fileName));
        pipeline.run();

        //read all sharded files:
        Path dir = Paths.get(URI.create(fileURI)).getParent();

        List<Path> files = new ArrayList<>();
        // A DirectoryStream holds an open directory handle; close it once listing is done.
        try (DirectoryStream<Path> listing = Files.newDirectoryStream(dir,
                new DirectoryStream.Filter<Path>() {
                    @Override public boolean accept(Path entry) throws IOException {
                        // Match on the full path string so suffixed output names still match.
                        return entry.toString().contains(fileName);
                    }
                })) {
            for (Path found : listing) {
                files.add(found);
            }
        }

        assertFalse("no files produced!", files.isEmpty());
        List<String> fileContents = new ArrayList<>();
        for (Path f : files) {
            fileContents.addAll(Files.readAllLines(f));
        }
        assertTrue(fileContents.contains("tom"));
        assertTrue(fileContents.contains("polly"));

    }
{code}





> TextIO.Write not handling URI properly
> --------------------------------------
>
>                 Key: BEAM-3429
>                 URL: https://issues.apache.org/jira/browse/BEAM-3429
>             Project: Beam
>          Issue Type: Bug
>          Components: beam-model
>    Affects Versions: 2.2.0
>         Environment: Mac OS X
>            Reporter: Matt Darwin
>            Assignee: Kenneth Knowles
>
> I think I have found a bug in TextIO, in the way it handles URIs.  I am told 
> that the TextIO.Write.to(String) method should take a URI string, but this 
> doesn't seem to work for me.
> Test case inline:
> {code:java}
> import static org.junit.Assert.assertFalse;
> import static org.junit.Assert.assertTrue;
> import java.io.File;
> import java.io.IOException;
> import java.net.URI;
> import java.nio.file.*;
> import java.util.*;
> import org.apache.beam.sdk.io.TextIO;
> import org.apache.beam.sdk.testing.TestPipeline;
> import org.apache.beam.sdk.transforms.Create;
> import org.apache.beam.sdk.values.PCollection;
> import org.junit.Rule;
> import org.junit.Test;
> public class Beam3429Test {
>     @Rule
>     public final transient TestPipeline pipeline = TestPipeline.create();
>     @Test
>     public void testBeamTextIO() throws IOException {
>         List<String> words = Arrays.asList("tom", "huck", "polly");
>         Create.Values<String> source = Create.of(words);
>         PCollection<String> coll = this.pipeline.apply(source);
>         String fileName = "test" + System.currentTimeMillis() + ".txt";
>         String fileURI = new File(fileName).toPath().toUri().toString();
>         // ie file:///full/unix/path/to/test.file
>         coll.apply(TextIO.write().to(fileURI));
>         //test passes if we use the line below instead of the above
> //        coll.apply(TextIO.write().to(fileName));
>         pipeline.run();
>         //read all sharded files:
>         Path dir = Paths.get(URI.create(fileURI)).getParent();
>         List<Path> files = new ArrayList<>();
>         Files.newDirectoryStream(dir, new DirectoryStream.Filter<Path>() {
>             @Override public boolean accept(Path entry) throws IOException {
>                 return entry.toString().contains(fileName);
>             }
>         }).forEach(path -> files.add(path));
>         assertFalse("no files produced!", files.isEmpty());
>         List<String> fileContents = new ArrayList<>();
>         for (Path f : files) {
>             fileContents.addAll(Files.readAllLines(f));
>         }
>         assertTrue(fileContents.contains("tom"));
>         assertTrue(fileContents.contains("polly"));
>     }
> }
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to