[ 
https://issues.apache.org/jira/browse/MAPREDUCE-5050?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Albert Chu updated MAPREDUCE-5050:
----------------------------------
    Description: 
I'm trying to simulate running Hadoop on Lustre by configuring it to use the 
local file system using a single cloudera VM (cdh3u4).

I can generate the data just fine, but when running the sorting portion of the 
program, I get an error about not being able to find the _partition.lst file. 
It exists in the generated data directory.

Perusing the Terasort code, I see that the main method has a Path reference 
to partition.lst, which is created with the parent directory.

{noformat}
  public int run(String[] args) throws Exception {
       LOG.info("starting");
      JobConf job = (JobConf) getConf();
>>  Path inputDir = new Path(args[0]);
>>  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
>>  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
      URI partitionUri = new URI(partitionFile.toString() +
                               "#" + TeraInputFormat.PARTITION_FILENAME);
      TeraInputFormat.setInputPaths(job, new Path(args[0]));
      FileOutputFormat.setOutputPath(job, new Path(args[1]));
      job.setJobName("TeraSort");
      job.setJarByClass(TeraSort.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(Text.class);
      job.setInputFormat(TeraInputFormat.class);
      job.setOutputFormat(TeraOutputFormat.class);
      job.setPartitionerClass(TotalOrderPartitioner.class);
      TeraInputFormat.writePartitionFile(job, partitionFile);
      DistributedCache.addCacheFile(partitionUri, job);
      DistributedCache.createSymlink(job);
      job.setInt("dfs.replication", 1);
      TeraOutputFormat.setFinalSync(job, true);
      JobClient.runJob(job);
      LOG.info("done");
      return 0;
  }
{noformat}

But in the configure method, the Path isn't created with the parent directory 
reference.

{noformat}
    public void configure(JobConf job) {

      try {
        FileSystem fs = FileSystem.getLocal(job);
>>    Path partFile = new Path(TeraInputFormat.PARTITION_FILENAME);
        splitPoints = readPartitions(fs, partFile, job);
        trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
      } catch (IOException ie) {
        throw new IllegalArgumentException("can't read paritions file", ie);
      }

    }
{noformat}

I modified the code as follows, and now the sorting portion of the Terasort test 
works using the general file system. I think the above code is a bug.

{noformat}
    public void configure(JobConf job) {

      try {
        FileSystem fs = FileSystem.getLocal(job);

  >>  Path[] inputPaths = TeraInputFormat.getInputPaths(job);
  >>  Path partFile = new Path(inputPaths[0], 
TeraInputFormat.PARTITION_FILENAME);

        splitPoints = readPartitions(fs, partFile, job);
        trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
      } catch (IOException ie) {
        throw new IllegalArgumentException("can't read paritions file", ie);
      }

    }
{noformat}

  was:
I'm trying to simulate running Hadoop on Lustre by configuring it to use the 
local file system using a single cloudera VM (cdh3u4).

I can generate the data just fine, but when running the sorting portion of the 
program, I get an error about not being able to find the _partition.lst file. 
It exists in the generated data directory.

Perusing the Terasort code, I see that the main method has a Path reference 
to partition.lst, which is created with the parent directory.

  public int run(String[] args) throws Exception {
       LOG.info("starting");
      JobConf job = (JobConf) getConf();
>>  Path inputDir = new Path(args[0]);
>>  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
>>  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
      URI partitionUri = new URI(partitionFile.toString() +
                               "#" + TeraInputFormat.PARTITION_FILENAME);
      TeraInputFormat.setInputPaths(job, new Path(args[0]));
      FileOutputFormat.setOutputPath(job, new Path(args[1]));
      job.setJobName("TeraSort");
      job.setJarByClass(TeraSort.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(Text.class);
      job.setInputFormat(TeraInputFormat.class);
      job.setOutputFormat(TeraOutputFormat.class);
      job.setPartitionerClass(TotalOrderPartitioner.class);
      TeraInputFormat.writePartitionFile(job, partitionFile);
      DistributedCache.addCacheFile(partitionUri, job);
      DistributedCache.createSymlink(job);
      job.setInt("dfs.replication", 1);
      TeraOutputFormat.setFinalSync(job, true);
      JobClient.runJob(job);
      LOG.info("done");
      return 0;
  }

But in the configure method, the Path isn't created with the parent directory 
reference.

    public void configure(JobConf job) {

      try {
        FileSystem fs = FileSystem.getLocal(job);
>>    Path partFile = new Path(TeraInputFormat.PARTITION_FILENAME);
        splitPoints = readPartitions(fs, partFile, job);
        trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
      } catch (IOException ie) {
        throw new IllegalArgumentException("can't read paritions file", ie);
      }

    }

I modified the code as follows, and now the sorting portion of the Terasort test 
works using the general file system. I think the above code is a bug.

    public void configure(JobConf job) {

      try {
        FileSystem fs = FileSystem.getLocal(job);

  >>  Path[] inputPaths = TeraInputFormat.getInputPaths(job);
  >>  Path partFile = new Path(inputPaths[0], 
TeraInputFormat.PARTITION_FILENAME);

        splitPoints = readPartitions(fs, partFile, job);
        trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
      } catch (IOException ie) {
        throw new IllegalArgumentException("can't read paritions file", ie);
      }

    }



> Cannot find partition.lst in Terasort on Hadoop/Local File System
> -----------------------------------------------------------------
>
>                 Key: MAPREDUCE-5050
>                 URL: https://issues.apache.org/jira/browse/MAPREDUCE-5050
>             Project: Hadoop Map/Reduce
>          Issue Type: Bug
>          Components: examples
>    Affects Versions: 0.20.2
>         Environment: Cloudera VM CDH3u4, VMWare, Linux, Java SE 1.6.0_31-b04
>            Reporter: Matt Parker
>            Priority: Minor
>
> I'm trying to simulate running Hadoop on Lustre by configuring it to use the 
> local file system using a single cloudera VM (cdh3u4).
> I can generate the data just fine, but when running the sorting portion of 
> the program, I get an error about not being able to find the _partition.lst 
> file. It exists in the generated data directory.
> Perusing the Terasort code, I see that the main method has a Path 
> reference to partition.lst, which is created with the parent directory.
> {noformat}
>   public int run(String[] args) throws Exception {
>        LOG.info("starting");
>       JobConf job = (JobConf) getConf();
> >>  Path inputDir = new Path(args[0]);
> >>  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
> >>  Path partitionFile = new Path(inputDir, 
> >> TeraInputFormat.PARTITION_FILENAME);
>       URI partitionUri = new URI(partitionFile.toString() +
>                                "#" + TeraInputFormat.PARTITION_FILENAME);
>       TeraInputFormat.setInputPaths(job, new Path(args[0]));
>       FileOutputFormat.setOutputPath(job, new Path(args[1]));
>       job.setJobName("TeraSort");
>       job.setJarByClass(TeraSort.class);
>       job.setOutputKeyClass(Text.class);
>       job.setOutputValueClass(Text.class);
>       job.setInputFormat(TeraInputFormat.class);
>       job.setOutputFormat(TeraOutputFormat.class);
>       job.setPartitionerClass(TotalOrderPartitioner.class);
>       TeraInputFormat.writePartitionFile(job, partitionFile);
>       DistributedCache.addCacheFile(partitionUri, job);
>       DistributedCache.createSymlink(job);
>       job.setInt("dfs.replication", 1);
>       TeraOutputFormat.setFinalSync(job, true);
>       JobClient.runJob(job);
>       LOG.info("done");
>       return 0;
>   }
> {noformat}
> But in the configure method, the Path isn't created with the parent directory 
> reference.
> {noformat}
>     public void configure(JobConf job) {
>       try {
>         FileSystem fs = FileSystem.getLocal(job);
> >>    Path partFile = new Path(TeraInputFormat.PARTITION_FILENAME);
>         splitPoints = readPartitions(fs, partFile, job);
>         trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
>       } catch (IOException ie) {
>         throw new IllegalArgumentException("can't read paritions file", ie);
>       }
>     }
> {noformat}
> I modified the code as follows, and now the sorting portion of the Terasort test 
> works using the general file system. I think the above code is a bug.
> {noformat}
>     public void configure(JobConf job) {
>       try {
>         FileSystem fs = FileSystem.getLocal(job);
>   >>  Path[] inputPaths = TeraInputFormat.getInputPaths(job);
>   >>  Path partFile = new Path(inputPaths[0], 
> TeraInputFormat.PARTITION_FILENAME);
>         splitPoints = readPartitions(fs, partFile, job);
>         trie = buildTrie(splitPoints, 0, splitPoints.length, new Text(), 2);
>       } catch (IOException ie) {
>         throw new IllegalArgumentException("can't read paritions file", ie);
>       }
>     }
> {noformat}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to