Github user shivzone commented on a diff in the pull request: https://github.com/apache/incubator-hawq/pull/1326#discussion_r159297466 --- Diff: pxf/pxf-hdfs/src/main/java/org/apache/hawq/pxf/plugins/hdfs/ParquetDataFragmenter.java --- @@ -0,0 +1,103 @@ +package org.apache.hawq.pxf.plugins.hdfs; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hawq.pxf.api.Fragment; +import org.apache.hawq.pxf.api.Fragmenter; +import org.apache.hawq.pxf.api.utilities.InputData; +import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class ParquetDataFragmenter extends Fragmenter { + private Job job; + + public ParquetDataFragmenter(InputData md) { + super(md); + JobConf jobConf = new JobConf(new Configuration(), ParquetDataFragmenter.class); + try { + job = Job.getInstance(jobConf); + } catch (IOException e) { + throw new RuntimeException("Unable to instantiate a job for reading fragments", e); + } + } + + + @Override + public List<Fragment> getFragments() throws Exception { + String absoluteDataPath = HdfsUtilities.absoluteDataPath(inputData.getDataSource()); + ArrayList<InputSplit> splits = getSplits(new Path(absoluteDataPath)); + + for (InputSplit split : splits) { + FileSplit fsp = (FileSplit) split; + + String filepath = fsp.getPath().toUri().getPath(); + String[] hosts = fsp.getLocations(); + + Path file = new Path(filepath); + + ParquetMetadata metadata = ParquetFileReader.readFooter( + job.getConfiguration(), file, ParquetMetadataConverter.NO_FILTER); + MessageType schema = metadata.getFileMetaData().getSchema(); + + byte[] fragmentMetadata = HdfsUtilities.prepareFragmentMetadata(fsp.getStart(), 
fsp.getLength(), fsp.getLocations()); --- End diff -- Can we simply use prepareFragmentMetadata(fsp)?
---