danielxjd commented on a change in pull request #12786:
URL: https://github.com/apache/beam/pull/12786#discussion_r485260498
##########
File path: sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java
##########
@@ -336,36 +388,41 @@ public void processElement(
               + tracker.currentRestriction().getFrom()
               + " to "
               + tracker.currentRestriction().getTo());
-      ParquetReadOptions options = HadoopReadOptions.builder(getConfWithModelClass()).build();
-      ParquetFileReader reader =
-          ParquetFileReader.open(new BeamParquetInputFile(file.openSeekable()), options);
+      Configuration conf = getConfWithModelClass();
       GenericData model = null;
       if (modelClass != null) {
         model = (GenericData) modelClass.getMethod("get").invoke(null);
       }
-      ReadSupport<GenericRecord> readSupport = new AvroReadSupport<GenericRecord>(model);
-
+      AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<GenericRecord>(model);
+      if (requestSchemaString != null) {
+        AvroReadSupport.setRequestedProjection(
+            conf, new Schema.Parser().parse(requestSchemaString));
+      }
+      ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
+      ParquetFileReader reader =
+          ParquetFileReader.open(new BeamParquetInputFile(file.openSeekable()), options);
       Filter filter = checkNotNull(options.getRecordFilter(), "filter");
       Configuration hadoopConf = ((HadoopReadOptions) options).getConf();
+      for (String property : options.getPropertyNames()) {
+        hadoopConf.set(property, options.getProperty(property));
+      }
       FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
       MessageType fileSchema = parquetFileMetadata.getSchema();
       Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
-
       ReadSupport.ReadContext readContext =
           readSupport.init(
               new InitContext(
                   hadoopConf, Maps.transformValues(fileMetadata, ImmutableSet::of), fileSchema));
       ColumnIOFactory columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
-      MessageType requestedSchema = readContext.getRequestedSchema();
+
       RecordMaterializer<GenericRecord> recordConverter =
           readSupport.prepareForRead(hadoopConf, fileMetadata, fileSchema, readContext);
-      reader.setRequestedSchema(requestedSchema);
Review comment:
Yes, this is a duplicate. This line should be deleted.
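
For reference, the projection plumbing the new code relies on can be exercised with plain parquet-avro, outside of Beam. The sketch below is illustrative only (the input path, record name, and projection schema are made up): AvroReadSupport.setRequestedProjection stores the projection schema in the Configuration, and AvroReadSupport.init later returns it as the requested schema, which is why a separate reader.setRequestedSchema(...) call is redundant.

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.avro.AvroParquetReader;
    import org.apache.parquet.avro.AvroReadSupport;
    import org.apache.parquet.hadoop.ParquetReader;

    public class ProjectionSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical projection schema: keep only the "id" column.
        // (The record name must be compatible with the writer schema.)
        Schema projection =
            new Schema.Parser()
                .parse(
                    "{\"type\":\"record\",\"name\":\"Row\","
                        + "\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");

        Configuration conf = new Configuration();
        // Registers the projection in the Configuration; AvroReadSupport.init()
        // picks it up and reports it as the requested schema, so the reader
        // never needs an explicit setRequestedSchema(...) call.
        AvroReadSupport.setRequestedProjection(conf, projection);

        // Hypothetical input path.
        try (ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(new Path("/tmp/example.parquet"))
                .withConf(conf)
                .build()) {
          for (GenericRecord record = reader.read(); record != null; record = reader.read()) {
            System.out.println(record.get("id"));
          }
        }
      }
    }

In the PR itself the same effect flows through readSupport.init(...) into readContext.getRequestedSchema(), so the deleted call only repeated what the read context already established.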