paul-rogers commented on a change in pull request #1951: DRILL-7454: Convert
Avro to EVF
URL: https://github.com/apache/drill/pull/1951#discussion_r363133393
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/store/avro/AvroFormatPlugin.java
##########
@@ -17,117 +17,68 @@
*/
package org.apache.drill.exec.store.avro;
-import java.io.IOException;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.drill.common.exceptions.ExecutionSetupException;
-import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.logical.StoragePluginConfig;
-import org.apache.drill.exec.ops.FragmentContext;
-import org.apache.drill.exec.planner.common.DrillStatsTable.TableStatistics;
-import org.apache.drill.exec.planner.logical.DrillTable;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
import org.apache.drill.exec.proto.UserBitShared.CoreOperatorType;
import org.apache.drill.exec.server.DrillbitContext;
-import org.apache.drill.exec.store.RecordReader;
-import org.apache.drill.exec.store.RecordWriter;
-import org.apache.drill.exec.store.SchemaConfig;
-import org.apache.drill.exec.store.dfs.BasicFormatMatcher;
-import org.apache.drill.exec.store.dfs.DrillFileSystem;
-import org.apache.drill.exec.store.dfs.FileSelection;
-import org.apache.drill.exec.store.dfs.FileSystemPlugin;
-import org.apache.drill.exec.store.dfs.FormatMatcher;
-import org.apache.drill.exec.store.dfs.FormatSelection;
-import org.apache.drill.exec.store.dfs.MagicString;
+import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
-import org.apache.drill.exec.store.dfs.easy.EasyWriter;
-import org.apache.drill.exec.store.dfs.easy.FileWork;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
import org.apache.hadoop.conf.Configuration;
-import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
-import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
/**
* Format plugin for Avro data files.
*/
public class AvroFormatPlugin extends EasyFormatPlugin<AvroFormatConfig> {
- private final AvroFormatMatcher matcher;
-
- public AvroFormatPlugin(String name, DrillbitContext context, Configuration
fsConf,
- StoragePluginConfig storagePluginConfig) {
- this(name, context, fsConf, storagePluginConfig, new AvroFormatConfig());
- }
-
- public AvroFormatPlugin(String name, DrillbitContext context, Configuration
fsConf, StoragePluginConfig config, AvroFormatConfig formatPluginConfig) {
- super(name, context, fsConf, config, formatPluginConfig, true, false,
true, false, Lists.newArrayList("avro"), "avro");
- this.matcher = new AvroFormatMatcher(this);
- }
-
- @Override
- public boolean supportsPushDown() {
- return true;
- }
-
- @Override
- public RecordReader getRecordReader(FragmentContext context, DrillFileSystem
dfs, FileWork fileWork, List<SchemaPath> columns, String userName) throws
ExecutionSetupException {
- return new AvroRecordReader(context, fileWork.getPath(),
fileWork.getStart(), fileWork.getLength(), dfs, columns,
- userName);
- }
-
- @Override
- public RecordWriter getRecordWriter(FragmentContext context, EasyWriter
writer) throws IOException {
- throw new UnsupportedOperationException("unimplemented");
- }
+ public static final String DEFAULT_NAME = "avro";
- @Override
- public int getReaderOperatorType() {
- return CoreOperatorType.AVRO_SUB_SCAN_VALUE;
+ public AvroFormatPlugin(String name,
+ DrillbitContext context,
+ Configuration fsConf,
+ StoragePluginConfig storageConfig,
+ AvroFormatConfig formatConfig) {
+ super(name, easyConfig(fsConf, formatConfig), context, storageConfig,
formatConfig);
}
- @Override
- public int getWriterOperatorType() {
- throw new UnsupportedOperationException("unimplemented");
+ private static EasyFormatConfig easyConfig(Configuration fsConf,
AvroFormatConfig formatConfig) {
+ EasyFormatConfig config = new EasyFormatConfig();
+ config.readable = true;
+ config.writable = false;
+ config.blockSplittable = true;
+ config.compressible = false;
+ config.supportsProjectPushdown = true;
+ config.extensions = formatConfig.extensions;
+ config.fsConf = fsConf;
+ config.defaultName = DEFAULT_NAME;
+ config.readerOperatorType = CoreOperatorType.AVRO_SUB_SCAN_VALUE;
+ config.useEnhancedScan = true;
+ return config;
}
@Override
- public FormatMatcher getMatcher() {
- return this.matcher;
+ protected FileScanFramework.FileScanBuilder frameworkBuilder(OptionManager
options, EasySubScan scan) {
+ FileScanFramework.FileScanBuilder builder = new
FileScanFramework.FileScanBuilder();
+ builder.setReaderFactory(new AvroReaderFactory(new
AvroBatchReader.AvroReaderConfig(this)));
+ initScanBuilder(builder, scan);
+ builder.setNullType(Types.optional(TypeProtos.MinorType.VARCHAR));
+ return builder;
Review comment:
No harm in setting this. But I wonder if this is the right choice for Avro?
Avro has a schema which, presumably, all files should follow. If schema
evolution occurred (a new field was added), we'd really want to use the proper
data type when reading old files without that column. This is, of course, a
long-standing limitation of Drill's "pure" schema-on-read approach: the time we
create the reader is the wrong time to be guessing the schema of columns we
are not going to see.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services