paul-rogers commented on a change in pull request #1951: DRILL-7454: Convert
Avro to EVF
URL: https://github.com/apache/drill/pull/1951#discussion_r363133393
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/store/avro/AvroFormatPlugin.java
##########
@@ -17,117 +17,68 @@
*/
package org.apache.drill.exec.store.avro;
-import java.io.IOException;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.drill.common.exceptions.ExecutionSetupException;
-import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.logical.StoragePluginConfig;
-import org.apache.drill.exec.ops.FragmentContext;
-import org.apache.drill.exec.planner.common.DrillStatsTable.TableStatistics;
-import org.apache.drill.exec.planner.logical.DrillTable;
+import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.Types;
+import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
+import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
import org.apache.drill.exec.proto.UserBitShared.CoreOperatorType;
import org.apache.drill.exec.server.DrillbitContext;
-import org.apache.drill.exec.store.RecordReader;
-import org.apache.drill.exec.store.RecordWriter;
-import org.apache.drill.exec.store.SchemaConfig;
-import org.apache.drill.exec.store.dfs.BasicFormatMatcher;
-import org.apache.drill.exec.store.dfs.DrillFileSystem;
-import org.apache.drill.exec.store.dfs.FileSelection;
-import org.apache.drill.exec.store.dfs.FileSystemPlugin;
-import org.apache.drill.exec.store.dfs.FormatMatcher;
-import org.apache.drill.exec.store.dfs.FormatSelection;
-import org.apache.drill.exec.store.dfs.MagicString;
+import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
-import org.apache.drill.exec.store.dfs.easy.EasyWriter;
-import org.apache.drill.exec.store.dfs.easy.FileWork;
+import org.apache.drill.exec.store.dfs.easy.EasySubScan;
import org.apache.hadoop.conf.Configuration;
-import org.apache.drill.shaded.guava.com.google.common.collect.ImmutableList;
-import org.apache.drill.shaded.guava.com.google.common.collect.Lists;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
/**
* Format plugin for Avro data files.
*/
public class AvroFormatPlugin extends EasyFormatPlugin<AvroFormatConfig> {
- private final AvroFormatMatcher matcher;
-
- public AvroFormatPlugin(String name, DrillbitContext context, Configuration
fsConf,
- StoragePluginConfig storagePluginConfig) {
- this(name, context, fsConf, storagePluginConfig, new AvroFormatConfig());
- }
-
- public AvroFormatPlugin(String name, DrillbitContext context, Configuration
fsConf, StoragePluginConfig config, AvroFormatConfig formatPluginConfig) {
- super(name, context, fsConf, config, formatPluginConfig, true, false,
true, false, Lists.newArrayList("avro"), "avro");
- this.matcher = new AvroFormatMatcher(this);
- }
-
- @Override
- public boolean supportsPushDown() {
- return true;
- }
-
- @Override
- public RecordReader getRecordReader(FragmentContext context, DrillFileSystem
dfs, FileWork fileWork, List<SchemaPath> columns, String userName) throws
ExecutionSetupException {
- return new AvroRecordReader(context, fileWork.getPath(),
fileWork.getStart(), fileWork.getLength(), dfs, columns,
- userName);
- }
-
- @Override
- public RecordWriter getRecordWriter(FragmentContext context, EasyWriter
writer) throws IOException {
- throw new UnsupportedOperationException("unimplemented");
- }
+ public static final String DEFAULT_NAME = "avro";
- @Override
- public int getReaderOperatorType() {
- return CoreOperatorType.AVRO_SUB_SCAN_VALUE;
+ public AvroFormatPlugin(String name,
+ DrillbitContext context,
+ Configuration fsConf,
+ StoragePluginConfig storageConfig,
+ AvroFormatConfig formatConfig) {
+ super(name, easyConfig(fsConf, formatConfig), context, storageConfig,
formatConfig);
}
- @Override
- public int getWriterOperatorType() {
- throw new UnsupportedOperationException("unimplemented");
+ private static EasyFormatConfig easyConfig(Configuration fsConf,
AvroFormatConfig formatConfig) {
+ EasyFormatConfig config = new EasyFormatConfig();
+ config.readable = true;
+ config.writable = false;
+ config.blockSplittable = true;
+ config.compressible = false;
+ config.supportsProjectPushdown = true;
+ config.extensions = formatConfig.extensions;
+ config.fsConf = fsConf;
+ config.defaultName = DEFAULT_NAME;
+ config.readerOperatorType = CoreOperatorType.AVRO_SUB_SCAN_VALUE;
+ config.useEnhancedScan = true;
+ return config;
}
@Override
- public FormatMatcher getMatcher() {
- return this.matcher;
+ protected FileScanFramework.FileScanBuilder frameworkBuilder(OptionManager
options, EasySubScan scan) {
+ FileScanFramework.FileScanBuilder builder = new
FileScanFramework.FileScanBuilder();
+ builder.setReaderFactory(new AvroReaderFactory(new
AvroBatchReader.AvroReaderConfig(this)));
+ initScanBuilder(builder, scan);
+ builder.setNullType(Types.optional(TypeProtos.MinorType.VARCHAR));
+ return builder;
Review comment:
No harm in setting this. But I wonder if this is the right choice for Avro?
Avro has a schema which, presumably, all files should follow. If schema
evolution occurred (a new field was added), we'd really want to use the proper
data type when reading old files without that column. This is, of course, a
long-standing limitation of Drill's "pure" schema-on-read approach: the time we
create the reader is the wrong time to be guessing the schema of columns we
are not going to see.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services