[ 
https://issues.apache.org/jira/browse/DRILL-7403?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16955976#comment-16955976
 ] 

ASF GitHub Bot commented on DRILL-7403:
---------------------------------------

arina-ielchiieva commented on pull request #1871: DRILL-7403: Validate batch 
checks, vector integretity in unit tests
URL: https://github.com/apache/drill/pull/1871#discussion_r336965873
 
 

 ##########
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/physical/impl/validate/BatchValidator.java
 ##########
 @@ -45,103 +49,373 @@
  */
 
 public class BatchValidator {
-  private static final org.slf4j.Logger logger =
-      org.slf4j.LoggerFactory.getLogger(BatchValidator.class);
+  private static final Logger logger = 
LoggerFactory.getLogger(BatchValidator.class);
 
+  public static final boolean LOG_TO_STDOUT = true;
   public static final int MAX_ERRORS = 100;
 
-  private final int rowCount;
-  private final VectorAccessible batch;
-  private final List<String> errorList;
-  private int errorCount;
+  public interface ErrorReporter {
+    void error(String name, ValueVector vector, String msg);
+    void warn(String name, ValueVector vector, String msg);
+    void error(String msg);
+    int errorCount();
+  }
+
+  public abstract static class BaseErrorReporter implements ErrorReporter {
+
+    private final String opName;
+    private int errorCount;
+
+    public BaseErrorReporter(String opName) {
+      this.opName = opName;
+    }
+
+    protected boolean startError() {
+      if (errorCount == 0) {
+        warn("Found one or more vector errors from " + opName);
+      }
+      errorCount++;
+      if (errorCount >= MAX_ERRORS) {
+        return false;
+      }
+      return true;
+    }
+
+    @Override
+    public void error(String name, ValueVector vector, String msg) {
+      error(String.format("%s - %s: %s",
+            name, vector.getClass().getSimpleName(), msg));
+    }
+
+    @Override
+    public void warn(String name, ValueVector vector, String msg) {
+      warn(String.format("%s - %s: %s",
+            name, vector.getClass().getSimpleName(), msg));
+    }
+
+    public abstract void warn(String msg);
+
+    @Override
+    public int errorCount() { return errorCount; }
+  }
+
+  private static class StdOutReporter extends BaseErrorReporter {
+
+    public StdOutReporter(String opName) {
+      super(opName);
+    }
+
+    @Override
+    public void error(String msg) {
+      if (startError()) {
+        System.out.println(msg);
+      }
+    }
+
+    @Override
+    public void warn(String msg) {
+      System.out.println(msg);
+    }
+  }
+
+  private static class LogReporter extends BaseErrorReporter {
+
+    public LogReporter(String opName) {
+      super(opName);
+    }
+
+    @Override
+    public void error(String msg) {
+      if (startError()) {
+        logger.error(msg);
+      }
+    }
+
+    @Override
+    public void warn(String msg) {
+      logger.error(msg);
+    }
+  }
+
+  private enum CheckMode { COUNTS, ALL };
+
+  private static final Map<Class<? extends RecordBatch>, CheckMode> checkRules 
= buildRules();
 
-  public BatchValidator(VectorAccessible batch) {
-    rowCount = batch.getRecordCount();
-    this.batch = batch;
-    errorList = null;
+  private final ErrorReporter errorReporter;
+
+  public BatchValidator(ErrorReporter errorReporter) {
+    this.errorReporter = errorReporter;
+  }
+
+  /**
+   * At present, most operators will not pass the checks here. The following
+   * table identifies those that should be checked, and the degree of check.
+   * Over time, this table should include all operators, and thus become
+   * unnecessary.
+   */
+  private static Map<Class<? extends RecordBatch>, CheckMode> buildRules() {
+    final Map<Class<? extends RecordBatch>, CheckMode> rules = new 
IdentityHashMap<>();
+    //rules.put(OperatorRecordBatch.class, CheckMode.ALL);
+    return rules;
   }
 
-  public BatchValidator(VectorAccessible batch, boolean captureErrors) {
-    rowCount = batch.getRecordCount();
-    this.batch = batch;
-    if (captureErrors) {
-      errorList = new ArrayList<>();
+  public static boolean validate(RecordBatch batch) {
+    final CheckMode checkMode = checkRules.get(batch.getClass());
+
+    // If no rule, don't check this batch.
+
+    if (checkMode == null) {
+
+      // As work proceeds, might want to log those batches not checked.
+      // For now, there are too many.
+
+      return true;
+    }
+
+    // All batches that do any checks will at least check counts.
+
+    final ErrorReporter reporter = errorReporter(batch);
+    final int rowCount = batch.getRecordCount();
+    int valueCount = rowCount;
+    final VectorContainer container = batch.getContainer();
+    if (!container.hasRecordCount()) {
+      reporter.error(String.format(
+          "%s: Container record count not set",
+          batch.getClass().getSimpleName()));
     } else {
-      errorList = null;
+      // Row count will = container count for most operators.
+      // Row count <= container count for the filter operator.
+
+      final int containerRowCount = container.getRecordCount();
+      valueCount = containerRowCount;
+      switch (batch.getSchema().getSelectionVectorMode()) {
+      case FOUR_BYTE:
+        final int sv4Count = batch.getSelectionVector4().getCount();
+        if (sv4Count != rowCount) {
+          reporter.error(String.format(
+              "Mismatch between %s record count = %d, SV4 record count = %d",
+              batch.getClass().getSimpleName(),
+              rowCount, sv4Count));
+        }
+        // TODO: Don't know how to check SV4 batches
+        return true;
+      case TWO_BYTE:
+        final int sv2Count = batch.getSelectionVector2().getCount();
+        if (sv2Count != rowCount) {
+          reporter.error(String.format(
+              "Mismatch between %s record count = %d, SV2 record count = %d",
+              batch.getClass().getSimpleName(),
+              rowCount, sv2Count));
+        }
+        if (sv2Count > containerRowCount) {
+          reporter.error(String.format(
+              "Mismatch between %s container count = %d, SV2 record count = 
%d",
+              batch.getClass().getSimpleName(),
+              containerRowCount, sv2Count));
+        }
+        final int svTotalCount = 
batch.getSelectionVector2().getBatchActualRecordCount();
+        if (svTotalCount != containerRowCount) {
+          reporter.error(String.format(
+              "Mismatch between %s container count = %d, SV2 total count = %d",
+              batch.getClass().getSimpleName(),
+              containerRowCount, svTotalCount));
+        }
+        break;
+      default:
+        if (rowCount != containerRowCount) {
+          reporter.error(String.format(
+              "Mismatch between %s record count = %d, container record count = 
%d",
+              batch.getClass().getSimpleName(),
+              rowCount, containerRowCount));
+        }
+        break;
+      }
+    }
+    if (checkMode == CheckMode.ALL) {
+      new BatchValidator(reporter).validateBatch(batch, valueCount);
     }
+    return reporter.errorCount() == 0;
   }
 
-  public void validate() {
-    if (batch.getRecordCount() == 0) {
-      return;
+  public static boolean validate(VectorAccessible batch) {
+    final ErrorReporter reporter = errorReporter(batch);
+    new BatchValidator(reporter).validateBatch(batch, batch.getRecordCount());
+    return reporter.errorCount() == 0;
+  }
+
+  private static ErrorReporter errorReporter(VectorAccessible batch) {
+    final String opName = batch.getClass().getSimpleName();
+    if (LOG_TO_STDOUT) {
+      return new StdOutReporter(opName);
+    } else {
+      return new LogReporter(opName);
     }
-    for (VectorWrapper<? extends ValueVector> w : batch) {
-      validateWrapper(w);
+  }
+
+  public void validateBatch(VectorAccessible batch, int rowCount) {
+    for (final VectorWrapper<? extends ValueVector> w : batch) {
+      validateWrapper(rowCount, w);
     }
   }
 
-  private void validateWrapper(VectorWrapper<? extends ValueVector> w) {
+  private void validateWrapper(int rowCount, VectorWrapper<? extends 
ValueVector> w) {
     if (w instanceof SimpleVectorWrapper) {
-      validateVector(w.getValueVector());
+      validateVector(rowCount, w.getValueVector());
+    }
+  }
+
+  private void validateVector(int expectedCount, ValueVector vector) {
+    final int valueCount = vector.getAccessor().getValueCount();
+    if (valueCount != expectedCount) {
+      error(vector.getField().getName(), vector,
+          String.format("Row count = %d, but value count = %d",
+              expectedCount, valueCount));
     }
+    validateVector(vector.getField().getName(), vector);
   }
 
-  private void validateVector(ValueVector vector) {
-    String name = vector.getField().getName();
-    if (vector instanceof NullableVector) {
+  private void validateVector(String name, ValueVector vector) {
+    if (vector instanceof BitVector) {
+      validateBitVector(name, (BitVector) vector);
+    } else if (vector instanceof RepeatedBitVector) {
+      validateRepeatedBitVector(name, (RepeatedBitVector) vector);
+    } else if (vector instanceof NullableVector) {
       validateNullableVector(name, (NullableVector) vector);
     } else if (vector instanceof VariableWidthVector) {
-      validateVariableWidthVector(name, (VariableWidthVector) vector, 
rowCount);
+      validateVariableWidthVector(name, (VariableWidthVector) vector);
     } else if (vector instanceof FixedWidthVector) {
       validateFixedWidthVector(name, (FixedWidthVector) vector);
     } else if (vector instanceof BaseRepeatedValueVector) {
       validateRepeatedVector(name, (BaseRepeatedValueVector) vector);
     } else {
-      logger.debug("Don't know how to validate vector: " + name + " of class " 
+ vector.getClass().getSimpleName());
+      logger.debug("Don't know how to validate vector: {}  of class {}",
+          name, vector.getClass().getSimpleName());
+    }
+  }
+
+  private void validateNullableVector(String name, NullableVector vector) {
+    final int outerCount = vector.getAccessor().getValueCount();
+    final ValueVector valuesVector = vector.getValuesVector();
+    final int valueCount = valuesVector.getAccessor().getValueCount();
+    if (valueCount != outerCount) {
+      error(name, vector, String.format(
+          "Outer value count = %d, but inner value count = %d",
+          outerCount, valueCount));
     }
+    verifyIsSetVector(vector, (UInt1Vector) vector.getBitsVector());
+    validateVector(name + "-values", valuesVector);
   }
 
-  private void validateVariableWidthVector(String name, VariableWidthVector 
vector, int entryCount) {
+  private void validateVariableWidthVector(String name, VariableWidthVector 
vector) {
 
     // Offsets are in the derived classes. Handle only VarChar for now.
 
     if (vector instanceof VarCharVector) {
-      validateVarCharVector(name, (VarCharVector) vector, entryCount);
+      validateVarCharVector(name, (VarCharVector) vector);
     } else {
-      logger.debug("Don't know how to validate vector: " + name + " of class " 
+ vector.getClass().getSimpleName());
+      logger.debug("Don't know how to validate vector: {}  of class {}",
+          name, vector.getClass().getSimpleName());
     }
   }
 
-  private void validateVarCharVector(String name, VarCharVector vector, int 
entryCount) {
-//    int dataLength = vector.getAllocatedByteCount(); // Includes offsets and 
data.
-    int dataLength = vector.getBuffer().capacity();
-    validateOffsetVector(name + "-offsets", vector.getOffsetVector(), 
entryCount, dataLength);
+  private void validateVarCharVector(String name, VarCharVector vector) {
+    final int valueCount = vector.getAccessor().getValueCount();
+
+    // Disabled because a large number of operators
+    // set up offset vectors wrongly.
+    if (valueCount == 0) {
+      return;
+    }
+
+    final int dataLength = vector.getBuffer().writerIndex();
+    validateOffsetVector(name + "-offsets", vector.getOffsetVector(), false, 
valueCount, dataLength);
   }
 
-  private void validateRepeatedVector(String name, BaseRepeatedValueVector 
vector) {
+//  private void validateRepeatedBitVector(String name, 
BaseRepeatedValueVector vector) {
 
 Review comment:
   Please remove.
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Validate batch checks, vector integretity in unit tests
> -------------------------------------------------------
>
>                 Key: DRILL-7403
>                 URL: https://issues.apache.org/jira/browse/DRILL-7403
>             Project: Apache Drill
>          Issue Type: Improvement
>    Affects Versions: 1.16.0, 1.17.0
>            Reporter: Paul Rogers
>            Assignee: Paul Rogers
>            Priority: Minor
>              Labels: ready-to-commit
>             Fix For: 1.17.0
>
>
> Drill provides a {{BatchValidator}} that checks vectors. It is disabled by 
> default. This enhancement adds more checks, including checks for row counts 
> (of which there are surprisingly many.)
> Since most operators will fail if the check is enabled, this enhancement also 
> adds a table to keep track of which operators pass the checks (and for which 
> checks should be enabled) and those that still need work. This allows the 
> checks to exist in the code, and to be enabled incrementally as we fix the 
> various problems.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to