[drill] branch master updated: DRILL-8390: Minor Improvements to PDF Reader (#2742)

cgivre Thu, 19 Jan 2023 05:42:53 -0800

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git



The following commit(s) were added to refs/heads/master by this push:
     new d89578c5d6 DRILL-8390:  Minor Improvements to PDF Reader (#2742)
d89578c5d6 is described below

commit d89578c5d60a6594b698974ae03a46ad11719a9c
Author: Charles S. Givre <[email protected]>
AuthorDate: Thu Jan 19 08:42:04 2023 -0500

    DRILL-8390:  Minor Improvements to PDF Reader (#2742)
    
    DRILL-8390: Minor Improvements to PDF Reader
---
 contrib/format-pdf/README.md                       |  5 +--
 .../drill/exec/store/pdf/PdfBatchReader.java       | 37 ++++++++++++++++++----
 .../drill/exec/store/pdf/PdfMetadataReader.java    | 12 ++++++-
 .../drill/exec/store/pdf/PdfRowIterator.java       |  6 ++++
 .../org/apache/drill/exec/store/pdf/PdfUtils.java  |  7 +++-
 .../apache/drill/exec/store/pdf/TestPdfFormat.java | 13 +++++---
 6 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/contrib/format-pdf/README.md b/contrib/format-pdf/README.md
index 6dbc2a569e..bba351f2ed 100644
--- a/contrib/format-pdf/README.md
+++ b/contrib/format-pdf/README.md
@@ -1,4 +1,4 @@
-# Format Plugin for PDF Table Reader
+# Format Plugin for PDF Tables
 One of the most annoying tasks is when you are working on a data science project and you get data that is in a PDF file. This plugin endeavours to enable you to query data in PDF tables using Drill's SQL interface.  
 
 ## Data Model
@@ -31,7 +31,7 @@ The available options are:
 * `extractionAlgorithm`:  Allows you to choose the extraction algorithm used for extracting data from the PDF file.  Choices are `spreadsheet` and `basic`.  Depending on your data, one may work better than the other.
 
 ## Accessing Document Metadata Fields
-PDF files have a considerable amount of metadata which can be useful for analysis.  Drill will extract the following fields from every PDF file.  Note that these fields are not projected in star queries and must be selected explicitly.  The document's creator populates these fields and some or all may be empty. With the exception of `_page_count` which is an `INT` and the two date fields, all the other fields are `VARCHAR` fields.
+PDF files have a considerable amount of metadata which can be useful for analysis.  Drill will extract the following fields from every PDF file.  Note that these fields are not projected in star queries and must be selected explicitly.  The document's creator populates these fields and some or all may be empty. With the exception of `_page_count`, `_table_count` and `_table_index` which are `INT` fields and the two date fields, all the other fields are `VARCHAR` fields.
  
  The fields are:
  * `_page_count`
@@ -44,6 +44,7 @@ PDF files have a considerable amount of metadata which can be useful for analysi
  * `_modification_date`
  * `_trapped`
  * `_table_count`
+ * `_table_index`
  
  The query below will access a document's metadata:
  
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
index fd6cec92e6..26d3a94c9f 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
@@ -74,6 +74,7 @@ public class PdfBatchReader implements ManagedReader {
   private PdfRowIterator rowIterator;
   private final FileSchemaNegotiator negotiator;
   private int unregisteredColumnCount;
+  private List<RectangularTextContainer> rawFirstRow;
 
   // Tables
   private List<Table> tables;
@@ -99,15 +100,20 @@ public class PdfBatchReader implements ManagedReader {
     builder = new SchemaBuilder();
 
     openFile();
-    metadataReader = new PdfMetadataReader(document);
-
+    tables = PdfUtils.extractTablesFromPDF(document, config.plugin.getConfig().getAlgorithm());
+    metadataReader = new PdfMetadataReader(document, tables.size());
     // Get the tables if the user set the combine pages to true
     if (config.plugin.getConfig().combinePages() ) {
-      tables = PdfUtils.extractTablesFromPDF(document, config.plugin.getConfig().getAlgorithm());
       currentTable = tables.get(0);
+      rowIterator = new PdfRowIterator(currentTable);
     } else {
-      currentTable = PdfUtils.getSpecificTable(document, startingTableIndex, config.plugin.getConfig().getAlgorithm());
-      tables = Collections.singletonList(currentTable);
+      if (tables.size() > 0) {
+        currentTable = tables.get(startingTableIndex);
+        tables = Collections.singletonList(currentTable);
+        rowIterator = new PdfRowIterator(currentTable);
+      } else {
+        rowIterator = new PdfRowIterator();
+      }
 
       // If the user specifies a table index, and that table does not exist, throw an exception.
       if (currentTable == null && startingTableIndex != 0) {
@@ -119,9 +125,9 @@ public class PdfBatchReader implements ManagedReader {
     }
 
     // Get the row iterator and grab the first row to build the schema
-    rowIterator = new PdfRowIterator(currentTable);
     if (rowIterator.hasNext()) {
-      firstRow = PdfUtils.convertRowToStringArray(rowIterator.next());
+      rawFirstRow = rowIterator.next();
+      firstRow = PdfUtils.convertRowToStringArray(rawFirstRow);
     }
 
     // Support provided schema
@@ -156,6 +162,7 @@ public class PdfBatchReader implements ManagedReader {
         // Get the next table
         currentTableIndex++;
         currentTable = tables.get(currentTableIndex);
+        metadataReader.setTableIndex(currentTableIndex);
 
         // Update the row iterator
         rowIterator = new PdfRowIterator(currentTable);
@@ -173,12 +180,28 @@ public class PdfBatchReader implements ManagedReader {
         return false;
       }
 
+      // Edge case: If the document is not set to extract headers, we still need to process the first row which
+      // was used to build the schema.
+      if (! config.plugin.getConfig().extractHeaders()) {
+        processFirstRow();
+      }
+
       // Process the row
       processRow(rowIterator.next());
     }
     return true;
   }
 
+  private void processFirstRow() {
+    if (rawFirstRow == null) {
+      return;
+    }
+    processRow(rawFirstRow);
+
+    // Now clear out the rawFirstRow variable so that we don't accidentally read it again.
+    rawFirstRow = null;
+  }
+
   private void processRow(List<RectangularTextContainer> row) {
     if (row == null || row.size() == 0) {
       rowWriter.start();
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
index 297fac9307..a910d6b475 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
@@ -36,12 +36,14 @@ public class PdfMetadataReader {
   private final Map<String, Object> metadata;
   private final List<PdfBatchReader.PdfColumnWriter> writers;
   private RowSetLoader rowWriter;
+  private int tableIndex;
 
 
-  public PdfMetadataReader(PDDocument document) {
+  public PdfMetadataReader(PDDocument document, int tableCount) {
     this.writers = new ArrayList<>();
     // We are using a LinkedHashMap to preserve the order
     this.metadata = new LinkedHashMap<>();
+    this.tableIndex = 1;
     PDDocumentInformation info = document.getDocumentInformation();
     metadata.put("pageCount", document.getNumberOfPages());
     metadata.put("title",info.getTitle());
@@ -53,11 +55,17 @@ public class PdfMetadataReader {
     metadata.put("creationDate", info.getCreationDate());
     metadata.put("modificationDate", info.getModificationDate());
     metadata.put("trapped", info.getTrapped());
+    metadata.put("tableCount", tableCount);
+    metadata.put("tableIndex", tableIndex);
   }
 
   public void setRowWriter(RowSetLoader rowWriter) {
     this.rowWriter = rowWriter;
   }
+  public void setTableIndex(int tableIndex) {
+    this.tableIndex = tableIndex;
+    metadata.put("tableIndex", tableIndex);
+  }
 
   public void addImplicitColumnsToSchema() {
     // Add to schema
@@ -71,6 +79,8 @@ public class PdfMetadataReader {
     addMetadataColumnToSchema("_creation_date", MinorType.TIMESTAMP);
     addMetadataColumnToSchema("_modification_date", MinorType.TIMESTAMP);
     addMetadataColumnToSchema("_trapped", MinorType.VARCHAR);
+    addMetadataColumnToSchema("_table_count", MinorType.INT);
+    addMetadataColumnToSchema("_table_index", MinorType.INT);
   }
 
   public void writeMetadata() {
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
index 4e90d6beee..f891d92f47 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
@@ -33,6 +33,12 @@ public class PdfRowIterator implements Iterator<List<RectangularTextContainer>>
     this.rowCounter = 0;
   }
 
+  public PdfRowIterator() {
+    this.table = null;
+    this.rowCounter = 0;
+  }
+
+
   @Override
   public boolean hasNext() {
     if (table == null) {
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
index ec72b86ed4..370ad56b61 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
@@ -34,6 +34,7 @@ import technology.tabula.extractors.ExtractionAlgorithm;
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 public class PdfUtils {
@@ -206,6 +207,10 @@ public class PdfUtils {
     if (table == null) {
       return values;
     }
-    return table.getRows().get(rowIndex);
+    if (table.getRowCount() > 0) {
+      return table.getRows().get(rowIndex);
+    } else {
+      return Collections.emptyList();
+    }
   }
 }
diff --git a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
index 49856051f8..32b8734deb 100644
--- a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
+++ b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
@@ -107,7 +107,7 @@ public class TestPdfFormat extends ClusterTest {
       "(type => 'pdf', combinePages => false, extractHeaders => false))";
 
     RowSet results = client.queryBuilder().sql(sql).rowSet();
-    assertEquals(31, results.rowCount());
+    assertEquals(32, results.rowCount());
     results.clear();
 
     sql = "SELECT * " +
@@ -182,7 +182,8 @@ public class TestPdfFormat extends ClusterTest {
       "_producer," +
       "_creation_date, " +
       "_modification_date, " +
-      "_trapped " +
+      "_trapped, " +
+      "_table_count " +
       "FROM cp.`pdf/20.pdf` " +
       "LIMIT 1";
 
@@ -200,6 +201,7 @@ public class TestPdfFormat extends ClusterTest {
       .addNullable("_creation_date", MinorType.TIMESTAMP)
       .addNullable("_modification_date", MinorType.TIMESTAMP)
       .addNullable("_trapped", MinorType.VARCHAR)
+      .addNullable("_table_count", MinorType.INT)
       .buildSchema();
 
     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -210,7 +212,7 @@ public class TestPdfFormat extends ClusterTest {
         "Acrobat Distiller 7.0.5 (Windows)",
         857403000000L,
         1230835135000L,
-        null)
+        null, 1)
       .build();
 
     new RowSetComparison(expected).verifyAndClearAll(results);
@@ -270,7 +272,7 @@ public class TestPdfFormat extends ClusterTest {
       "_producer," +
       "_creation_date, " +
       "_modification_date, " +
-      "_trapped " +
+      "_trapped, _table_count " +
       "FROM table(cp.`pdf/labor.pdf` (type => 'pdf', extractionAlgorithm => 
'spreadsheet')) LIMIT 1";
 
     RowSet results = client.queryBuilder().sql(sql).rowSet();
@@ -286,13 +288,14 @@ public class TestPdfFormat extends ClusterTest {
       .addNullable("_creation_date", MinorType.TIMESTAMP)
       .addNullable("_modification_date", MinorType.TIMESTAMP)
       .addNullable("_trapped", MinorType.VARCHAR)
+      .addNullable("_table_count", MinorType.INT)
       .buildSchema();
 
     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
       .addRow(1, null, null, null, null, "pdftk 2.01 - www.pdftk.com",
         "itext-paulo-155 (itextpdf.sf.net-lowagie.com)",
         QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"),
-        QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"), null)
+        QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"), null, 0)
     .build();
     new RowSetComparison(expected).verifyAndClearAll(results);
   }

[drill] branch master updated: DRILL-8390: Minor Improvements to PDF Reader (#2742)

Reply via email to