This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4708-refactor-xlsx
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6f7eca02f55262de29e7c45c79f88d4b5ec993c2
Author: tallison <[email protected]>
AuthorDate: Fri Apr 3 07:31:20 2026 -0400

    refactor xlsx - WIP
---
 .../microsoft/ooxml/TikaSheetContentsHandler.java  |  36 ++++
 .../microsoft/ooxml/TikaSheetXMLHandler.java       |  34 ++--
 .../parser/microsoft/ooxml/XSSFCommentsShim.java   | 187 +++++++++++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |  68 +++++++-
 4 files changed, 297 insertions(+), 28 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java
new file mode 100644
index 0000000000..44173ec322
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+/**
+ * Sheet contents handler that uses {@link XSSFCommentsShim.CommentData}
+ * instead of POI's XMLBeans-dependent {@code XSSFComment}.
+ */
+interface TikaSheetContentsHandler {
+
+    void startRow(int rowNum);
+
+    void endRow(int rowNum);
+
+    void cell(String cellRef, String formattedValue, XSSFCommentsShim.CommentData comment);
+
+    default void headerFooter(String text, boolean isHeader, String tagName) {
+    }
+
+    default void endSheet() {
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
index c7276e92a5..3ba83dd255 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java
@@ -23,9 +23,6 @@ import java.util.Queue;
 import org.apache.poi.ss.usermodel.BuiltinFormats;
 import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.util.CellAddress;
-import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
-import org.apache.poi.xssf.model.Comments;
-import org.apache.poi.xssf.usermodel.XSSFComment;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.Attributes;
@@ -34,11 +31,10 @@ import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * Sheet XML handler for XLSX event-based parsing that uses {@link XSSFStylesShim}
- * instead of POI's XMLBeans-dependent {@code StylesTable}.
+ * and {@link XSSFCommentsShim} instead of POI's XMLBeans-dependent
+ * {@code StylesTable} and {@code CommentsTable}.
  * <p>
  * Adapted from Apache POI's {@code XSSFSheetXMLHandler} (Apache 2.0 license).
- * The only structural change is replacing the {@code Styles}/{@code XSSFCellStyle}
- * lookup with a direct call to our SAX-based styles shim for format resolution.
  */
 class TikaSheetXMLHandler extends DefaultHandler {
 
@@ -57,9 +53,9 @@ class TikaSheetXMLHandler extends DefaultHandler {
     }
 
     private final XSSFStylesShim stylesShim;
-    private final Comments comments;
+    private final XSSFCommentsShim commentsShim;
     private final XSSFSharedStringsShim sharedStringsShim;
-    private final SheetContentsHandler output;
+    private final TikaSheetContentsHandler output;
     private final DataFormatter formatter;
     private final boolean formulasNotResults;
 
@@ -83,34 +79,34 @@ class TikaSheetXMLHandler extends DefaultHandler {
     private Queue<CellAddress> commentCellRefs;
 
     TikaSheetXMLHandler(XSSFStylesShim stylesShim,
-                         Comments comments,
+                         XSSFCommentsShim commentsShim,
                          XSSFSharedStringsShim sharedStringsShim,
-                         SheetContentsHandler sheetContentsHandler,
+                         TikaSheetContentsHandler sheetContentsHandler,
                          DataFormatter dataFormatter,
                          boolean formulasNotResults) {
         this.stylesShim = stylesShim;
-        this.comments = comments;
+        this.commentsShim = commentsShim;
         this.sharedStringsShim = sharedStringsShim;
         this.output = sheetContentsHandler;
         this.formatter = dataFormatter;
         this.formulasNotResults = formulasNotResults;
         this.nextDataType = XssfDataType.NUMBER;
-        initComments(comments);
+        initComments(commentsShim);
     }
 
     TikaSheetXMLHandler(XSSFStylesShim stylesShim,
                          XSSFSharedStringsShim sharedStringsShim,
-                         SheetContentsHandler sheetContentsHandler,
+                         TikaSheetContentsHandler sheetContentsHandler,
                          DataFormatter dataFormatter,
                          boolean formulasNotResults) {
         this(stylesShim, null, sharedStringsShim, sheetContentsHandler, dataFormatter,
                 formulasNotResults);
     }
 
-    private void initComments(Comments commentsTable) {
-        if (commentsTable != null) {
+    private void initComments(XSSFCommentsShim commentsShim) {
+        if (commentsShim != null) {
             commentCellRefs = new LinkedList<>();
-            for (Iterator<CellAddress> iter = commentsTable.getCellAddresses();
+            for (Iterator<CellAddress> iter = commentsShim.getCellAddresses();
                  iter.hasNext(); ) {
                 commentCellRefs.add(iter.next());
             }
@@ -333,8 +329,8 @@ class TikaSheetXMLHandler extends DefaultHandler {
         }
 
         checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
-        XSSFComment comment = comments != null ?
-                comments.findCellComment(new CellAddress(cellRef)) : null;
+        XSSFCommentsShim.CommentData comment = commentsShim != null ?
+                commentsShim.findCellComment(new CellAddress(cellRef)) : null;
         output.cell(cellRef, thisStr, comment);
     }
 
@@ -393,7 +389,7 @@ class TikaSheetXMLHandler extends DefaultHandler {
     }
 
     private void outputEmptyCellComment(CellAddress cellRef) {
-        XSSFComment comment = comments.findCellComment(cellRef);
+        XSSFCommentsShim.CommentData comment = commentsShim.findCellComment(cellRef);
         output.cell(cellRef.formatAsString(), null, comment);
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java
new file mode 100644
index 0000000000..f3293a0d3c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.ss.util.CellAddress;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * SAX-based shim that parses {@code xl/commentsN.xml} without XMLBeans.
+ * Replaces POI's {@code CommentsTable} (which depends on poi-ooxml-lite)
+ * for Tika's text extraction needs.
+ *
+ * <p>Only extracts what Tika needs: cell reference → (author, text) mapping.</p>
+ */
+class XSSFCommentsShim {
+
+    private final Map<CellAddress, CommentData> commentsByCell;
+
+    /**
+     * Simple holder for comment data needed by Tika.
+     */
+    static class CommentData {
+        private final String author;
+        private final String text;
+
+        CommentData(String author, String text) {
+            this.author = author;
+            this.text = text;
+        }
+
+        public String getAuthor() {
+            return author;
+        }
+
+        public String getText() {
+            return text;
+        }
+    }
+
+    /**
+     * Parse a comments XML stream.
+     *
+     * @param is           the {@code xl/commentsN.xml} stream (may be null)
+     * @param parseContext parse context for SAX parser configuration
+     */
+    XSSFCommentsShim(InputStream is, ParseContext parseContext)
+            throws IOException, TikaException, SAXException {
+        commentsByCell = new LinkedHashMap<>();
+        if (is != null) {
+            CommentsHandler handler = new CommentsHandler();
+            XMLReaderUtils.parseSAX(is, handler, parseContext);
+        }
+    }
+
+    /**
+     * @return the number of comments parsed
+     */
+    int getNumberOfComments() {
+        return commentsByCell.size();
+    }
+
+    /**
+     * Find comment data for a given cell address.
+     *
+     * @return CommentData or null if no comment at that cell
+     */
+    CommentData findCellComment(CellAddress cellAddress) {
+        return commentsByCell.get(cellAddress);
+    }
+
+    /**
+     * @return iterator over all cell addresses that have comments, in document order
+     */
+    Iterator<CellAddress> getCellAddresses() {
+        return commentsByCell.keySet().iterator();
+    }
+
+    /**
+     * SAX handler for comments XML.  Structure:
+     * <pre>
+     * &lt;comments&gt;
+     *   &lt;authors&gt;
+     *     &lt;author&gt;Name&lt;/author&gt;
+     *   &lt;/authors&gt;
+     *   &lt;commentList&gt;
+     *     &lt;comment ref="A1" authorId="0"&gt;
+     *       &lt;text&gt;
+     *         &lt;r&gt;&lt;t&gt;Comment text&lt;/t&gt;&lt;/r&gt;
+     *         or plain &lt;t&gt;Comment text&lt;/t&gt;
+     *       &lt;/text&gt;
+     *     &lt;/comment&gt;
+     *   &lt;/commentList&gt;
+     * &lt;/comments&gt;
+     * </pre>
+     */
+    private class CommentsHandler extends DefaultHandler {
+
+        private final List<String> authors = new ArrayList<>();
+        private final StringBuilder textBuffer = new StringBuilder();
+
+        private boolean inAuthor;
+        private boolean inT;
+        private boolean inText;
+
+        private String currentRef;
+        private int currentAuthorId;
+        private final StringBuilder commentText = new StringBuilder();
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes atts) {
+            if ("author".equals(localName)) {
+                inAuthor = true;
+                textBuffer.setLength(0);
+            } else if ("comment".equals(localName)) {
+                currentRef = atts.getValue("ref");
+                String authorIdStr = atts.getValue("authorId");
+                currentAuthorId = authorIdStr != null ? Integer.parseInt(authorIdStr) : -1;
+                commentText.setLength(0);
+            } else if ("text".equals(localName)) {
+                inText = true;
+            } else if ("t".equals(localName) && inText) {
+                inT = true;
+                textBuffer.setLength(0);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) {
+            if ("author".equals(localName)) {
+                inAuthor = false;
+                authors.add(textBuffer.toString());
+            } else if ("t".equals(localName) && inT) {
+                inT = false;
+                if (commentText.length() > 0) {
+                    commentText.append(' ');
+                }
+                commentText.append(textBuffer);
+            } else if ("text".equals(localName)) {
+                inText = false;
+            } else if ("comment".equals(localName)) {
+                if (currentRef != null) {
+                    String author = (currentAuthorId >= 0 && currentAuthorId < authors.size())
+                            ? authors.get(currentAuthorId) : "";
+                    commentsByCell.put(new CellAddress(currentRef),
+                            new CommentData(author, commentText.toString()));
+                }
+                currentRef = null;
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) {
+            if (inAuthor || inT) {
+                textBuffer.append(ch, start, length);
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index fab2601e4a..3ebccba8e5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -43,7 +43,6 @@ import org.apache.poi.ss.util.CellReference;
 import org.apache.poi.xssf.eventusermodel.XSSFReader;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
-import org.apache.poi.xssf.model.Comments;
 import org.apache.poi.xssf.usermodel.XSSFComment;
 import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
 import org.xml.sax.Attributes;
@@ -90,6 +89,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
     private static final String RELATION_VML_DRAWING =
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/vmlDrawing";
+    private static final String RELATION_COMMENTS =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";
 
     /**
      * Allows access to headers/footers from raw xml strings
@@ -173,8 +174,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                 addDrawingHyperLinks(sheetPart);
                 sheetParts.add(sheetPart);
 
-                Comments comments = iter.getSheetComments();
-                if (comments != null && comments.getNumberOfComments() > 0) {
+                XSSFCommentsShim commentsShim = parseSheetComments(sheetPart);
+                if (commentsShim != null && commentsShim.getNumberOfComments() > 0) {
                     metadata.set(Office.HAS_COMMENTS, true);
                 }
 
@@ -186,7 +187,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                 xhtml.startElement("table");
                 xhtml.startElement("tbody");
 
-                processSheet(sheetExtractor, comments, stylesShim, stringsShim, stream);
+                processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream);
                 try {
                     getThreadedComments(container, sheetPart, xhtml);
                 } catch (InvalidFormatException | TikaException | IOException e) {
@@ -822,12 +823,13 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         }
     }
 
-    public void processSheet(SheetContentsHandler sheetContentsHandler, Comments comments,
+    public void processSheet(TikaSheetContentsHandler sheetContentsHandler,
+                             XSSFCommentsShim commentsShim,
                              XSSFStylesShim stylesShim, XSSFSharedStringsShim stringsShim,
                              InputStream sheetInputStream) throws IOException, SAXException {
         try {
             XSSFSheetInterestingPartsCapturer handler = new XSSFSheetInterestingPartsCapturer(
-                    new TikaSheetXMLHandler(stylesShim, comments, stringsShim,
+                    new TikaSheetXMLHandler(stylesShim, commentsShim, stringsShim,
                             sheetContentsHandler, formatter, false));
             XMLReaderUtils.parseSAX(sheetInputStream, handler, parseContext);
             sheetInputStream.close();
@@ -846,6 +848,32 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         }
     }
 
+    /**
+     * Parse the comments XML for a sheet part via SAX, avoiding XMLBeans.
+     */
+    private XSSFCommentsShim parseSheetComments(PackagePart sheetPart) {
+        try {
+            PackageRelationshipCollection rels =
+                    sheetPart.getRelationshipsByType(RELATION_COMMENTS);
+            if (rels.isEmpty()) {
+                return null;
+            }
+            PackageRelationship rel = rels.getRelationship(0);
+            PackagePartName partName =
+                    PackagingURIHelper.createPartName(rel.getTargetURI());
+            PackagePart commentsPart = rel.getPackage().getPart(partName);
+            if (commentsPart == null) {
+                return null;
+            }
+            try (InputStream is = commentsPart.getInputStream()) {
+                return new XSSFCommentsShim(is, parseContext);
+            }
+        } catch (InvalidFormatException | IOException | TikaException | SAXException e) {
+            //swallow — comments are not critical
+            return null;
+        }
+    }
+
 
     /**
      * In Excel files, sheets have things embedded in them,
@@ -892,7 +920,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
     /**
      * Turns formatted sheet events into HTML
      */
-    protected static class SheetTextAsHTML implements SheetContentsHandler {
+    protected static class SheetTextAsHTML
+            implements TikaSheetContentsHandler, SheetContentsHandler {
         private final boolean includeHeadersFooters;
         private final boolean includeMissingRows;
         protected List<String> headers;
@@ -939,7 +968,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             }
         }
 
-        public void cell(String cellRef, String formattedValue, XSSFComment comment) {
+        public void cell(String cellRef, String formattedValue,
+                          XSSFCommentsShim.CommentData comment) {
             try {
                 // Handle any missing cells
                 int colNum =
@@ -964,7 +994,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                     xhtml.endElement("br");
                     xhtml.characters(comment.getAuthor());
                     xhtml.characters(": ");
-                    xhtml.characters(comment.getString().getString());
+                    xhtml.characters(comment.getText());
                 }
 
                 xhtml.endElement("td");
@@ -973,6 +1003,21 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
             }
         }
 
+        /**
+         * Bridge for POI's {@link SheetContentsHandler} interface, used by the
+         * XLSB (binary) path via {@link org.apache.poi.xssf.binary.XSSFBSheetHandler}.
+         */
+        public void cell(String cellRef, String formattedValue, XSSFComment comment) {
+            XSSFCommentsShim.CommentData commentData = null;
+            if (comment != null) {
+                String text = comment.getString() != null ?
+                        comment.getString().getString() : "";
+                commentData = new XSSFCommentsShim.CommentData(
+                        comment.getAuthor(), text);
+            }
+            cell(cellRef, formattedValue, commentData);
+        }
+
         public void headerFooter(String text, boolean isHeader, String tagName) {
             if (!includeHeadersFooters) {
                 return;
@@ -983,6 +1028,11 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                 footers.add(text);
             }
         }
+
+        @Override
+        public void endSheet() {
+            // no-op — satisfies both TikaSheetContentsHandler and SheetContentsHandler
+        }
     }
 
     protected static class HeaderFooterFromString implements HeaderFooter {

Reply via email to