This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8eec3f2999d62cfe864dff5b7fabddacb12c7aff Author: tallison <[email protected]> AuthorDate: Fri Apr 3 06:57:48 2026 -0400 parameterize glossary extraction --- .../tika/parser/microsoft/OfficeParserConfig.java | 18 ++++++++++ .../ooxml/SXWPFWordExtractorDecorator.java | 24 ++++++++----- .../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 40 +++++++++++++--------- 3 files changed, 57 insertions(+), 25 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index db6d4e78e9..fab2355064 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -40,6 +40,7 @@ public class OfficeParserConfig implements Serializable { private boolean writeSelectHeadersInBody = false; private boolean extractAllAlternativesFromMSG = false; + private boolean includeGlossary = true; private String dateOverrideFormat = null; private int maxOverride = 0;//ignore @@ -231,6 +232,23 @@ public class OfficeParserConfig implements Serializable { this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG; } + public boolean isIncludeGlossary() { + return includeGlossary; + } + + /** + * Whether or not to include the glossary (building blocks / AutoText) document + * from docx files. The glossary can contain template content such as form field + * placeholders that may duplicate content already present in the main body. + * <p/> + * Default: <code>true</code> + * + * @param includeGlossary whether or not to include glossary content + */ + public void setIncludeGlossary(boolean includeGlossary) { + this.includeGlossary = includeGlossary; + } + public boolean isIncludeMissingRows() { return includeMissingRows; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 1aca8a9647..45c2725e67 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -46,6 +46,7 @@ import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.EMFParser; +import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim; @@ -126,16 +127,21 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { } } //handle glossary document - pps = opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType()); - if (pps != null) { - if (pps.size() > 0) { - xhtml.startElement("div", "class", "glossary"); - - for (PackagePart pp : pps) { - //likely only one, but why not... - handleDocumentPart(pp, xhtml); + OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class, + new OfficeParserConfig()); + if (officeParserConfig.isIncludeGlossary()) { + pps = opcPackage.getPartsByContentType( + XWPFRelation.GLOSSARY_DOCUMENT.getContentType()); + if (pps != null) { + if (pps.size() > 0) { + xhtml.startElement("div", "class", "glossary"); + + for (PackagePart pp : pps) { + //likely only one, but why not... + handleDocumentPart(pp, xhtml); + } + xhtml.endElement("div"); } - xhtml.endElement("div"); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java index 2bb53a3c69..ec1a1fa437 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java @@ -62,6 +62,7 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { private OPCPackage container; private POIXMLProperties properties; + private boolean includeGlossary = true; public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { @@ -85,6 +86,10 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { return this.container; } + public void setIncludeGlossary(boolean includeGlossary) { + this.includeGlossary = includeGlossary; + } + public POIXMLProperties.CoreProperties getCoreProperties() { POIXMLProperties props = getOrCreateProperties(); return props != null ? props.getCoreProperties() : null; @@ -131,23 +136,26 @@ public class XWPFEventBasedWordExtractor implements POIXMLTextExtractor { } } //handle glossary document - pps = container.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType()); - - if (pps != null) { - for (PackagePart pp : pps) { - //likely only one, but why not... - try { - handleDocumentPart(pp, sb); - } catch (IOException e) { - LOG.warn("IOException handling glossary document part", e); - } catch (SAXException e) { - if (WriteLimitReachedException.isWriteLimitReached(e)) { - throw new RuntimeSAXException(e); + if (includeGlossary) { + pps = container.getPartsByContentType( + XWPFRelation.GLOSSARY_DOCUMENT.getContentType()); + + if (pps != null) { + for (PackagePart pp : pps) { + //likely only one, but why not... + try { + handleDocumentPart(pp, sb); + } catch (IOException e) { + LOG.warn("IOException handling glossary document part", e); + } catch (SAXException e) { + if (WriteLimitReachedException.isWriteLimitReached(e)) { + throw new RuntimeSAXException(e); + } + //swallow this because we don't actually call it + LOG.warn("SAXException handling glossary document part", e); + } catch (TikaException e) { + LOG.warn("ParseException handling document part", e); } - //swallow this because we don't actually call it - LOG.warn("SAXException handling glossary document part", e); - } catch (TikaException e) { - LOG.warn("ParseException handling document part", e); } } }
