This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4465 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7e5733df02562b83d9e1f3a81b6183952314c97a Author: tallison <[email protected]> AuthorDate: Fri Aug 15 08:15:01 2025 -0400 TIKA-4465 -- extract javascript from name tree --- .../main/java/org/apache/tika/metadata/PDF.java | 6 + .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 274 +++++++++++++-------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 72 +++++- 3 files changed, 239 insertions(+), 113 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index b15c10383..f85218936 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -215,4 +215,10 @@ public interface PDF { */ Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); + /** + * When javascript is stored in the names tree, there's a name associated with that script. + * This is that name. When javascript is stored in an action, there is no name, and this + * metadata will not be populated. + */ + Property JS_NAME = Property.internalText(PDF_PREFIX + "jsName"); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index de47f2394..5b3525488 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -54,6 +54,7 @@ import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; +import org.apache.pdfbox.pdmodel.PDJavascriptNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.common.COSObjectable; @@ -700,92 +701,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { try { for (PDAnnotation annotation : page.getAnnotations()) { - String annotationName = annotation.getAnnotationName(); - if (annotationTypes.size() < MAX_ANNOTATION_TYPES) { - if (annotationName != null) { - annotationTypes.add(annotationName); - } else { - annotationTypes.add(NULL_STRING); - } - } - String annotationSubtype = annotation.getSubtype(); - if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) { - if (annotationSubtype != null) { - annotationSubtypes.add(annotationSubtype); - } else { - annotationSubtypes.add(NULL_STRING); - } - } - if (annotation instanceof PDAnnotationFileAttachment) { - PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; - String subtype = "annotationFileAttachment"; - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "source", "source", "CDATA", subtype); - processDocOnAction("", subtype, fann.getFile(), attributes); - } else if (annotation instanceof PDAnnotationWidget) { - handleWidget((PDAnnotationWidget) annotation); - } else { - if (annotationSubtype == null) { - annotationSubtype = "unknown"; - } else if (annotationSubtype.equals(THREE_D) || - annotation.getCOSObject().containsKey(THREE_DD)) { - //To make this stricter, we could get the 3DD stream object and see if the - //subtype is U3D or PRC or model/ (prefix for model mime type) - metadata.set(PDF.HAS_3D, true); - num3DAnnotations++; - } - for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "source", "source", "CDATA", annotationSubtype); - processDocOnAction("", annotationSubtype, createFileSpecification(fileSpec), - attributes); - } - } - // TODO: remove once PDFBOX-1143 is fixed: - if (config.isExtractAnnotationText()) { - PDActionURI uri = getActionURI(annotation); - if (uri != null) { - String link = uri.getURI(); - if (link != null && !link.isBlank()) { - xhtml.startElement("div", "class", "annotation"); - xhtml.startElement("a", "href", link); - xhtml.characters(link); - xhtml.endElement("a"); - xhtml.endElement("div"); - } - } - - if (annotation instanceof PDAnnotationMarkup) { - PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; - String title = annotationMarkup.getTitlePopup(); - String subject = annotationMarkup.getSubject(); - String contents = annotationMarkup.getContents(); - // TODO: maybe also annotationMarkup.getRichContents()? - if (title != null || subject != null || contents != null) { - xhtml.startElement("div", "class", "annotation"); - - if (title != null) { - xhtml.startElement("div", "class", "annotationTitle"); - xhtml.characters(title); - xhtml.endElement("div"); - } - - if (subject != null) { - xhtml.startElement("div", "class", "annotationSubject"); - xhtml.characters(subject); - xhtml.endElement("div"); - } - - if (contents != null) { - xhtml.startElement("div", "class", "annotationContents"); - xhtml.characters(contents); - xhtml.endElement("div"); - } - - xhtml.endElement("div"); - } - } - } + processPageAnnotation(annotation); } if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) { doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION); @@ -835,6 +751,95 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } + private void processPageAnnotation(PDAnnotation annotation) throws TikaException, IOException, SAXException { + String annotationName = annotation.getAnnotationName(); + if (annotationTypes.size() < MAX_ANNOTATION_TYPES) { + if (annotationName != null) { + annotationTypes.add(annotationName); + } else { + annotationTypes.add(NULL_STRING); + } + } + String annotationSubtype = annotation.getSubtype(); + if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) { + if (annotationSubtype != null) { + annotationSubtypes.add(annotationSubtype); + } else { + annotationSubtypes.add(NULL_STRING); + } + } + if (annotation instanceof PDAnnotationFileAttachment) { + PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; + String subtype = "annotationFileAttachment"; + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "source", "source", "CDATA", subtype); + processDocOnAction("", subtype, fann.getFile(), attributes); + } else if (annotation instanceof PDAnnotationWidget) { + handleWidget((PDAnnotationWidget) annotation); + } else { + if (annotationSubtype == null) { + annotationSubtype = "unknown"; + } else if (annotationSubtype.equals(THREE_D) || + annotation.getCOSObject().containsKey(THREE_DD)) { + //To make this stricter, we could get the 3DD stream object and see if the + //subtype is U3D or PRC or model/ (prefix for model mime type) + metadata.set(PDF.HAS_3D, true); + num3DAnnotations++; + } + for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "source", "source", "CDATA", annotationSubtype); + processDocOnAction("", annotationSubtype, createFileSpecification(fileSpec), + attributes); + } + } + // TODO: remove once PDFBOX-1143 is fixed: + if (config.isExtractAnnotationText()) { + PDActionURI uri = getActionURI(annotation); + if (uri != null) { + String link = uri.getURI(); + if (link != null && !link.isBlank()) { + xhtml.startElement("div", "class", "annotation"); + xhtml.startElement("a", "href", link); + xhtml.characters(link); + xhtml.endElement("a"); + xhtml.endElement("div"); + } + } + + if (annotation instanceof PDAnnotationMarkup) { + PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; + String title = annotationMarkup.getTitlePopup(); + String subject = annotationMarkup.getSubject(); + String contents = annotationMarkup.getContents(); + // TODO: maybe also annotationMarkup.getRichContents()? + if (title != null || subject != null || contents != null) { + xhtml.startElement("div", "class", "annotation"); + + if (title != null) { + xhtml.startElement("div", "class", "annotationTitle"); + xhtml.characters(title); + xhtml.endElement("div"); + } + + if (subject != null) { + xhtml.startElement("div", "class", "annotationSubject"); + xhtml.characters(subject); + xhtml.endElement("div"); + } + + if (contents != null) { + xhtml.startElement("div", "class", "annotationContents"); + xhtml.characters(contents); + xhtml.endElement("div"); + } + + xhtml.endElement("div"); + } + } + } + } + private List<COSDictionary> findFileSpecs(COSDictionary cosDict) { Set<COSName> types = new HashSet<>(); types.add(COSName.FILESPEC); @@ -906,6 +911,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { protected void startDocument(PDDocument pdf) throws IOException { try { xhtml.startDocument(); + extractJavaScript(pdf); try { handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN); @@ -918,6 +924,57 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } + private void extractJavaScript(PDDocument pdf) throws SAXException { + if (! config.isExtractActions()) { + return; + } + if (pdf.getDocumentCatalog() == null || pdf.getDocumentCatalog().getNames() == null + || pdf.getDocumentCatalog().getNames().getJavaScript() == null) { + return; + } + try { + PDJavascriptNameTreeNode pdjntn = pdf.getDocumentCatalog().getNames().getJavaScript(); + addJavaScript(pdjntn.getNames()); + int depth = 0; + processJavascriptNameTreeNodeKids(pdjntn.getKids(), depth + 1); + } catch (IOException e) { + //swallow + } + } + + private void addJavaScript(Map<String, PDActionJavaScript> pdActionJavaScriptMap) throws IOException, SAXException { + for (Map.Entry<String, PDActionJavaScript> e : pdActionJavaScriptMap.entrySet()) { + String action = e.getValue().getAction(); + if (StringUtils.isBlank(action)) { + return; + } + AttributesImpl attributes = new AttributesImpl(); + + addNonNullAttribute("trigger", "namesTree", attributes); + addNonNullAttribute("type", e.getValue().getClass().getSimpleName(), attributes); + + processJavaScriptAction("NAMES_TREE", e.getKey(), e.getValue(), attributes); + } + + } + + private void processJavascriptNameTreeNodeKids(List<PDNameTreeNode<PDActionJavaScript>> kids, int depth) throws IOException, SAXException { + + if (kids == null) { + return; + } + + if (depth > MAX_RECURSION_DEPTH) { + //hit max recursion + //return silently + return; + } + for (PDNameTreeNode<PDActionJavaScript> pdntn: kids) { + addJavaScript(pdntn.getNames()); + processJavascriptNameTreeNodeKids(pdntn.getKids(), depth + 1); + }; + } + private void handleDestinationOrAction(PDDestinationOrAction action, ActionTrigger actionTrigger) throws IOException, SAXException, TikaException { @@ -952,25 +1009,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action; processDocOnAction("", "", remoteGoTo.getFile(), attributes); } else if (action instanceof PDActionJavaScript) { - PDActionJavaScript jsAction = (PDActionJavaScript) action; - Metadata m = new Metadata(); - m.set(Metadata.CONTENT_TYPE, "application/javascript"); - m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString()); - m.set(PDF.ACTION_TRIGGER, actionTrigger.toString()); - m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.MACRO.name()); - String js = jsAction.getAction(); - js = (js == null) ? "" : js; - if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { - try (TikaInputStream tis = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) { - embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); - } - } - addNonNullAttribute("class", "javascript", attributes); - addNonNullAttribute("type", jsAction.getType(), attributes); - addNonNullAttribute("subtype", jsAction.getSubType(), attributes); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); + processJavaScriptAction(actionTrigger.name(), null, (PDActionJavaScript) action, attributes); /*} else if (action instanceof PDActionSubmitForm) { PDActionSubmitForm submitForm = (PDActionSubmitForm) action; //these are typically urls, not actual file specification @@ -982,6 +1021,31 @@ class AbstractPDF2XHTML extends PDFTextStripper { } } + private void processJavaScriptAction(String trigger, String jsActionName, PDActionJavaScript jsAction, AttributesImpl attrs) throws IOException, SAXException { + Metadata m = new Metadata(); + m.set(Metadata.CONTENT_TYPE, "application/javascript"); + m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString()); + m.set(PDF.ACTION_TRIGGER, trigger); + m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.name()); + if (! StringUtils.isBlank(jsActionName)) { + m.set(PDF.JS_NAME, jsActionName); + } + String js = jsAction.getAction(); + js = (js == null) ? "" : js; + if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { + try (TikaInputStream tis = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); + } + } + ; + addNonNullAttribute("class", "javascript", attrs); + addNonNullAttribute("type", jsAction.getType(), attrs); + addNonNullAttribute("subtype", jsAction.getSubType(), attrs); + xhtml.startElement("div", attrs); + xhtml.endElement("div"); + } + @Override protected void endDocument(PDDocument pdf) throws IOException { try { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 49b0042cb..52bf80129 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -25,10 +25,14 @@ import static org.junit.jupiter.api.Assertions.fail; import static org.junit.jupiter.api.Assumptions.assumeTrue; import java.io.InputStream; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; @@ -55,6 +59,7 @@ import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.parser.xml.XMLProfiler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.apache.tika.utils.StringUtils; public class PDFParserTest extends TikaTest { public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN; @@ -251,7 +256,7 @@ public class PDFParserTest extends TikaTest { @Test public void testEmbeddedDocsWithOCROnly() throws Exception { - assumeTrue(canRunOCR(), "can run OCR"); + assumeTrue(canRunOCR(), "can't run OCR"); //test default is "auto" assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, new PDFParserConfig().getOcrStrategy()); testStrategy(null); @@ -367,7 +372,7 @@ public class PDFParserTest extends TikaTest { @Test public void testJBIG2OCROnly() throws Exception { - assumeTrue(canRunOCR(), "can run OCR"); + assumeTrue(canRunOCR(), "can't run OCR"); PDFParserConfig config = new PDFParserConfig(); config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); ParseContext context = new ParseContext(); @@ -379,7 +384,7 @@ public class PDFParserTest extends TikaTest { @Test public void testJPEG2000() throws Exception { - assumeTrue(canRunOCR(), "can run OCR"); + assumeTrue(canRunOCR(), "can't run OCR"); PDFParserConfig config = new PDFParserConfig(); config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); ParseContext context = new ParseContext(); @@ -391,7 +396,7 @@ public class PDFParserTest extends TikaTest { @Test public void testOCRAutoMode() throws Exception { - assumeTrue(canRunOCR(), "can run OCR"); + assumeTrue(canRunOCR(), "can't run OCR"); //default assertContains("Happy New Year", getXML("testOCR.pdf").xml); @@ -410,7 +415,7 @@ public class PDFParserTest extends TikaTest { @Test public void testOCRNoText() throws Exception { - assumeTrue(canRunOCR(), "can run OCR"); + assumeTrue(canRunOCR(), "can't run OCR"); PDFParserConfig config = new PDFParserConfig(); config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL); config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); @@ -433,7 +438,7 @@ public class PDFParserTest extends TikaTest { //TIKA-2970 -- make sure that configurations set on the TesseractOCRParser //make it through to when the TesseractOCRParser is called via //the PDFParser - assumeTrue(canRunOCR(), "can run OCR"); + assumeTrue(canRunOCR(), "can't run OCR"); //via the config, tesseract should skip this file because it is too large try (InputStream is = getResourceAsStream( @@ -458,8 +463,8 @@ public class PDFParserTest extends TikaTest { public void testMuPDFInOCR() throws Exception { //TODO -- need to add "rendered by" to confirm that mutool was actually called //and that there wasn't some backoff to PDFBox the PDFParser - assumeTrue(canRunOCR(), "can run OCR"); - assumeTrue(hasMuPDF(), "has mupdf"); + assumeTrue(canRunOCR(), "can't run OCR"); + assumeTrue(hasMuPDF(), "does not have mupdf"); try (InputStream is = getResourceAsStream( "/configs/tika-rendering-mupdf-config.xml")) { assertNotNull(is); @@ -508,4 +513,55 @@ public class PDFParserTest extends TikaTest { assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(), metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); } + + @Test + public void testJavascriptInNamesTreeOne() throws Exception { + PDFParserConfig config = new PDFParserConfig(); + config.setExtractActions(true); + ParseContext pc = new ParseContext(); + pc.set(PDFParserConfig.class, config); + List<Metadata> metadataList = getRecursiveMetadata("testPDFPackage.pdf", pc, true); + assertEquals(4, metadataList.size()); + //look for markup in primary document + Metadata m = metadataList.get(0); + String xhtml = m.get(TikaCoreProperties.TIKA_CONTENT); + Matcher matcher = Pattern.compile("<div ([^>]{0,1000})>").matcher(xhtml); + boolean found = false; + while (matcher.find()) { + String div = matcher.group(1); + if (div.contains("trigger=\"namesTree\"")) { + assertContains("type=\"PDActionJavaScript\"", div); + assertContains("class=\"javascript\"", div); + assertContains("subtype=\"JavaScript\"", div); + found = true; + } + } + if (! found) { + fail("failed to find js div in main document"); + } + //now test js extraction + Metadata js = metadataList.get(1); + assertEquals("MACRO", js.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals("NAMES_TREE", js.get(PDF.ACTION_TRIGGER)); + assertTrue(js.get(PDF.JS_NAME).startsWith("ADBE::FileAttachmentsCompatibility")); + assertContains("app.viewerVersion", js.get(TikaCoreProperties.TIKA_CONTENT)); + } + + @Test + public void testJavascriptInNamesTreeTwo() throws Exception { + Set<String> expected = Set.of("!ADBE::0200_VersChkCode_XFACheck", "!ADBE::0100_VersChkVars", "!ADBE::0100_VersChkStrings"); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractActions(true); + ParseContext pc = new ParseContext(); + pc.set(PDFParserConfig.class, config); + List<Metadata> metadataList = getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", pc, true); + Set<String> jsNames = new HashSet<>(); + for (Metadata m : metadataList) { + String n = m.get(PDF.JS_NAME); + if (!StringUtils.isBlank(n)) { + jsNames.add(n); + } + } + assertEquals(expected, jsNames); + } }
