[ https://issues.apache.org/jira/browse/TIKA-4211?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17829597#comment-17829597 ]
Xiaohong Yang commented on TIKA-4211: ------------------------------------- Hi Tim, I ran the following command and the xlsx is in the result json: java -jar tika-app-3.0.0-20240321.135818-429.jar -J -t 2020_Capacity_Ramp_Plan.pptx Here is the related part of the json [ { "cp:revision": "8", "extended-properties:AppVersion": "16.0000", "meta:paragraph-count": "278", "meta:word-count": "465", "extended-properties:PresentationFormat": "Widescreen", "extended-properties:Application": "Microsoft Office PowerPoint", "meta:last-author": "Kenneth Nip", "X-TIKA:Parsed-By-Full-Set": [ "org.apache.tika.parser.DefaultParser", "org.apache.tika.parser.microsoft.ooxml.OOXMLParser", "org.apache.tika.parser.image.JpegParser", "org.apache.tika.parser.ocr.TesseractOCRParser" ], "X-TIKA:content_handler": "ToTextContentHandler", "dc:creator": "Kenneth Nip", "meta:slide-count": "3", "xmpTPg:NPages": "3", "resourceName": "2020_Capacity_Ramp_Plan.pptx", "dcterms:created": "2020-01-04T05:19:17Z", "dcterms:modified": "2020-01-06T07:58:18Z", "X-TIKA:Parsed-By": [ "org.apache.tika.parser.DefaultParser", "org.apache.tika.parser.microsoft.ooxml.OOXMLParser" ], "dc:title": "PowerPoint Presentation", "extended-properties:DocSecurityString": "None", "extended-properties:TotalTime": "342", "X-TIKA:parse_time_millis": "1223", "X-TIKA:embedded_depth": "0", "X-TIKA:content": "…… / Peter\t\t\t\n\n\n\nMicrosoft_Excel_Worksheet.xlsx\n\n\n", "Content-Length": "144945", "Content-Type": "application/vnd.openxmlformats-officedocument.presentationml.presentation" }, { "extended-properties:AppVersion": "16.0300", "extended-properties:Application": "Microsoft Excel", "meta:last-author": "Kenneth Nip", "X-TIKA:embedded_id_path": "/1", "X-TIKA:content_handler": "ToTextContentHandler", "dc:creator": "Kenneth Nip", "extended-properties:Company": "", "meta:print-date": "2019-11-06T23:43:22Z", "resourceName": "Microsoft_Excel_Worksheet.xlsx", "dcterms:created": "2019-10-30T16:50:00Z", "dcterms:modified": 
"2020-01-06T07:29:13Z", "X-TIKA:origResourceName": "C:\\Users\\kenrw\\Downloads\\", "embeddedRelationshipId": "rId3", "protected": "false", "embeddedResourceType": "ATTACHMENT", "X-TIKA:Parsed-By": [ "org.apache.tika.parser.DefaultParser", "org.apache.tika.parser.microsoft.ooxml.OOXMLParser" ], "extended-properties:DocSecurityString": "None", "X-TIKA:embedded_depth": "1", "X-TIKA:parse_time_millis": "376", "X-TIKA:content": "…………..", "X-TIKA:embedded_resource_path": "/Microsoft_Excel_Worksheet.xlsx", "X-TIKA:embedded_id": "1", "Content-Type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "dc:publisher": "" }, … ] > Tika extractor fails to extract embedded excel from pptx > -------------------------------------------------------- > > Key: TIKA-4211 > URL: https://issues.apache.org/jira/browse/TIKA-4211 > Project: Tika > Issue Type: Bug > Reporter: Xiaohong Yang > Priority: Major > Attachments: config_and_sample_file.zip > > > We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded > excel from PowerPoint presentation. It works with most pptx files. But it > fails to detect the embedded excel with some pptx files. > Following is the sample code and attached is the tika-config.xml and a pptx > file that works. > We cannot provide the pptx file that does not work because it is client data. > We noticed a difference between the pptx files that work and the pptx file > that does not work: > "{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object > is right-clicked in the pptx files that work.* > "{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is > right-clicked in the pptx file that does not work. This file might be created > with an old version of PowerPoint.* > > The operating system is Ubuntu 20.04. Java version is 17. Tika version is > 2.9.1 and POI version is 5.2.3. 
> > import org.apache.pdfbox.io.IOUtils; > import org.apache.poi.poifs.filesystem.DirectoryEntry; > import org.apache.poi.poifs.filesystem.DocumentEntry; > import org.apache.poi.poifs.filesystem.DocumentInputStream; > import org.apache.poi.poifs.filesystem.POIFSFileSystem; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.extractor.EmbeddedDocumentExtractor; > import org.apache.tika.io.FilenameUtils; > import org.apache.tika.io.TikaInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.metadata.TikaCoreProperties; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.xml.sax.ContentHandler; > import org.xml.sax.SAXException; > import org.xml.sax.helpers.DefaultHandler; > > import java.io.*; > import java.net.URL; > import java.nio.file.Path; > > public class ExtractExcelFromPowerPoint { > private final Path pptxFile = new > File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath(); > private final Path outputDir = new > File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath(); > > private Parser parser; > private ParseContext context; > > > public static void main(String args[]) { > try { > new ExtractExcelFromPowerPoint().process(); > } > catch(Exception ex) { > ex.printStackTrace(); > } > } > > public ExtractExcelFromPowerPoint() { > } > > public void process() throws Exception { > TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml"); > FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new > FileEmbeddedDocumentExtractor(); > > parser = new AutoDetectParser(config); > context = new ParseContext(); > context.set(Parser.class, parser); > context.set(TikaConfig.class, config); > context.set(EmbeddedDocumentExtractor.class, > fileEmbeddedDocumentExtractor); > > URL url = pptxFile.toUri().toURL(); > Metadata metadata = new Metadata(); > try (InputStream input = 
TikaInputStream.get(url, metadata)) { > ContentHandler handler = new DefaultHandler(); > parser.parse(input, handler, metadata, context); > } > } > > private class FileEmbeddedDocumentExtractor implements > EmbeddedDocumentExtractor { > private int count = 0; > > public boolean shouldParseEmbedded(Metadata metadata) { > return true; > } > > public void parseEmbedded(InputStream inputStream, ContentHandler > contentHandler, Metadata metadata, > boolean outputHtml) throws SAXException, > IOException { > String fullFileName = > metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); > if (fullFileName == null) { > fullFileName = "file" + count++; > } > > String[] fileNameSplit = fullFileName.split("/"); > String fileName = fileNameSplit[fileNameSplit.length - 1]; > File outputFile = new File(outputDir.toFile(), > FilenameUtils.normalize(fileName)); > System.out.println("Extracting '" + fileName + " to " + > outputFile); > FileOutputStream os = null; > try { > os = new FileOutputStream(outputFile); > if (inputStream instanceof TikaInputStream tin) { > if (tin.getOpenContainer() instanceof DirectoryEntry) { > try(POIFSFileSystem fs = new POIFSFileSystem()){ > copy((DirectoryEntry) tin.getOpenContainer(), > fs.getRoot()); > fs.writeFilesystem(os); > } > } else { > IOUtils.copy(inputStream, os); > } > } else { > IOUtils.copy(inputStream, os); > } > } catch (Exception ex) { > ex.printStackTrace(); > } finally { > if (os != null) { > os.flush(); > os.close(); > } > } > } > > protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) > throws IOException { > for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) { > if (entry instanceof DirectoryEntry) { > // Need to recurse > DirectoryEntry newDir = > destDir.createDirectory(entry.getName()); > copy((DirectoryEntry) entry, newDir); > } else { > // Copy entry > try (InputStream contents = new > DocumentInputStream((DocumentEntry) entry)) { > destDir.createDocument(entry.getName(), contents); > } > } > } > } > } 
> } > [^config_and_sample_file.zip] -- This message was sent by Atlassian Jira (v8.20.10#820010)