[ https://issues.apache.org/jira/browse/TIKA-4211?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17827233#comment-17827233 ]
Tim Allison commented on TIKA-4211: ----------------------------------- Or, if you grep for "embeddings" in the uncompressed zip, can you find a link to the xlsx file? > Tika extractor fails to extract embedded excel from pptx > -------------------------------------------------------- > > Key: TIKA-4211 > URL: https://issues.apache.org/jira/browse/TIKA-4211 > Project: Tika > Issue Type: Bug > Reporter: Xiaohong Yang > Priority: Major > Attachments: config_and_sample_file.zip > > > We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded > excel from PowerPoint presentation. It works with most pptx files. But it > fails to detect the embedded excel with some pptx files. > Following is the sample code and attached is the tika-config.xml and a pptx > file that works. > We cannot provide the pptx file that does not work because it is client data. > We noticed a difference between the pptx files that work and the pptx file > that does not work: > "{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object > is right-clicked in the pptx files that work.* > "{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is > right-clicked in the pptx file that does not work. This file might be created > with an old version of PowerPoint.* > > The operating system is Ubuntu 20.04. Java version is 17. Tika version is > 2.9.1 and POI version is 5.2.3. 
> > import org.apache.pdfbox.io.IOUtils; > import org.apache.poi.poifs.filesystem.DirectoryEntry; > import org.apache.poi.poifs.filesystem.DocumentEntry; > import org.apache.poi.poifs.filesystem.DocumentInputStream; > import org.apache.poi.poifs.filesystem.POIFSFileSystem; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.extractor.EmbeddedDocumentExtractor; > import org.apache.tika.io.FilenameUtils; > import org.apache.tika.io.TikaInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.metadata.TikaCoreProperties; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.xml.sax.ContentHandler; > import org.xml.sax.SAXException; > import org.xml.sax.helpers.DefaultHandler; > > import java.io.*; > import java.net.URL; > import java.nio.file.Path; > > public class ExtractExcelFromPowerPoint { > private final Path pptxFile = new > File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath(); > private final Path outputDir = new > File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath(); > > private Parser parser; > private ParseContext context; > > > public static void main(String args[]) { > try { > new ExtractExcelFromPowerPoint().process(); > } > catch(Exception ex) { > ex.printStackTrace(); > } > } > > public ExtractExcelFromPowerPoint() { > } > > public void process() throws Exception { > TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml"); > FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new > FileEmbeddedDocumentExtractor(); > > parser = new AutoDetectParser(config); > context = new ParseContext(); > context.set(Parser.class, parser); > context.set(TikaConfig.class, config); > context.set(EmbeddedDocumentExtractor.class, > fileEmbeddedDocumentExtractor); > > URL url = pptxFile.toUri().toURL(); > Metadata metadata = new Metadata(); > try (InputStream input = 
TikaInputStream.get(url, metadata)) { > ContentHandler handler = new DefaultHandler(); > parser.parse(input, handler, metadata, context); > } > } > > private class FileEmbeddedDocumentExtractor implements > EmbeddedDocumentExtractor { > private int count = 0; > > public boolean shouldParseEmbedded(Metadata metadata) { > return true; > } > > public void parseEmbedded(InputStream inputStream, ContentHandler > contentHandler, Metadata metadata, > boolean outputHtml) throws SAXException, > IOException { > String fullFileName = > metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); > if (fullFileName == null) { > fullFileName = "file" + count++; > } > > String[] fileNameSplit = fullFileName.split("/"); > String fileName = fileNameSplit[fileNameSplit.length - 1]; > File outputFile = new File(outputDir.toFile(), > FilenameUtils.normalize(fileName)); > System.out.println("Extracting '" + fileName + " to " + > outputFile); > FileOutputStream os = null; > try { > os = new FileOutputStream(outputFile); > if (inputStream instanceof TikaInputStream tin) { > if (tin.getOpenContainer() instanceof DirectoryEntry) { > try(POIFSFileSystem fs = new POIFSFileSystem()){ > copy((DirectoryEntry) tin.getOpenContainer(), > fs.getRoot()); > fs.writeFilesystem(os); > } > } else { > IOUtils.copy(inputStream, os); > } > } else { > IOUtils.copy(inputStream, os); > } > } catch (Exception ex) { > ex.printStackTrace(); > } finally { > if (os != null) { > os.flush(); > os.close(); > } > } > } > > protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) > throws IOException { > for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) { > if (entry instanceof DirectoryEntry) { > // Need to recurse > DirectoryEntry newDir = > destDir.createDirectory(entry.getName()); > copy((DirectoryEntry) entry, newDir); > } else { > // Copy entry > try (InputStream contents = new > DocumentInputStream((DocumentEntry) entry)) { > destDir.createDocument(entry.getName(), contents); > } > } > } > } > } 
> } > [^config_and_sample_file.zip] -- This message was sent by Atlassian Jira (v8.20.10#820010)