[jira] [Commented] (TIKA-4211) Tika extractor fails to extract embedded excel from pptx

Tim Allison (Jira) Thu, 14 Mar 2024 13:02:32 -0700


    [ 
https://issues.apache.org/jira/browse/TIKA-4211?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17827233#comment-17827233
 ]


Tim Allison commented on TIKA-4211:
-----------------------------------

Or, if you grep for "embeddings" in the uncompressed zip, can you find a 
link to the xlsx file?

> Tika extractor fails to extract embedded excel from pptx
> --------------------------------------------------------
>
>                 Key: TIKA-4211
>                 URL: https://issues.apache.org/jira/browse/TIKA-4211
>             Project: Tika
>          Issue Type: Bug
>            Reporter: Xiaohong Yang
>            Priority: Major
>         Attachments: config_and_sample_file.zip
>
>
> We use org.apache.tika.extractor.EmbeddedDocumentExtractor to get embedded 
> excel from PowerPoint presentation.  It works with most pptx files. But it 
> fails to detect the embedded excel with some pptx files.
> Following is the sample code and attached is the tika-config.xml and a pptx 
> file that works.
> We cannot provide the pptx file that does not work because it is client data.
> We noticed a difference between the pptx files that work and the pptx file 
> that does not work:  
> "{*}Worksheet Object{*}" *is in the popup menu when the embedded Excel object 
> is right-clicked in the pptx files that work.*
> "{*}Edit Data{*}" *is in the popup menu when the embedded Excel object is 
> right-clicked in the pptx file that does not work. This file might be created 
> with an old version of PowerPoint.*
>  
> The operating system is Ubuntu 20.04. Java version is 17.  Tika version is 
> 2.9.1 and POI version is 5.2.3. 
>  
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>  
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Path;
>  
> public class ExtractExcelFromPowerPoint {
>     private final Path pptxFile = new 
> File("/home/ubuntu/testdirs/testdir_pptx/sample.pptx").toPath();
>     private final Path outputDir = new 
> File("/home/ubuntu/testdirs/testdir_pptx/tika_output/").toPath();
>  
>     private Parser parser;
>     private ParseContext context;
>  
>  
>     public static void main(String args[]) {
>         try {
>             new ExtractExcelFromPowerPoint().process();
>         }
>         catch(Exception ex) {
>             ex.printStackTrace();
>         }
>     }
>  
>     public ExtractExcelFromPowerPoint() {
>     }
>  
>     public void process() throws Exception {
>         TikaConfig config = new 
> TikaConfig("/home/ubuntu/testdirs/testdir_pptx/tika-config.xml");
>         FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor = new 
> FileEmbeddedDocumentExtractor();
>  
>         parser = new AutoDetectParser(config);
>         context = new ParseContext();
>         context.set(Parser.class, parser);
>         context.set(TikaConfig.class, config);
>         context.set(EmbeddedDocumentExtractor.class, 
> fileEmbeddedDocumentExtractor);
>  
>         URL url = pptxFile.toUri().toURL();
>         Metadata metadata = new Metadata();
>         try (InputStream input = TikaInputStream.get(url, metadata)) {
>             ContentHandler handler = new DefaultHandler();
>             parser.parse(input, handler, metadata, context);
>         }
>     }
>  
>     private class FileEmbeddedDocumentExtractor implements 
> EmbeddedDocumentExtractor {
>         private int count = 0;
>  
>         public boolean shouldParseEmbedded(Metadata metadata) {
>             return true;
>         }
>  
>         public void parseEmbedded(InputStream inputStream, ContentHandler 
> contentHandler, Metadata metadata,
>                                   boolean outputHtml) throws SAXException, 
> IOException {
>             String fullFileName = 
> metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
>             if (fullFileName == null) {
>                 fullFileName = "file" + count++;
>             }
>  
>             String[] fileNameSplit = fullFileName.split("/");
>             String fileName = fileNameSplit[fileNameSplit.length - 1];
>             File outputFile = new File(outputDir.toFile(), 
> FilenameUtils.normalize(fileName));
>             System.out.println("Extracting '" + fileName + " to " + 
> outputFile);
>             FileOutputStream os = null;
>             try {
>                 os = new FileOutputStream(outputFile);
>                 if (inputStream instanceof TikaInputStream tin) {
>                     if (tin.getOpenContainer() instanceof DirectoryEntry) {
>                         try(POIFSFileSystem fs = new POIFSFileSystem()){
>                             copy((DirectoryEntry) tin.getOpenContainer(), 
> fs.getRoot());
>                             fs.writeFilesystem(os);
>                         }
>                     } else {
>                         IOUtils.copy(inputStream, os);
>                     }
>                 } else {
>                     IOUtils.copy(inputStream, os);
>                 }
>             } catch (Exception ex) {
>                 ex.printStackTrace();
>             } finally {
>                 if (os != null) {
>                     os.flush();
>                     os.close();
>                 }
>             }
>         }
>  
>         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
> throws IOException {
>             for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
>                 if (entry instanceof DirectoryEntry) {
>                     // Need to recurse
>                     DirectoryEntry newDir = 
> destDir.createDirectory(entry.getName());
>                     copy((DirectoryEntry) entry, newDir);
>                 } else {
>                     // Copy entry
>                     try (InputStream contents = new 
> DocumentInputStream((DocumentEntry) entry)) {
>                         destDir.createDocument(entry.getName(), contents);
>                     }
>                 }
>             }
>         }
>     }
> }
> [^config_and_sample_file.zip]



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

[jira] [Commented] (TIKA-4211) Tika extractor fails to extract embedded excel from pptx

Reply via email to