[
https://issues.apache.org/jira/browse/TIKA-4398?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17946339#comment-17946339
]
Tilman Hausherr edited comment on TIKA-4398 at 4/22/25 8:26 AM:
----------------------------------------------------------------
It worked for me... I didn't use the additional dependencies; maybe retry with
a separate project. Here's the code I used, which differs slightly in the
loading and in the output:
{code:java}
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.audio.AudioParser;
import org.apache.tika.parser.audio.MidiParser;
import org.apache.tika.parser.mp3.Mp3Parser;
import org.apache.tika.parser.mp4.MP4Parser;
import org.apache.tika.parser.pkg.CompressorParser;
import org.apache.tika.parser.pkg.RarParser;
import org.apache.tika.parser.video.FLVParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Stand-alone reproduction program for TIKA-4398.
 *
 * Parses the docx file attached to the issue with a DefaultParser from which
 * a list of parser classes has been excluded, then prints whether the
 * expected text was extracted and dumps all metadata.
 *
 * NOTE(review): main() uses nothing inherited from AutoDetectParser; the
 * superclass is kept only so the class declaration stays as posted.
 */
public class TIKA4398docx_2 extends AutoDetectParser
{
    /**
     * Downloads the sample document, parses it and prints the result.
     *
     * @param args command line arguments, not used
     * @throws IOException if downloading or reading the document fails
     * @throws URISyntaxException if the attachment URL is not a valid URI
     * @throws SAXException if the content handler fails during the parse
     * @throws TikaException if Tika cannot parse the document
     */
    public static void main(String[] args) throws IOException,
            URISyntaxException, SAXException, TikaException
    {
        // Parser classes that are left out of the DefaultParser built below.
        List<Class<? extends Parser>> excludeParsers = Arrays.asList(
                MP4Parser.class,
                AudioParser.class,
                Mp3Parser.class,
                MidiParser.class,
                FLVParser.class,
                CompressorParser.class,
                RarParser.class
        );

        TikaConfig config = TikaConfig.getDefaultConfig();
        Parser myParser = new DefaultParser(config.getMediaTypeRegistry(),
                new ServiceLoader(), excludeParsers);
        Parser parser = new AutoDetectParser(config.getDetector(), myParser);

        ContentHandler contentHandler = new BodyContentHandler();
        Metadata meta = new Metadata();
        ParseContext context = new ParseContext();
        // Make the same parser available through the parse context.
        context.set(Parser.class, parser);

        // Read the whole attachment into memory first; try-with-resources
        // makes sure the network stream is closed even if reading fails.
        byte[] ba;
        try (InputStream is = new URI(
                "https://issues.apache.org/jira/secure/attachment/13076074/01.docx")
                .toURL().openStream())
        {
            ba = is.readAllBytes();
        }

        InputStream stream = new ByteArrayInputStream(ba);
        parser.parse(stream, contentHandler, meta, context);

        System.out.println("Extracted? " +
                (contentHandler.toString().contains("RESUME") ? "yes" : "no"));

        for (String name : meta.names())
        {
            // Fetch the values once instead of once per use.
            String[] values = meta.getValues(name);
            if (values.length > 1)
            {
                // get() brings only the first one, so print the full array.
                System.out.println(name + ": " + Arrays.toString(values));
            }
            else
            {
                System.out.println(name + ": " + meta.get(name));
            }
        }
    }
}
{code}
The output with 3.1.0:
{noformat}
Extracted? yes
cp:revision: 31
meta:paragraph-count: 1
meta:word-count: 11
extended-properties:Application: WPS
Office_10.1.0.7400_F1E327BC-269C-435d-A152-05C5408002CA
meta:last-author: WPS_1528193819
X-TIKA:Parsed-By-Full-Set: [org.apache.tika.parser.DefaultParser,
org.apache.tika.parser.microsoft.ooxml.OOXMLParser,
org.apache.tika.parser.image.ImageParser]
dc:creator: Administrator
xmpTPg:NPages: 4
dcterms:created: 2017-09-28T08:20:00Z
meta:line-count: 1
dcterms:modified: 2018-07-24T04:40:31Z
meta:character-count: 68
extended-properties:Template: ?2017???????????????.docx
meta:character-count-with-spaces: 78
X-TIKA:Parsed-By: [org.apache.tika.parser.DefaultParser,
org.apache.tika.parser.microsoft.ooxml.OOXMLParser]
extended-properties:DocSecurityString: None
extended-properties:TotalTime: 4
meta:page-count: 4
Content-Type:
application/vnd.openxmlformats-officedocument.wordprocessingml.document
custom:KSOProductBuildVer: 2052-10.1.0.7400
{noformat}
was (Author: tilman):
It worked for me... I didn't use the additional dependencies, maybe retry with
a separate project. Here's the code I used, which is slightly different in
loading and in the output:
{code:java}
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.List;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.audio.AudioParser;
import org.apache.tika.parser.audio.MidiParser;
import org.apache.tika.parser.mp3.Mp3Parser;
import org.apache.tika.parser.mp4.MP4Parser;
import org.apache.tika.parser.pkg.CompressorParser;
import org.apache.tika.parser.pkg.RarParser;
import org.apache.tika.parser.video.FLVParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Stand-alone reproduction program for TIKA-4398.
 *
 * Parses the docx file attached to the issue with a DefaultParser from which
 * a list of parser classes has been excluded, then prints whether the
 * expected text was extracted and dumps all metadata.
 *
 * NOTE(review): main() uses nothing inherited from AutoDetectParser; the
 * superclass is kept only so the class declaration stays as posted.
 */
public class TIKA4398docx_2 extends AutoDetectParser
{
    /**
     * Downloads the sample document, parses it and prints the result.
     *
     * @param args command line arguments, not used
     * @throws IOException if downloading or reading the document fails
     * @throws URISyntaxException if the attachment URL is not a valid URI
     * @throws SAXException if the content handler fails during the parse
     * @throws TikaException if Tika cannot parse the document
     */
    public static void main(String[] args) throws IOException,
            URISyntaxException, SAXException, TikaException
    {
        // Parser classes that are left out of the DefaultParser built below.
        List<Class<? extends Parser>> excludeParsers = Arrays.asList(
                MP4Parser.class,
                AudioParser.class,
                Mp3Parser.class,
                MidiParser.class,
                FLVParser.class,
                CompressorParser.class,
                RarParser.class
        );

        TikaConfig config = TikaConfig.getDefaultConfig();
        Parser myParser = new DefaultParser(config.getMediaTypeRegistry(),
                new ServiceLoader(), excludeParsers);
        Parser parser = new AutoDetectParser(config.getDetector(), myParser);

        ContentHandler contentHandler = new BodyContentHandler();
        Metadata meta = new Metadata();
        ParseContext context = new ParseContext();
        // Make the same parser available through the parse context.
        context.set(Parser.class, parser);

        // Read the whole attachment into memory first; try-with-resources
        // makes sure the network stream is closed even if reading fails.
        byte[] ba;
        try (InputStream is = new URI(
                "https://issues.apache.org/jira/secure/attachment/13076074/01.docx")
                .toURL().openStream())
        {
            ba = is.readAllBytes();
        }

        InputStream stream = new ByteArrayInputStream(ba);
        parser.parse(stream, contentHandler, meta, context);

        System.out.println("Extracted? " +
                (contentHandler.toString().contains("RESUME") ? "yes" : "no"));

        for (String name : meta.names())
        {
            // Fetch the values once instead of once per use.
            String[] values = meta.getValues(name);
            if (values.length > 1)
            {
                // get() brings only the first one, so print the full array.
                System.out.println(name + ": " + Arrays.toString(values));
            }
            else
            {
                System.out.println(name + ": " + meta.get(name));
            }
        }
    }
}
{code}
> When extracting a docx file with Tika 3.1.0, the package parser was detected
> instead of the OOXML parser
> --------------------------------------------------------------------------------------------------------
>
> Key: TIKA-4398
> URL: https://issues.apache.org/jira/browse/TIKA-4398
> Project: Tika
> Issue Type: Bug
> Components: tika-core
> Affects Versions: 3.1.0
> Environment: java17
> Reporter: mannixli
> Priority: Major
> Attachments: 01.docx, image-2025-04-16-20-46-07-228.png,
> image-2025-04-22-11-26-09-936.png, image-2025-04-22-11-27-33-655.png,
> image-2025-04-22-11-37-15-401.png
>
>
> 3.0.0 detected ooxml parser
--
This message was sent by Atlassian Jira
(v8.20.10#820010)