[ 
https://issues.apache.org/jira/browse/TIKA-4207?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17831840#comment-17831840
 ] 

Hudson commented on TIKA-4207:
------------------------------

SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk11 #1580 (See 
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk11/1580/])
TIKA-4207: Add handling of embedded bytes to tika-pipes (#1699) (github: 
[https://github.com/apache/tika/commit/4fe7312330c430f357012f8d0ff886a0fb344783])
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
* (add) 
tika-pipes/tika-async-cli/src/test/resources/configs/TIKA-4207-emitter.xml
* (edit) 
tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
* (edit) 
tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
* (edit) tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
* (add) tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
* (edit) tika-pipes/tika-pipes-iterators/pom.xml
* (edit) tika-pipes/tika-async-cli/pom.xml
* (add) tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/pom.xml
* (edit) 
tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
* (add) tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207.xml
* (add) 
tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml
* (delete) 
tika-core/src/test/java/org/apache/tika/pipes/async/AsyncProcessorTest.java
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
* (add) 
tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
* (add) 
tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test.json
* (edit) 
tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/AsyncResource.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.xml
* (add) 
tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
* (delete) tika-pipes/tika-async-cli/src/test/resources/tika-config-broken.xml
* (add) 
tika-pipes/tika-async-cli/src/test/resources/configs/tika-config-broken.xml
* (add) 
tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/java/org/apache/tika/pipes/pipesiterator/json/TestJsonPipesIterator.java
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
* (add) 
tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/main/java/org/apache/tika/pipes/pipesiterator/json/JsonPipesIterator.java
* (edit) 
tika-serialization/src/main/java/org/apache/tika/metadata/serialization/JsonFetchEmitTuple.java
* (edit) tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
* (edit) 
tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/TikaAsyncCLITest.java
* (edit) 
tika-serialization/src/test/java/org/apache/tika/metadata/serialization/JsonFetchEmitTupleTest.java
* (add) 
tika-pipes/tika-pipes-iterators/tika-pipes-iterator-json/src/test/resources/test-documents/test-with-embedded-bytes.json
* (add) tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
* (add) 
tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
* (edit) tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
* (edit) tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
* (edit) 
tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
* (add) 
tika-core/src/test/resources/org/apache/tika/pipes/TIKA-4207-limit-bytes.xml
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
* (add) 
tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
* (edit) tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
* (edit) 
tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
* (edit) tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
* (edit) tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
* (add) 
tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/AsyncProcessorTest.java
* (add) 
tika-pipes/tika-async-cli/src/test/resources/test-documents/basic_embedded.xml
* (add) 
tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml


> PipesParser should have option to extract raw bytes of embedded files
> ---------------------------------------------------------------------
>
>                 Key: TIKA-4207
>                 URL: https://issues.apache.org/jira/browse/TIKA-4207
>             Project: Tika
>          Issue Type: New Feature
>            Reporter: Tim Allison
>            Priority: Major
>             Fix For: 3.0.0
>
>
> There are many use cases where text+metadata are important, but users also 
> need the raw bytes from embedded files.
> Let's make it possible to extract the usual rmeta content _and_ the raw 
> bytes. This is a preliminary step that will offer more customization options 
> than the proposal in TIKA-3703.
> This is targeted to 3.x.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to