[ 
https://issues.apache.org/jira/browse/TIKA-4124?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17762216#comment-17762216
 ] 

Hudson commented on TIKA-4124:
------------------------------

SUCCESS: Integrated in Jenkins build Tika » tika-main-jdk11 #1230 (See 
[https://ci-builds.apache.org/job/Tika/job/tika-main-jdk11/1230/])
TIKA-4124 -- extract alternate format chunk from ooxml (#1317) (github: 
[https://github.com/apache/tika/commit/f6290858bae72ed1c561ce75812c577e6b736a32])
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
* (edit) 
tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
* (edit) 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java


> embedded html of type 
> http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk 
> is not parsed
> ---------------------------------------------------------------------------------------------------------------
>
>                 Key: TIKA-4124
>                 URL: https://issues.apache.org/jira/browse/TIKA-4124
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>            Reporter: Tim Barrett
>            Priority: Minor
>
> Word documents that may have been created using third-party programs such as 
> docx4j sometimes contain embedded HTML. This is not parsed by Tika. The 
> embedded HTML file usually resides within the main folder of the docx 
> internal structure.
> Changing the code in 
> org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.handleEmbeddedPart()
> as follows handles this (see the final else-if branch):
>  
> if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)
>         && TYPE_OLE_OBJECT.equals(target.getContentType())) {
>     handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), parentMetadata);
>     if (targetURI != null) {
>         handledTarget.add(targetURI.toString());
>     }
> } else if (RELATION_MEDIA.equals(type)
>         || RELATION_VIDEO.equals(type)
>         || RELATION_AUDIO.equals(type)
>         || PackageRelationshipTypes.IMAGE_PART.equals(type)
>         || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type)
>         || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
>     handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
>     if (targetURI != null) {
>         handledTarget.add(targetURI.toString());
>     }
> } else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
>     handleMacros(target, xhtml);
>     if (targetURI != null) {
>         handledTarget.add(targetURI.toString());
>     }
> } else if (type.endsWith("aFChunk")) {
>
>     handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
>
> }



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to