# I am trying to index XML files that are packaged inside zip files, using the DataImportHandler (DIH).

# I can already read and index the data from plain XML files like this:

<dataConfig>
    <!-- Default (unnamed) data source. Entities that do not name a dataSource
         of their own read through this one. -->
    <dataSource encoding="UTF-8" type="FileDataSource"/>

    <document>
        <!-- Outer entity: lists files under baseDir (recursive="true") whose
             names match the fileName regex. rootEntity="false" means its rows
             are not indexed as documents; they only feed the nested entity.
             dataSource="null": this entity does not read through a data source. -->
        <entity
            name="pickupdir"
            processor="FileListEntityProcessor"
            rootEntity="false"
            dataSource="null"
            fileName=".*xml"
            baseDir="${solr.install.dir}\example\exampledocs\myFiles"
            recursive="true"
        >
            <!-- Inner entity: opens each listed file (url) and applies the
                 XPaths below, once per forEach match.
                 No dataSource attribute on purpose: the entity falls back to
                 the default data source declared above. DIH attribute names
                 are case-sensitive; the attribute is spelled "dataSource" and
                 its value must name a <dataSource>, not an entity. -->
            <entity
                name="xml"
                processor="XPathEntityProcessor"
                stream="false"
                forEach="/accountingEntries/entryHeader"
                url="${pickupdir.fileAbsolutePath}"
                rootEntity="true"
                pk="lineNumber"
                onError="skip"
            >
                <field column="entryNumberCounter" xpath="/accountingEntries/entryHeader/entryNumberCounter"/>
                <field column="entryNumber" xpath="/accountingEntries/entryHeader/entryNumber"/>
                <field column="enteredDate" xpath="/accountingEntries/entryHeader/enteredDate"/>
                <field column="totalDebit" xpath="/accountingEntries/entryHeader/totalDebit"/>
                <field column="totalCredit" xpath="/accountingEntries/entryHeader/totalCredit"/>
                <!-- NOTE(review): lineNumber sits under entryDetail; if an
                     entryHeader has several entryDetail children this column
                     is presumably multi-valued. Verify it suits pk. -->
                <field column="lineNumber" xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/>
            </entity>
        </entity>
    </document>
</dataConfig>

# How can I add TikaEntityProcessor so the same XML is read from inside the zip files? I tried it this way:

<dataConfig>
    <!-- Default (unnamed) data source. It hands out binary streams, which is
         what TikaEntityProcessor consumes. -->
    <dataSource encoding="UTF-8" type="BinFileDataSource"/>

    <document>
        <!-- Outer entity: lists the zip files directly under baseDir
             (recursive="false"). Its rows only feed the nested entities. -->
        <entity
            name="pickupdir"
            processor="FileListEntityProcessor"
            rootEntity="false"
            dataSource="null"
            fileName=".*zip"
            baseDir="${solr.install.dir}\example\exampledocs\myFiles"
            recursive="false"
        >
            <!-- Runs Tika on each zip and exposes the result as ext.text.
                 NOTE(review): in the debug output quoted after this config,
                 that text is an XHTML page whose package-entry div holds only
                 the entry name; the XML content of the entry is not in it.
                 extractEmbedded="true" should make Tika descend into archive
                 entries, but presumably yields extracted text rather than the
                 original XML tree. TODO confirm. -->
            <entity
                name="ext"
                processor="TikaEntityProcessor"
                url="${pickupdir.fileAbsolutePath}"
                format="xml"
                rootEntity="false"
            >
                <!-- NOTE(review): url still resolves to the .zip itself (the
                     debug output shows that path as this entity's query), so
                     the XPaths run against the archive bytes and never see
                     /accountingEntries. Nothing here consumes ext.text.
                     XPathEntityProcessor also expects a character-based data
                     source, while the only one declared is BinFileDataSource;
                     TODO confirm. With onError="skip" the failure is silent,
                     which matches the empty entity:xml section in the output.
                     Simplest route: unzip the archives before the import and
                     index the .xml files with a FileDataSource plus
                     FileListEntityProcessor/XPathEntityProcessor config.
                     No dataSource attribute on purpose: DIH attribute names
                     are case-sensitive ("dataSource"), and the value must
                     name a <dataSource>, not an entity. -->
                <entity
                    name="xml"
                    processor="XPathEntityProcessor"
                    stream="false"
                    forEach="/accountingEntries/entryHeader"
                    url="${pickupdir.fileAbsolutePath}"
                    rootEntity="true"
                    pk="lineNumber"
                    onError="skip"
                >
                    <field column="entryNumberCounter" xpath="/accountingEntries/entryHeader/entryNumberCounter"/>
                    <field column="entryNumber" xpath="/accountingEntries/entryHeader/entryNumber"/>
                    <field column="enteredDate" xpath="/accountingEntries/entryHeader/enteredDate"/>
                    <field column="totalDebit" xpath="/accountingEntries/entryHeader/totalDebit"/>
                    <field column="totalCredit" xpath="/accountingEntries/entryHeader/totalCredit"/>
                    <field column="lineNumber" xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/>
                </entity>
            </entity>
        </entity>
    </document>
</dataConfig>

# And this is the (debug) result I got:

"----------- row #1-------------",
      "file",
      "YYY.zip",
      "fileSize",
      1124851,
      "fileLastModified",
      "2017-07-21T08:18:23.085Z",
      "fileDir",
      "C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles",
      "fileAbsolutePath",
     
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
      null,
      "---------------------------------------------",
      "entity:ext",
      [
        "query",
       
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
        "time-taken",
        "0:0:0.0",
        null,
        "----------- row #1-------------",
        "text",
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html
xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n<meta
name=\"X-Parsed-By\"
content=\"org.apache.tika.parser.DefaultParser\"/>\r\n<meta
name=\"X-Parsed-By\"
content=\"org.apache.tika.parser.pkg.PackageParser\"/>\r\n<meta
name=\"Content-Type\"
content=\"application/zip\"/>\r\n<title/>\r\n</head>\r\n<body><div
class=\"embedded\" id=\"9860029035-201601-Y-000000.xml\"/>\r\n<div
class=\"package-entry\">
9860029035-201601-Y-000000.xml
\r\n</div>\r\n</body></html>",
        null,
        "---------------------------------------------",
        "entity:xml",
        [
          "document#1",
          [
            "query",
           
"C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip",
            "time-taken",
            "0:0:0.0"

# Please explain how this works and why no documents are produced from the XML inside the zip.



--
View this message in context: 
http://lucene.472066.n3.nabble.com/Tika-DIH-zip-to-xml-tp4347122.html
Sent from the Solr - User mailing list archive at Nabble.com.

Reply via email to