# I am trying to extract zip files (which contain XML files) using DIH. # I can read plain XML files and index their data like this:
<dataConfig> <dataSource encoding="UTF-8" type="FileDataSource"/> <document> <entity name="pickupdir" processor="FileListEntityProcessor" rootEntity="false" dataSource="null" fileName=".*xml" baseDir="${solr.install.dir}\example\exampledocs\myFiles" recursive="true" > <entity name="xml" processor="XPathEntityProcessor" datasource="pickupdir" stream="false" forEach="/accountingEntries/entryHeader" url="${pickupdir.fileAbsolutePath}" rootEntity='true' pk="lineNumber" onError="skip" > <field column="entryNumberCounter" xpath="/accountingEntries/entryHeader/entryNumberCounter"/> <field column="entryNumber" xpath="/accountingEntries/entryHeader/entryNumber"/> <field column="enteredDate" xpath="/accountingEntries/entryHeader/enteredDate"/> <field column="totalDebit" xpath="/accountingEntries/entryHeader/totalDebit"/> <field column="totalCredit" xpath="/accountingEntries/entryHeader/totalCredit"/> <field column="lineNumber" xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/> </entity> </entity> </document> </dataConfig> # How can I add TikaEntityProcessor so that the XML files inside the zip are indexed the same way?
I tried it this way: <dataConfig> <dataSource encoding="UTF-8" type="BinFileDataSource"/> <document> <entity name="pickupdir" processor="FileListEntityProcessor" rootEntity="false" dataSource="null" fileName=".*zip" baseDir="${solr.install.dir}\example\exampledocs\myFiles" recursive="false" > <entity name="ext" processor="TikaEntityProcessor" url="${pickupdir.fileAbsolutePath}" format="xml" rootEntity="false" > <entity name="xml" processor="XPathEntityProcessor" datasource="pickupdir" stream="false" forEach="/accountingEntries/entryHeader" url="${pickupdir.fileAbsolutePath}" rootEntity='true' pk="lineNumber" onError="skip" > <field column="entryNumberCounter" xpath="/accountingEntries/entryHeader/entryNumberCounter"/> <field column="entryNumber" xpath="/accountingEntries/entryHeader/entryNumber"/> <field column="enteredDate" xpath="/accountingEntries/entryHeader/enteredDate"/> <field column="totalDebit" xpath="/accountingEntries/entryHeader/totalDebit"/> <field column="totalCredit" xpath="/accountingEntries/entryHeader/totalCredit"/> <field column="lineNumber" xpath="/accountingEntries/entryHeader/entryDetail/lineNumber"/> </entity> </entity> </entity> </document> </dataConfig> # And this is the result I got: "----------- row #1-------------", "file", "YYY.zip", "fileSize", 1124851, "fileLastModified", "2017-07-21T08:18:23.085Z", "fileDir", "C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles", "fileAbsolutePath", "C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip", null, "---------------------------------------------", "entity:ext", [ "query", "C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip", "time-taken", "0:0:0.0", null, "----------- row #1-------------", "text", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n<meta name=\"X-Parsed-By\" content=\"org.apache.tika.parser.DefaultParser\"/>\r\n<meta name=\"X-Parsed-By\" 
content=\"org.apache.tika.parser.pkg.PackageParser\"/>\r\n<meta name=\"Content-Type\" content=\"application/zip\"/>\r\n<title/>\r\n</head>\r\n<body><div class=\"embedded\" id=\"9860029035-201601-Y-000000.xml\"/>\r\n<div class=\"package-entry\"> 9860029035-201601-Y-000000.xml \r\n</div>\r\n</body></html>", null, "---------------------------------------------", "entity:xml", [ "document#1", [ "query", "C:\\Users\\USER\\Desktop\\solr-6.6.0\\example\\exampledocs\\myFiles\\YYY.zip", "time-taken", "0:0:0.0" # Please explain how it works -- View this message in context: http://lucene.472066.n3.nabble.com/Tika-DIH-zip-to-xml-tp4347122.html Sent from the Solr - User mailing list archive at Nabble.com.