OK, I see what you're getting at, but why doesn't the following work: <field xpath="//h:h1" column="h_1" /> <field column="text" xpath="/xhtml:html/xhtml:body" />
I removed the tika-processor. What am I missing? I haven't found anything in the wiki. On 28. Sep 2013, at 12:28 AM, P Williams wrote: > I spent some more time thinking about this. Do you really need to use the > TikaEntityProcessor? It doesn't offer anything new to the document you are > building that couldn't be accomplished by the XPathEntityProcessor alone > from what I can tell. > > I also tried to get the Advanced > Parsing <http://wiki.apache.org/solr/TikaEntityProcessor> example to > work without success. There are some obvious typos (<document> > instead of </document>) and an odd order to the pieces (<dataSources> is > enclosed by <document>). It also looks like > FieldStreamDataSource <http://lucene.apache.org/solr/4_3_1/solr-dataimporthandler/org/apache/solr/handler/dataimport/FieldStreamDataSource.html> is > the one that is meant to work in this context. If Koji is still around > maybe he could offer some help? Otherwise this bit of erroneous > instruction should probably be removed from the wiki. 
> > Cheers, > Tricia > > $ svn diff > Index: > solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java > =================================================================== > --- > solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java > (revision 1526990) > +++ > solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java > (working copy) > @@ -99,13 +99,13 @@ > runFullImport(getConfigHTML("identity")); > assertQ(req("*:*"), testsHTMLIdentity); > } > - > + > private String getConfigHTML(String htmlMapper) { > return > "<dataConfig>" + > " <dataSource type='BinFileDataSource'/>" + > " <document>" + > - " <entity name='Tika' format='xml' > processor='TikaEntityProcessor' " + > + " <entity name='Tika' format='html' > processor='TikaEntityProcessor' " + > " url='" + > getFile("dihextras/structured.html").getAbsolutePath() + "' " + > ((htmlMapper == null) ? 
"" : (" htmlMapper='" + htmlMapper + > "'")) + ">" + > " <field column='text'/>" + > @@ -114,4 +114,36 @@ > "</dataConfig>"; > > } > + private String[] testsHTMLH1 = { > + "//*[@numFound='1']" > + , "//str[@name='h1'][contains(.,'H1 Header')]" > + }; > + > + @Test > + public void testTikaHTMLMapperSubEntity() throws Exception { > + runFullImport(getConfigSubEntity("identity")); > + assertQ(req("*:*"), testsHTMLH1); > + } > + > + private String getConfigSubEntity(String htmlMapper) { > + return > + "<dataConfig>" + > + "<dataSource type='BinFileDataSource' name='bin'/>" + > + "<dataSource type='FieldStreamDataSource' name='fld'/>" + > + "<document>" + > + "<entity name='tika' processor='TikaEntityProcessor' url='" + > getFile("dihextras/structured.html").getAbsolutePath() + "' > dataSource='bin' format='html' rootEntity='false'>" + > + "<!--Do appropriate mapping here meta=\"true\" means it is a > metadata field -->" + > + "<field column='Author' meta='true' name='author'/>" + > + "<field column='title' meta='true' name='title'/>" + > + "<!--'text' is an implicit field emited by TikaEntityProcessor . 
> Map it appropriately-->" + > + "<field name='text' column='text'/>" + > + "<entity name='detail' type='XPathEntityProcessor' forEach='/html' > dataSource='fld' dataField='tika.text' rootEntity='true' >" + > + "<field xpath='//div' column='foo'/>" + > + "<field xpath='//h1' column='h1' />" + > + "</entity>" + > + "</entity>" + > + "</document>" + > + "</dataConfig>"; > + } > + > } > Index: > solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml > =================================================================== > --- > solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml > (revision 1526990) > +++ > solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml > (working copy) > @@ -194,6 +194,8 @@ > <field name="title" type="string" indexed="true" stored="true"/> > <field name="author" type="string" indexed="true" stored="true" /> > <field name="text" type="text" indexed="true" stored="true" /> > + <field name="h1" type="text" indexed="true" stored="true" /> > + <field name="foo" type="text" indexed="true" stored="true" /> > > </fields> > <!-- field for the QueryParser to use when an explicit fieldname is > absent --> > > > I find the SqlEntityProcessor part particularly odd. 
That's the default > right?: > 2405 T12 C1 oashd.SqlEntityProcessor.initQuery ERROR The query failed > 'null' java.lang.RuntimeException: unsupported type : class java.lang.String > at > org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:89) > at > org.apache.solr.handler.dataimport.FieldStreamDataSource.getData(FieldStreamDataSource.java:1) > at > org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59) > at > org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73) > at > org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243) > at > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:469) > at > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:495) > at > org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:408) > at > org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:323) > at > org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:231) > at > org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:411) > at > org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:476) > at > org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179) > at > org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) > at org.apache.solr.core.SolrCore.execute(SolrCore.java:1859) > at org.apache.solr.util.TestHarness.query(TestHarness.java:291) > at > org.apache.solr.handler.dataimport.AbstractDataImportHandlerTestCase.runFullImport(AbstractDataImportHandlerTestCase.java:96) > at > org.apache.solr.handler.dataimport.TestTikaEntityProcessor.testTikaHTMLMapperSubEntity(TestTikaEntityProcessor.java:124) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:601) > at > com.carrotsearch.randomizedtesting.RandomizedRunner.invoke(RandomizedRunner.java:1559) > at > com.carrotsearch.randomizedtesting.RandomizedRunner.access$600(RandomizedRunner.java:79) > at > com.carrotsearch.randomizedtesting.RandomizedRunner$6.evaluate(RandomizedRunner.java:737) > at > com.carrotsearch.randomizedtesting.RandomizedRunner$7.evaluate(RandomizedRunner.java:773) > at > com.carrotsearch.randomizedtesting.RandomizedRunner$8.evaluate(RandomizedRunner.java:787) > at > com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53) > at > org.apache.lucene.util.TestRuleSetupTeardownChained$1.evaluate(TestRuleSetupTeardownChained.java:50) > at > org.apache.lucene.util.TestRuleFieldCacheSanity$1.evaluate(TestRuleFieldCacheSanity.java:51) > at > org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46) > at > com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55) > at > org.apache.lucene.util.TestRuleThreadAndTestName$1.evaluate(TestRuleThreadAndTestName.java:49) > at > org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70) > at > org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48) > at > com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) > at > com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358) > at > com.carrotsearch.randomizedtesting.ThreadLeakControl.forkTimeoutingTask(ThreadLeakControl.java:782) > at > com.carrotsearch.randomizedtesting.ThreadLeakControl$3.evaluate(ThreadLeakControl.java:442) > at > 
com.carrotsearch.randomizedtesting.RandomizedRunner.runSingleTest(RandomizedRunner.java:746) > at > com.carrotsearch.randomizedtesting.RandomizedRunner$3.evaluate(RandomizedRunner.java:648) > at > com.carrotsearch.randomizedtesting.RandomizedRunner$4.evaluate(RandomizedRunner.java:682) > at > com.carrotsearch.randomizedtesting.RandomizedRunner$5.evaluate(RandomizedRunner.java:693) > at > com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) > at > com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule$1.evaluate(SystemPropertiesRestoreRule.java:53) > at > org.apache.lucene.util.AbstractBeforeAfterRule$1.evaluate(AbstractBeforeAfterRule.java:46) > at > org.apache.lucene.util.TestRuleStoreClassName$1.evaluate(TestRuleStoreClassName.java:42) > at > com.carrotsearch.randomizedtesting.rules.SystemPropertiesInvariantRule$1.evaluate(SystemPropertiesInvariantRule.java:55) > at > com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39) > at > com.carrotsearch.randomizedtesting.rules.NoShadowingOrOverridesOnMethodsRule$1.evaluate(NoShadowingOrOverridesOnMethodsRule.java:39) > at > com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) > at > org.apache.lucene.util.TestRuleAssertionsRequired$1.evaluate(TestRuleAssertionsRequired.java:43) > at > org.apache.lucene.util.TestRuleMarkFailure$1.evaluate(TestRuleMarkFailure.java:48) > at > org.apache.lucene.util.TestRuleIgnoreAfterMaxFailures$1.evaluate(TestRuleIgnoreAfterMaxFailures.java:70) > at > org.apache.lucene.util.TestRuleIgnoreTestSuites$1.evaluate(TestRuleIgnoreTestSuites.java:55) > at > com.carrotsearch.randomizedtesting.rules.StatementAdapter.evaluate(StatementAdapter.java:36) > at > com.carrotsearch.randomizedtesting.ThreadLeakControl$StatementRunner.run(ThreadLeakControl.java:358) > at java.lang.Thread.run(Thread.java:722) > > > > On Fri, Sep 27, 2013 at 
3:55 AM, Andreas Owen <a...@conx.ch> wrote: > >> i removed the FieldReaderDataSource and dataSource="fld" but it didn't >> help. i get the following for each document: >> DataImportHandlerException: Exception in invoking url null >> Processing Document # 9 >> nullpointerexception >> >> >> On 26. Sep 2013, at 8:39 PM, P Williams wrote: >> >>> Hi, >>> >>> Haven't tried this myself but maybe try leaving out the >>> FieldReaderDataSource entirely. From my quick searching looks like it's >>> tied to SQL. Did you try copying the >>> http://wiki.apache.org/solr/TikaEntityProcessor Advanced Parsing example >>> exactly? What happens when you leave out FieldReaderDataSource? >>> >>> Cheers, >>> Tricia >>> >>> >>> On Thu, Sep 26, 2013 at 4:17 AM, Andreas Owen <a...@conx.ch> wrote: >>> >>>> i'm using solr 4.3.1 and the dataimporter. i am trying to use >>>> XPathEntityProcessor within the TikaEntityProcessor for indexing >> html-pages >>>> but i'm getting this error for each document. i have also tried >>>> dataField="tika.text" and dataField="text" to no avail. the nested >>>> XPathEntityProcessor "detail" creates the error, the rest works fine. >> what >>>> am i doing wrong? 
>>>> >>>> error: >>>> >>>> ERROR - 2013-09-26 12:08:49.006; >>>> org.apache.solr.handler.dataimport.SqlEntityProcessor; The query failed >>>> 'null' >>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to >>>> java.util.Iterator >>>> at >>>> >> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59) >>>> at >>>> >> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73) >>>> at >>>> >> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227) >>>> at >>>> >> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422) >>>> at >>>> >> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487) >>>> at >>>> >> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179) >>>> at >>>> >> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) >>>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820) >>>> at >>>> >> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656) >>>> at >>>> >> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359) >>>> at >>>> >> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155) >>>> at >>>> >> 
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307) >>>> at >>>> >> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453) >>>> at >>>> >> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) >>>> at >>>> >> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560) >>>> at >>>> >> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) >>>> at >>>> >> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072) >>>> at >>>> >> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382) >>>> at >>>> >> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) >>>> at >>>> >> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006) >>>> at >>>> >> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) >>>> at >>>> >> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) >>>> at >>>> >> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) >>>> at >>>> >> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) >>>> at org.eclipse.jetty.server.Server.handle(Server.java:365) >>>> at >>>> >> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485) >>>> at >>>> >> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) >>>> at >>>> >> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937) >>>> at >>>> >> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998) >>>> at >> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856) >>>> at >>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240) >>>> at >>>> >> 
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) >>>> at >>>> >> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) >>>> at >>>> >> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) >>>> at >>>> >> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) >>>> at java.lang.Thread.run(Unknown Source) >>>> ERROR - 2013-09-26 12:08:49.022; org.apache.solr.common.SolrException; >>>> Exception in entity : >>>> detail:org.apache.solr.handler.dataimport.DataImportHandlerException: >>>> java.lang.ClassCastException: java.io.StringReader cannot be cast to >>>> java.util.Iterator >>>> at >>>> >> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:65) >>>> at >>>> >> org.apache.solr.handler.dataimport.SqlEntityProcessor.nextRow(SqlEntityProcessor.java:73) >>>> at >>>> >> org.apache.solr.handler.dataimport.EntityProcessorWrapper.nextRow(EntityProcessorWrapper.java:243) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:465) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:491) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.buildDocument(DocBuilder.java:404) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.doFullDump(DocBuilder.java:319) >>>> at >>>> >> org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:227) >>>> at >>>> >> org.apache.solr.handler.dataimport.DataImporter.doFullImport(DataImporter.java:422) >>>> at >>>> >> org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:487) >>>> at >>>> >> org.apache.solr.handler.dataimport.DataImportHandler.handleRequestBody(DataImportHandler.java:179) >>>> at >>>> >> 
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) >>>> at org.apache.solr.core.SolrCore.execute(SolrCore.java:1820) >>>> at >>>> >> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:656) >>>> at >>>> >> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:359) >>>> at >>>> >> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:155) >>>> at >>>> >> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307) >>>> at >>>> >> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453) >>>> at >>>> >> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) >>>> at >>>> >> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560) >>>> at >>>> >> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) >>>> at >>>> >> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072) >>>> at >>>> >> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382) >>>> at >>>> >> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) >>>> at >>>> >> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006) >>>> at >>>> >> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) >>>> at >>>> >> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) >>>> at >>>> >> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) >>>> at >>>> >> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) >>>> at org.eclipse.jetty.server.Server.handle(Server.java:365) >>>> at >>>> >> org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485) >>>> at >>>> >> org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) 
>>>> at >>>> >> org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:937) >>>> at >>>> >> org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:998) >>>> at >> org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:856) >>>> at >>>> org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240) >>>> at >>>> >> org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) >>>> at >>>> >> org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) >>>> at >>>> >> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) >>>> at >>>> >> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) >>>> at java.lang.Thread.run(Unknown Source) >>>> Caused by: java.lang.ClassCastException: java.io.StringReader cannot be >>>> cast to java.util.Iterator >>>> at >>>> >> org.apache.solr.handler.dataimport.SqlEntityProcessor.initQuery(SqlEntityProcessor.java:59) >>>> ... 
41 more >>>> >>>> >>>> >>>> data-config.xml >>>> >>>> <dataConfig> >>>> <dataSource type="BinURLDataSource" name="dataFile"/> >>>> <dataSource type="BinURLDataSource" name="dataUrl"/> >>>> <dataSource type="URLDataSource" name="main"/> >>>> <dataSource type="FieldReaderDataSource" name="fld"/> >>>> <document> >>>> <entity name="rec" processor="XPathEntityProcessor" >>>> >> url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml" >>>> forEach="/docs/doc" dataSource="main"> >>>> <field column="title" xpath="//title" /> >>>> <field column="id" xpath="//id" /> >>>> <field column="file" xpath="//file" /> >>>> <field column="url" xpath="//url" /> >>>> <field column="urlParse" xpath="//urlParse" /> >>>> <field column="last_modified" xpath="//last_modified" /> >>>> <field column="Author" xpath="//author" /> >>>> >>>> <entity name="tika" processor="TikaEntityProcessor" >>>> url="${rec.urlParse}" dataSource="dataUrl" onError="skip" format="html"> >>>> <field column="text"/> >>>> >>>> <entity name="detail" type="XPathEntityProcessor" >>>> forEach="/html" dataSource="fld" dataField="${tika.text}" >> rootEntity="true" >>>> onError="skip"> >>>> <field xpath="//h1" column="h_1" /> >>>> </entity> >>>> </entity> >>>> </entity> >>>> </document> >>>> </dataConfig> >> >>