Why does stripHTML="false" have no effect in DIH? The HTML is stripped in both text 
and text_nohtml when I display the index with select?q=*

I'm trying to get one field without HTML and one with it, so I can also index the 
links on the page.

data-config.xml
<!-- Outer entity "rec": XPathEntityProcessor reads docImportUrl.xml through the
     "main" data source and iterates over every /docs/doc element. -->
<entity name="rec" processor="XPathEntityProcessor" 
url="file:///C:\ColdFusion10\cfusion\solr\solr\tkbintranet\docImportUrl.xml" 
forEach="/docs/doc" dataSource="main"> <!-- transformer="script:GenerateId"-->
                <!-- Metadata columns, each populated from the XPath given on the field. -->
                <field column="title" xpath="//title" />
                <field column="id" xpath="//id" />
                <field column="file" xpath="//file" />
                <field column="url" xpath="//url" />
                <field column="urlParse" xpath="//urlParse" />
                <field column="last_modified" xpath="//last_modified" />
                <field column="Author" xpath="//author" />
                
                <!-- Inner entity "tika": TikaEntityProcessor is pointed at the parent
                     row's urlParse value via the "dataUrl" data source, with
                     format="html" and htmlMapper="identity"; onError="skip" is set.
                     HTMLStripTransformer is attached to this entity. -->
                <entity name="tika" processor="TikaEntityProcessor" 
url="${rec.urlParse}" dataSource="dataUrl" onError="skip" htmlMapper="identity" 
format="html" transformer="HTMLStripTransformer">
                        <!-- Both declarations below use the same column "text"; the first
                             maps it to "text" with stripHTML="false", the second maps it
                             to "text_nohtml" with stripHTML="true".
                             NOTE(review): presumably HTMLStripTransformer rewrites the
                             value of the source column in the row, so two fields sharing
                             column="text" would both receive the stripped value. Confirm
                             against the transformer documentation; a separate column for
                             the stripped copy may be needed. -->
                        <field column="text" name="text" stripHTML="false" />
                        <field column="text" name="text_nohtml" 
stripHTML="true" />
                        <!--  transformer="RegexTransformer"
                        <field column="text_html_b" 
regex="(?s)^.*&lt;div.*id=.*&gt;(.*)&lt;/div&gt;.*$" replaceWith="$1" 
sourceColName="text"  />
                        <field column="text_html_b" 
regex="(?s)^.*&lt;!-body-&gt;(.*)&lt;!-/body-&gt;.*$" replaceWith="$1" 
sourceColName="text"  /> -->
                </entity>
        </entity>

Reply via email to