Hello all,
I have the following DIH data-config.xml file. Adding
HTMLStripTransformer and the associated stripHTML attribute on the
para field seems to have broken things. I am using a nightly
build from 12-Jan-2009.
The /record/sect1/para contains HTML sub tags which need
to be discarded. Is my use of stripHTML correct?
<!--
  DataImportHandler configuration.
  The outer entity (jcurrent) lists files under baseDir whose names match
  fileName; the inner entity (x) reads each listed file through the
  "myfilereader" data source and extracts fields from every /record element.
-->
<dataConfig>
  <dataSource name="myfilereader" type="FileDataSource"/>
  <document>
    <!-- File lister: supplies ${jcurrent.fileAbsolutePath} to the nested entity. -->
    <entity name="jcurrent"
            processor="FileListEntityProcessor"
            fileName=".*xml"
            newerThan="'NOW-1000DAYS'"
            recursive="true"
            rootEntity="false"
            dataSource="null"
            baseDir="/Volumes/spare/ts/jxml/data/news/groups">
      <!-- Per-file parser: one pass over each file, iterating on /record. -->
      <entity name="x"
              dataSource="myfilereader"
              processor="XPathEntityProcessor"
              url="${jcurrent.fileAbsolutePath}"
              stream="false"
              forEach="/record"
              transformer="DateFormatTransformer,TemplateTransformer,RegexTransformer,HTMLStripTransformer">
        <!-- Absolute path of the file currently being read. -->
        <field column="fileAbsPath" template="${jcurrent.fileAbsolutePath}"/>
        <!--
          Web-relative path derived from fileAbsPath.
          sourceColName must name the column defined above; it previously read
          "fileAbsePath", which matches no column.
        -->
        <field column="fileWebPath"
               regex="/Volumes/spare/ts/(.*)"
               replaceWith="$1"
               sourceColName="fileAbsPath"/>
        <field column="title" xpath="/record/title"/>
        <!-- para holds embedded HTML sub-tags; stripHTML asks HTMLStripTransformer to remove them. -->
        <field column="para" xpath="/record/sect1/para" stripHTML="true"/>
        <!--
          NOTE(review): the three attribute predicates below were truncated in the
          posted copy ("subje...@qualifier", "da...@qualifier"). They are
          reconstructed here as subject[@qualifier=...] and date[@qualifier=...];
          confirm the element names against the original config.
        -->
        <field column="subject"
               xpath="/record/metadata/subject[@qualifier='fullTitle']"/>
        <field column="pubname"
               xpath="/record/metadata/subject[@qualifier='publication']"/>
        <field column="pubdate"
               xpath="/record/metadata/date[@qualifier='pubDate']"
               dateTimeFormat="yyyyMMdd"/>
      </entity>
    </entity>
  </document>
</dataConfig>
--
===============================================================
Fergus McMenemie Email:[email protected]
Techmore Ltd Phone:(UK) 07721 376021
Unix/Mac/Intranets Analyst Programmer
===============================================================