I would recommend creating a simple data import handler configuration to
test Tika parsing for large BLOBs, i.e. remove the unrelated entities,
remove all the configuration for delta imports, and keep just the entity
that retrieves the BLOBs and the entity that parses the binary content
(fieldReader/TikaEntityProcessor).

Some comments:
1. Maybe you are running a delta import and there are no new records in the database?
2. deltaQuery should return only the ids and no other columns/data,
because you don't use them in deltaImportQuery (it only references
dataimporter.delta.id).
3. Not all entities list HTMLStripTransformer in their transformer attribute,
yet their fields use stripHTML="true". TemplateTransformer is declared but not used at all.

>   <entity name="aitiologikes_ektheseis"
>         dataSource="db"
>         transformer="HTMLStripTransformer"
>         query="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT(
> body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'"
>         deltaImportQuery="select id, title, title AS grid_title, model, type, 
> url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT(
> body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'
> and id='${dataimporter.delta.id}'"
>         deltaQuery="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT(
> body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'
> and last_modified > '${dataimporter.last_index_time}'">
>                 <field column="id" name="ida" />
>                 <field column="solr_id" name="solr_id" />
>                 <field column="title" name="title" stripHTML="true" />
>                 <field column="grid_title" name="grid_title" stripHTML="true" 
> />
>                 <field column="model" name="model" stripHTML="true" />
>                 <field column="type" name="type" stripHTML="true" />
>                 <field column="url" name="url" stripHTML="true" />
>                 <field column="last_modified" name="last_modified" 
> stripHTML="true"  />
>                 <field column="search_tag" name="search_tag" stripHTML="true" 
> />
>                 <field column="content" name="content" stripHTML="true" />
>     </entity>
>
>     <entity name="aitiologikes_ektheseis_bin"
>           query="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS
> text from aitiologikes_ektheseis where type = 'bin'"
>           deltaImportQuery="select id, title, title AS grid_title, model, 
> type,
> url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con
> AS text from aitiologikes_ektheseis where type = 'bin' and
> id='${dataimporter.delta.id}'"
>           deltaQuery="select id, title, title AS grid_title, model, type, url,
> last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS
> text from aitiologikes_ektheseis where type = 'bin' and last_modified >
> '${dataimporter.last_index_time}'"
>           transformer="TemplateTransformer"
>           dataSource="db">
>
>                   <field column="id" name="ida" />
>                 <field column="solr_id" name="solr_id" />
>                   <field column="title" name="title" stripHTML="true" />
>                   <field column="grid_title" name="grid_title" 
> stripHTML="true" />
>                   <field column="model" name="model" stripHTML="true" />
>                   <field column="type" name="type" stripHTML="true" />
>                   <field column="url" name="url" stripHTML="true" />
>                   <field column="last_modified" name="last_modified" 
> stripHTML="true"  />
>                   <field column="search_tag" name="search_tag" 
> stripHTML="true" />
>
>                 <entity dataSource="fieldReader" 
> processor="TikaEntityProcessor"
> dataField="aitiologikes_ektheseis_bin.text" format="text">
>                   <field column="text" name="contentbin" stripHTML="true" />
>                 </entity>
>
>         </entity>
>
> ...
> ...
>     </document>
>
> </dataConfig>
>
> *A portion from schema.xml (the fieldTypes and field definitions):*
>
> <fieldType name="text_ktimatologio" class="solr.TextField"
> positionIncrementGap="100">
>
>       <analyzer type="index">
>         <tokenizer class="solr.StandardTokenizerFactory"/>
>         <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
>         <filter class="solr.LowerCaseFilterFactory"/>
>             <filter class="solr.EnglishPossessiveFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>             <filter class="solr.GreekLowerCaseFilterFactory"/>
>             <filter class="solr.GreekStemFilterFactory"/>
>         <filter class="solr.KeywordMarkerFilterFactory"
> protected="protwords.txt"/>
>         <filter class="solr.PorterStemFilterFactory"/>
>       </analyzer>
>
>       <analyzer type="query">
>         <tokenizer class="solr.StandardTokenizerFactory"/>
>         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>         <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.GreekLowerCaseFilterFactory"/>
>         <filter class="solr.GreekStemFilterFactory"/>
>         <filter class="solr.LowerCaseFilterFactory"/>
>             <filter class="solr.EnglishPossessiveFilterFactory"/>
>         <filter class="solr.KeywordMarkerFilterFactory"
> protected="protwords.txt"/>
>         <filter class="solr.PorterStemFilterFactory"/>
>       </analyzer>
> </fieldType>
>
>
>
>
> <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
>         <analyzer type="index">
>                 <charFilter class="solr.HTMLStripCharFilterFactory"/>
>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>                 <filter class="solr.StandardFilterFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.GreekLowerCaseFilterFactory"/>
>                 <filter class="solr.GreekStemFilterFactory"/>
>                 <filter class="solr.HunspellStemFilterFactory"
> dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff"
> ignoreCase="true" />
>         </analyzer>
>
>         <analyzer type="query">
>                 <charFilter class="solr.HTMLStripCharFilterFactory"/>
>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>                 <filter class="solr.StandardFilterFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
>                 <filter class="solr.GreekLowerCaseFilterFactory"/>
>                 <filter class="solr.GreekStemFilterFactory"/>
>                 <filter class="solr.HunspellStemFilterFactory"
> dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff"
> ignoreCase="true" />
>         </analyzer>
> </fieldType>
>
>
> <fields>
>   <field  name="ida" type="string" indexed="true" stored="true"
> multiValued="false"/>
>   <field  name="solr_id" type="string" indexed="true" stored="true"
> multiValued="false"/>
>   <field  name="title" type="text_ktimatologio" indexed="true"
> stored="true"/>
>   <field  name="grid_title" type="text_ktimatologio" indexed="true"
> stored="true"/>
>   <field  name="model" type="string" indexed="true" stored="true"
> multiValued="false"/>
>   <field  name="type" type="string" indexed="true" stored="true"/>
>   <field  name="url" type="string" indexed="true" stored="true"/>
>   <field  name="last_modified" type="string" indexed="true" stored="true"/>
>   <field  name="search_tag" type="string" indexed="true" stored="true"/>
>   <field  name="contentbin" type="text" indexed="true" stored="true"
> multiValued="true"/>
>   <field  name="content" type="text_ktimatologio" indexed="true"
> stored="true" multiValued="true"/>
> </fields>
>
> I really need help on this!
>
> With respect,
>
> Tom
>
> Greece
>
>
>
> --
> View this message in context: 
> http://lucene.472066.n3.nabble.com/Indexing-and-querying-BLOBS-stored-in-Mysql-tp4002940.html
> Sent from the Solr - User mailing list archive at Nabble.com.

Reply via email to