Hi all, I am facing an issue with a custom EntityProcessor for the DataImportHandler.
*My Requirement:*

I need to process a main file along with its associated chunk files (child files) placed in a folder. The metadata for these files is kept in a JSON file in the same location; it holds both the main-file metadata and the child metadata. There may be multiple JSON files, each with a corresponding main file and its chunk files. All I need to do is parse each JSON file, extract the metadata, and index it in Solr, with the main file and each chunk file indexed as separate documents.

*The Approach I Have Followed:*

I have used a FileListEntityProcessor to list the JSON files, with a nested custom EntityProcessor that parses each JSON file and produces the rows to index. The configuration and code I have used are below.

*The Code and Configuration Details:*

solrconfig.xml:

  <requestHandler name="/dataimport"
                  class="org.apache.solr.handler.dataimport.DataImportHandler">
    <lst name="defaults">
      <str name="config">C:\SOLR\solr-6.3.0\server\solr\dataimporttest\conf\solr-data-config.xml</str>
    </lst>
  </requestHandler>

solr-data-config.xml:

  <dataConfig>
    <dataSource type="BinFileDataSource" />
    <document>
      <entity name="files"
              dataSource="null"
              rootEntity="false"
              processor="FileListEntityProcessor"
              fileName=".*\.(json)"
              baseDir="C:/CognitiveAssignment/ProjectSpecificMaterials/C3Portal/TestData/Telecom Italia Sparkle"
              onError="skip"
              recursive="true">
        <field column="fileAbsolutePath" name="fileAbsolutePath" />
        <field column="fileSize" name="size" />
        <field column="fileLastModified" name="lastModified" />
        <entity name="documentImportJSON"
                processor="com.ibm.lnk.processor.LNKEntityProcessor"
                url="${files.fileAbsolutePath}"
                format="text"
                extractEmbedded="true">
          <field column="title" name="title" meta="true"/>
          <field column="text" name="raw_text" meta="true"/>
        </entity>
      </entity>
    </document>
  </dataConfig>

LNKEntityProcessor.java:

  package com.ibm.lnk.processor;

  import java.io.IOException;
  import java.io.InputStream;
  import java.util.HashMap;
  import java.util.Iterator;
  import java.util.Map;

  import org.apache.solr.handler.dataimport.Context;
  import org.apache.solr.handler.dataimport.DataSource;
  import org.apache.solr.handler.dataimport.EntityProcessorBase;
  import org.apache.tika.exception.TikaException;
  import org.json.JSONArray;
  import org.json.JSONException;
  import org.json.JSONObject;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;
  import org.xml.sax.SAXException;

  import com.ibm.lnk.utility.LNKProcessorUtility;

  public class LNKEntityProcessor extends EntityProcessorBase {

      private final Logger slf4jLogger = LoggerFactory.getLogger(LNKEntityProcessor.class);

      private boolean isMainFileProcessingComplete = false;
      private static JSONObject parsedJSONMap = null;
      private static int childIndexValue = 0;
      private static int numberOfChildElements = 0;

      @Override
      protected void firstInit(Context context) {
          slf4jLogger.info("firstInit() called");
          super.firstInit(context);
      }

      @Override
      public void init(Context context) {
          slf4jLogger.info("init() called");
          isMainFileProcessingComplete = false;
          parsedJSONMap = null;
          childIndexValue = 0;
          numberOfChildElements = 0;
          super.init(context);
      }

      @Override
      public Map<String, Object> nextRow() {
          slf4jLogger.info("Entering nextRow() with isMainFileProcessingComplete = "
                  + isMainFileProcessingComplete);
          Map<String, Object> dataMap = null;
          try {
              if (isMainFileProcessingComplete) {
                  /*if (numberOfChildElements == childIndexValue) {
                      dataMap = null;
                  } else {
                      dataMap = parseRow(false, childIndexValue);
                      childIndexValue++;
                  }*/
                  dataMap = null;
              } else {
                  DataSource dataSource = this.context.getDataSource();
                  String fileAbsolutePath = this.context.getResolvedEntityAttribute("url");
                  InputStream inputStreamOfFile = (InputStream) dataSource.getData(fileAbsolutePath);
                  slf4jLogger.info("The URL for the file is: " + fileAbsolutePath);
                  String currentJsonString = LNKProcessorUtility.getTextContent(inputStreamOfFile);
                  slf4jLogger.info("The JSON string to be parsed is: " + currentJsonString);
                  parsedJSONMap = LNKProcessorUtility.parseJSONMap(currentJsonString);
                  slf4jLogger.info("Parsed map is: " + parsedJSONMap);
                  dataMap = parseRow(true, -1);
                  isMainFileProcessingComplete = true;
                  numberOfChildElements =
                          ((JSONArray) parsedJSONMap.get("listOfCheunkFile")).length();
              }
          } catch (Exception e) {
              slf4jLogger.info("An exception has occurred: " + e);
              return null;
          }
          return dataMap;
      }

      private Map<String, Object> parseRow(boolean isParent, int childIndex)
              throws JSONException, IOException, SAXException, TikaException {
          Map<String, Object> currentDataMap = new HashMap<String, Object>();
          DataSource dataSource = this.context.getDataSource();
          Iterator<Map<String, String>> iterator = this.context.getAllEntityFields().iterator();
          while (iterator.hasNext()) {
              Map<String, String> field = iterator.next();
              if ("true".equals(field.get("meta"))) {
                  String col = field.get("column");
                  String valueOfColumn = "";
                  // Code for testing Solr
                  if (isParent) {
                      if (col.equals("text")) {
                          InputStream inputStreamOfMainFile = (InputStream) dataSource.getData(
                                  parsedJSONMap.getString("source") + "\\"
                                          + parsedJSONMap.getString("fileName"));
                          valueOfColumn = LNKProcessorUtility.getTextContent(inputStreamOfMainFile);
                      } else if (col.equals("title")) {
                          valueOfColumn = parsedJSONMap.getString("fileName");
                      }
                  } else {
                      if (col.equals("text")) {
                          valueOfColumn = "This is Raw Text";
                      } else if (col.equals("title")) {
                          valueOfColumn = "This is Title";
                      }
                  }
                  slf4jLogger.info("valueOfColumn is: " + valueOfColumn);
                  // End of code for testing Solr
                  if (valueOfColumn != null) {
                      currentDataMap.put(col, valueOfColumn);
                  }
              }
              slf4jLogger.info("Data for the row is: " + currentDataMap);
          }
          return currentDataMap;
      }
  }

LNKProcessorUtility.java:
  package com.ibm.lnk.utility;

  import java.io.IOException;
  import java.io.InputStream;

  import org.apache.tika.exception.TikaException;
  import org.apache.tika.metadata.Metadata;
  import org.apache.tika.parser.AutoDetectParser;
  import org.apache.tika.sax.BodyContentHandler;
  import org.json.JSONException;
  import org.json.JSONObject;
  import org.xml.sax.SAXException;

  public class LNKProcessorUtility {

      public static String getTextContent(InputStream inputStreamOfFile)
              throws IOException, SAXException, TikaException {
          AutoDetectParser parser = new AutoDetectParser();
          BodyContentHandler handler = new BodyContentHandler();
          Metadata metadata = new Metadata();
          parser.parse(inputStreamOfFile, handler, metadata);
          return handler.toString();
      }

      public static JSONObject parseJSONMap(String jsonString) throws JSONException {
          return new JSONObject(jsonString);
      }
  }

*The Problem I Am Facing:*

Indexing only the parent file works fine. But when I try to index the child files as well, something strange happens: the admin UI from which I run the DIH reports that all the documents were indexed, yet in reality only one document ends up in the index, namely the last child document. It seems I need to change LNKEntityProcessor.java so that each map I return from nextRow() creates its own document in Solr. Can you please help me with this?

Thanks and Regards,
Anupam Sarkar

--
View this message in context: http://lucene.472066.n3.nabble.com/Custom-EntityProcessor-for-DataImportHandler-related-Issue-tp4307629.html
Sent from the Solr - User mailing list archive at Nabble.com.
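P.S. My understanding of the EntityProcessor contract is that DIH keeps calling nextRow() and turns every non-null map into one Solr document, stopping only when nextRow() returns null. A minimal, Solr-free sketch of that one-row-per-call pattern is below; the class name, field values, and the two-chunk example are made up for illustration and are not part of my actual code:

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.Map;

// Sketch of the one-row-per-call contract DIH expects from nextRow():
// queue up all rows (parent first, then one row per chunk file) and
// hand them out one at a time; null signals this entity is exhausted.
public class RowIteratorSketch {

    private final Deque<Map<String, Object>> pending = new ArrayDeque<>();
    private boolean initialized = false;

    // Stand-in for parsing the JSON metadata file: in the real processor
    // this is where parseRow(...) would build the parent and child maps.
    private void loadRows() {
        Map<String, Object> parent = new LinkedHashMap<>();
        parent.put("title", "main-file.pdf");
        parent.put("raw_text", "parent text");
        pending.add(parent);
        for (int i = 0; i < 2; i++) {
            Map<String, Object> child = new LinkedHashMap<>();
            child.put("title", "chunk-" + i);
            child.put("raw_text", "child text " + i);
            pending.add(child);
        }
    }

    // Each non-null return becomes one document; null means "done".
    public Map<String, Object> nextRow() {
        if (!initialized) {
            loadRows();
            initialized = true;
        }
        return pending.poll();
    }

    public static void main(String[] args) {
        RowIteratorSketch p = new RowIteratorSketch();
        int docs = 0;
        for (Map<String, Object> row = p.nextRow(); row != null; row = p.nextRow()) {
            docs++;
            System.out.println("doc " + docs + ": " + row.get("title"));
        }
        System.out.println("total=" + docs);
    }
}
```

With one main file and two chunk files, the loop in main() sees three rows before nextRow() returns null, which matches the three separate documents I am expecting in the index.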