Hi All,

I followed Michael's advice and the timings have dropped to a couple of hours
now, from 6-8 hours :-)
I have attached the solrconfig.xml we're using; can you let me know if I'm
missing something?

Thanks,
Sandeep
<?xml version="1.0" encoding="UTF-8" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<!-- 
     For more details about configurations options that may appear in this 
     file, see http://wiki.apache.org/solr/SolrConfigXml.

     Specifically, the Solr Config can support XInclude, which may make it easier to manage
     the configuration.  See https://issues.apache.org/jira/browse/SOLR-1167
-->
<config>
	<luceneMatchVersion>LUCENE_40</luceneMatchVersion>
  <!-- Controls whether Solr stops after a severe configuration error.
       Set this to 'false' if you want Solr to continue working after such an
       error; in a production environment you may want Solr to keep working
       even if one handler is mis-configured.

       The value can also be set to false through the system property:
         -Dsolr.abortOnConfigurationError=false
     -->
  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>

  <!-- lib directives instruct Solr to load the identified jars and use them
       to resolve any "plugins" named in solrconfig.xml or schema.xml
       (i.e. Analyzers, Request Handlers, etc.).

       All directories and paths are resolved relative to the instanceDir.

       If a "./lib" directory exists in the instanceDir, every file in it is
       included exactly as if the following had been written:

              <lib dir="./lib" />
    -->
  <!-- dir alone: every file found in the directory is added to the
       classpath, which is handy for including all jars in a directory.
    -->
  <lib dir="../../contrib/extraction/lib" />
  <!-- dir plus regex: only files in that directory whose names completely
       match the regex (anchored on both ends) are included.
    -->
  <lib dir="../../dist/" regex="apache-solr-cell-\d.*\.jar" />
  <lib dir="../../dist/" regex="apache-solr-clustering-\d.*\.jar" />
  <!-- A dir (with or without a regex) for which nothing matches is ignored.
    -->
  <lib dir="../../contrib/clustering/lib/downloads/" />
  <lib dir="../../contrib/clustering/lib/" />
  <lib dir="/total/crap/dir/ignored" />
  <!-- path: names one specific file.  A serious error is logged if that
       file cannot be loaded.
  <lib path="../a-jar-that-does-not-exist.jar" />
  -->

  
  <!-- Used to specify an alternate directory to hold all index data
       other than the default ./data under the Solr home.
       If replication is in use, this should match the replication configuration. -->
  <dataDir>${solr.data.dir:./solr/data}</dataDir>

  <!-- Factory that supplies the Lucene Directory implementation for the index.
       The class named here must be a DirectoryFactory, so the default is the
       factory (solr.NIOFSDirectoryFactory), not the Directory class it creates.
       It can be overridden with the solr.directoryFactory system property. -->
  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.NIOFSDirectoryFactory}"/>


  <!-- Settings for the index writer of Solr's main on-disk Lucene index. -->
  <indexConfig>
    <!-- NOTE(review): mergeFactor (30) and the explicit TieredMergePolicy
         arguments below (15) both describe how many segments are merged at
         once / allowed per tier.  Presumably the explicit policy arguments
         take precedence; confirm which values are actually in effect and
         remove the other setting. -->
    <mergeFactor>30</mergeFactor>
    <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/>
    <mergePolicy class="org.apache.lucene.index.TieredMergePolicy">
      <int name="maxMergeAtOnce">15</int>
      <int name="segmentsPerTier">15</int>
    </mergePolicy>

    <!-- RAM buffer size, in MB, for the main on-disk Lucene index. -->
    <ramBufferSizeMB>32</ramBufferSizeMB>

    <!--
        A custom deletion policy can be specified here.  The class must
        implement org.apache.lucene.index.IndexDeletionPolicy.

        http://lucene.apache.org/java/2_3_2/api/org/apache/lucene/index/IndexDeletionPolicy.html

        The standard Solr IndexDeletionPolicy implementation can delete index
        commit points based on the number of commits, the age of the commit
        point, and optimized status.

        The latest commit point should always be preserved regardless of the
        criteria.
    -->
    <deletionPolicy class="solr.SolrDeletionPolicy">
      <!-- Number of commit points to keep -->
      <str name="maxCommitsToKeep">1</str>
      <!-- Number of optimized commit points to keep -->
      <str name="maxOptimizedCommitsToKeep">0</str>
      <!--
          Delete all commit points once they have reached the given age.
          Supports DateMathParser syntax, e.g.

          <str name="maxCommitAge">30MINUTES</str>
          <str name="maxCommitAge">1DAY</str>
      -->
    </deletionPolicy>

    <!-- To aid in advanced debugging, IndexWriter debug logging can be turned
         on.  Setting the value to true makes the underlying Lucene IndexWriter
         write its debug infostream to the file named in the attribute. -->
    <infoStream file="INFOSTREAM.txt">false</infoStream>

  </indexConfig>

  <!-- The default high-performance update handler.
       A "solr." prefix on a class name is an alias that causes Solr to
       search the appropriate packages, including
       org.apache.solr.(search|update|request|core|analysis)
    -->
  <updateHandler class="solr.DirectUpdateHandler2">

    <!-- autoCommit performs a <commit/> automatically under certain conditions:
           maxDocs - the number of updates since the last commit exceeds this
           maxTime - the oldest uncommitted update (in ms) is this long ago
         Only maxDocs is configured below.
         Instead of enabling autoCommit, consider using "commitWithin"
         when adding documents. http://wiki.apache.org/solr/UpdateXmlMessages
      -->
    <autoCommit>
      <maxDocs>50000</maxDocs>
      <openSearcher>false</openSearcher>
    </autoCommit>

  </updateHandler>

  <query>
    <!-- Maximum number of clauses in a boolean query; an exception is thrown
         if it is exceeded.  In the past this affected range or prefix queries
         that expanded to big boolean queries, but the built-in Solr query
         parsers no longer create queries with this limitation. -->
    <maxBooleanClauses>1024</maxBooleanClauses>


    <!-- Two cache implementations are available for Solr:
           LRUCache     - based on a synchronized LinkedHashMap
           FastLRUCache - based on a ConcurrentHashMap
         FastLRUCache has faster gets and slower puts in single threaded
         operation and thus is generally faster than LRUCache when the hit
         ratio of the cache is high (> 75%), and may be faster under other
         scenarios on multi-cpu systems. -->
    <!-- filterCache: used by SolrIndexSearcher for filters (DocSets),
         unordered sets of *all* documents that match a query.
         When a new searcher is opened, its caches may be prepopulated
         or "autowarmed" using data from caches in the old searcher.
         autowarmCount is the number of items to prepopulate.  For LRUCache,
         the autowarmed items will be the most recently accessed items.
         Parameters:
           class         - the SolrCache implementation, LRUCache or FastLRUCache
           size          - the maximum number of entries in the cache
           initialSize   - the initial capacity (number of entries) of
                           the cache (see java.util.HashMap)
           autowarmCount - the number of entries to prepopulate from
                           an old cache
      -->
    <filterCache class="solr.FastLRUCache"
                 size="256"
                 initialSize="256"
                 autowarmCount="0"/>

    <!-- fieldValueCache: holds field values that are quickly accessible
         by document id.  It is created by default even if not configured
         here.
      <fieldValueCache
        class="solr.FastLRUCache"
        size="512"
        autowarmCount="128"
        showItems="32"
      />
    -->

    <!-- queryResultCache: caches results of searches, i.e. ordered lists of
         document ids (DocList) based on a query, a sort, and the range
         of documents requested. -->
    <queryResultCache class="solr.LRUCache"
                      size="256"
                      initialSize="256"
                      autowarmCount="0"/>

    <!-- documentCache: caches Lucene Document objects (the stored fields for
         each document).  Since Lucene internal document ids are transient,
         this cache will not be autowarmed. -->
    <documentCache class="solr.LRUCache"
                   size="256"
                   initialSize="256"
                   autowarmCount="0"/>

    <!-- If true, stored fields that are not requested are loaded lazily.
         This can give a significant speed improvement when the usual case is
         to not load all stored fields, especially if the skipped fields are
         large compressed text fields.
      -->
    <enableLazyFieldLoading>true</enableLazyFieldLoading>

    <!-- An optimization for use with the queryResultCache.  When a search
         is requested, a superset of the requested number of document ids
         is collected.  For example, if a search for a particular query
         requests matching documents 10 through 19, and queryWindowSize is 50,
         then documents 0 through 49 will be collected and cached.  Any further
         requests in that range can be satisfied via the cache. -->
    <queryResultWindowSize>20</queryResultWindowSize>

    <!-- Maximum number of documents to cache for any entry in the
         queryResultCache. -->
    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

    <!-- A newSearcher event is fired whenever a new searcher is being prepared
         and there is a current searcher handling requests (aka registered).
         It can be used to prime certain caches to prevent long request times
         for certain requests.

         QuerySenderListener takes an array of NamedList and executes a
         local query request for each NamedList in sequence.  The array is
         empty here.
      -->
    <listener event="newSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
      </arr>
    </listener>

    <!-- A firstSearcher event is fired whenever a new searcher is being
         prepared but there is no current registered searcher to handle
         requests or to gain autowarming data from.  The array is empty here. -->
    <listener event="firstSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
      </arr>
    </listener>

    <!-- When "true": if a search request comes in and there is no current
         registered searcher, the still warming searcher is registered
         immediately and used.  When "false", all requests block until the
         first searcher is done warming. -->
    <useColdSearcher>false</useColdSearcher>

    <!-- Maximum number of searchers that may be warming in the background
         concurrently.  An error is returned if this limit is exceeded.
         Recommend 1-2 for read-only slaves, higher for masters w/o cache
         warming. -->
    <maxWarmingSearchers>2</maxWarmingSearchers>

  </query>

  <!--
    Let the dispatch filter handle /select?qt=XXX
    handleSelect=true will use consistent error handling for /select and /update
    handleSelect=false will use solr1.1 style error formatting
    -->
  <requestDispatcher handleSelect="true" >
    <!-- Make sure your system has some authentication before enabling remote streaming!
         NOTE(review): remote streaming is enabled here and the multipart
         upload limit is about 2 GB; confirm this instance cannot be reached
         by untrusted clients. -->
    <requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048000" />

    <!-- Set HTTP caching related parameters (for proxy caches and clients).

         To get the behaviour of Solr 1.2 (i.e. no caching related headers)
         use the never304="true" option and do not specify a value for
         <cacheControl>
    -->
    <!-- <httpCaching never304="true"> -->
    <httpCaching lastModFrom="openTime"
                 etagSeed="Solr">
       <!-- lastModFrom="openTime" is the default: the Last-Modified value
            (and validation against If-Modified-Since requests) will all be
            relative to when the current Searcher was opened.
            You can change it to lastModFrom="dirLastMod" if you want the
            value to exactly correspond to when the physical index was last
            modified.

            etagSeed="..." is an option you can change to force the ETag
            header (and validation against If-None-Match requests) to be
            different even if the index has not changed (i.e. when making
            significant changes to your config file).

            lastModFrom and etagSeed are both ignored if you use the
            never304="true" option.
       -->
    </httpCaching>
  </requestDispatcher>


  <!-- requestHandler plugins: incoming queries are dispatched to the correct
       handler based on the path or the qt (query type) param.
       A name starting with '/' is accessed with a path equal to the
       registered name.  A name without a leading '/' is accessed with:
         http://host/app/select?qt=name
       If no qt is defined, the requestHandler that declares default="true"
       is used.
  -->
  <requestHandler name="standard"
                  class="solr.SearchHandler"
                  default="true">
    <!-- default values for query parameters -->
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <!--
      <int name="rows">10</int>
      <str name="fl">*</str>
      <str name="version">2.1</str>
      -->
    </lst>
  </requestHandler>

  <!-- DisMaxRequestHandler allows easy searching across multiple fields
       for simple user-entered phrases.  Its implementation is now
       just the standard SearchHandler with a default query type
       of "dismax".
       See http://wiki.apache.org/solr/DisMaxRequestHandler
   -->

   <!-- The spell check component can return a list of alternative spelling suggestions.  -->
  <searchComponent name="spellcheck"
                   class="solr.SpellCheckComponent">
    <!-- field type named for query analysis -->
    <str name="queryAnalyzerFieldType">textSpell</str>

    <!-- the single spellchecker, named "default", built on the "name" field -->
    <lst name="spellchecker">
      <str name="name">default</str>
      <str name="field">name</str>
      <str name="spellcheckIndexDir">./spellchecker</str>
    </lst>
  </searchComponent>

  <!-- Clustering Component
       http://wiki.apache.org/solr/ClusteringComponent
       This relies on third party jars which are not included in the release.
       To use this component (and the "/clustering" handler) those jars need
       to be downloaded, and the solr.clustering.enabled system property must
       be set when running Solr:
          java -Dsolr.clustering.enabled=true -jar start.jar
    -->
  <searchComponent name="clusteringComponent"
                   enable="${solr.clustering.enabled:false}"
                   class="org.apache.solr.handler.clustering.ClusteringComponent">
    <!-- First engine declaration -->
    <lst name="engine">
      <!-- The engine name; only one engine can be named "default" -->
      <str name="name">default</str>
      <!--
           Class name of the Carrot2 clustering algorithm.  Currently
           available algorithms are:

           * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
           * org.carrot2.clustering.stc.STCClusteringAlgorithm

           See http://project.carrot2.org/algorithms.html for the
           characteristics of each algorithm.
        -->
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
      <!--
           Overrides for the default attribute values of the Carrot2
           algorithm.  For a description of all available attributes, see:
           http://download.carrot2.org/stable/manual/#chapter.components.
           Use the attribute key as the name attribute of the str elements
           below.  These can be overridden again for individual requests by
           passing the attribute key as the request parameter name and the
           attribute value as the parameter value.
        -->
      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
    </lst>
    <!-- Second engine, named "stc" -->
    <lst name="engine">
      <str name="name">stc</str>
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
    </lst>
  </searchComponent>
  <!-- Search handler that appends the clusteringComponent; enabled by the
       same solr.clustering.enabled property as the component itself. -->
  <requestHandler name="/clustering" enable="${solr.clustering.enabled:false}" class="solr.SearchHandler">
    <lst name="defaults">
      <bool name="clustering">true</bool>
      <str name="clustering.engine">default</str>
      <bool name="clustering.results">true</bool>
      <!-- Title field -->
      <str name="carrot.title">name</str>
      <str name="carrot.url">id</str>
      <!-- Field to cluster on -->
      <str name="carrot.snippet">features</str>
      <!-- Produce summaries -->
      <bool name="carrot.produceSummary">true</bool>
      <!-- Maximum number of labels per cluster -->
      <!--<int name="carrot.numDescriptions">5</int>-->
      <!-- Produce sub clusters -->
      <bool name="carrot.outputSubClusters">false</bool>
    </lst>
    <arr name="last-components">
      <str>clusteringComponent</str>
    </arr>
  </requestHandler>
  
  <!-- Solr Cell: http://wiki.apache.org/solr/ExtractingRequestHandler -->
  <requestHandler name="/update/extract"
                  class="org.apache.solr.handler.extraction.ExtractingRequestHandler"
                  startup="lazy">
    <lst name="defaults">
      <!-- All of the main content is mapped to "text".  If the extracted
           text must be returned or highlighted, use a stored field. -->
      <str name="fmap.content">text</str>
      <str name="lowernames">true</str>
      <str name="uprefix">ignored_</str>

      <!-- Capture link hrefs but ignore div attributes -->
      <str name="captureAttr">true</str>
      <str name="fmap.a">links</str>
      <str name="fmap.div">ignored_</str>
    </lst>
  </requestHandler>


  <!-- termsComponent returns terms and the document frequency of those terms.
       This component does not yet support distributed search. -->
  <searchComponent name="termsComponent"
                   class="org.apache.solr.handler.component.TermsComponent"/>

  <!-- /terms runs only the termsComponent, with terms=true by default -->
  <requestHandler name="/terms"
                  class="org.apache.solr.handler.component.SearchHandler">
    <lst name="defaults">
      <bool name="terms">true</bool>
    </lst>
    <arr name="components">
      <str>termsComponent</str>
    </arr>
  </requestHandler>


  <!-- The elevator search component lets you configure the top results for
       a given query regardless of the normal Lucene scoring. -->
  <searchComponent name="elevator" class="solr.QueryElevationComponent">
    <!-- fieldType used to analyze queries -->
    <str name="queryFieldType">string</str>
    <str name="config-file">elevate.xml</str>
  </searchComponent>

  <!-- Request handler that appends the elevator component -->
  <requestHandler name="/elevate"
                  class="solr.SearchHandler"
                  startup="lazy">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
    </lst>
    <arr name="last-components">
      <str>elevator</str>
    </arr>
  </requestHandler>


  <!-- Update request handlers.

       Note: since solr1.1, requestHandlers require a valid content type
       header if the data is posted in the body.  For example, curl now
       requires: -H 'Content-type:text/xml; charset=utf-8'
       The response format differs from solr1.1 formatting and returns a
       standard error code.
       To enable solr1.1 behavior, remove the /update handler or change its
       path.
    -->
  <requestHandler name="/update"
                  class="solr.UpdateRequestHandler" />

  <requestHandler name="/update/javabin"
                  class="solr.UpdateRequestHandler" />

  <!--
   Analysis request handler.  Since Solr 1.3.  Use to return how a document is analyzed.  Useful
   for debugging and as a token server for other types of applications.

   This is deprecated in favor of the improved DocumentAnalysisRequestHandler and FieldAnalysisRequestHandler

   <requestHandler name="/analysis" class="solr.AnalysisRequestHandler" />
   -->

  <!--
    An analysis handler that provides a breakdown of the analysis process of
    the provided documents.  This handler expects a (single) content stream
    with the following format:

    <docs>
      <doc>
        <field name="id">1</field>
        <field name="name">The Name</field>
        <field name="text">The Text Value</field>
      </doc>
      <doc>...</doc>
      <doc>...</doc>
      ...
    </docs>

    Note: each document must contain a field which serves as the unique key.
    This key is used in the returned response to associate an analysis
    breakdown with the analyzed document.

    Like the FieldAnalysisRequestHandler, this handler also supports query
    analysis by sending either an "analysis.query" or "q" request parameter
    that holds the query text to be analyzed.  It also supports the
    "analysis.showmatch" parameter; when it is set to true, all field tokens
    that match the query tokens are marked as a "match".
  -->
  <requestHandler name="/analysis/document"
                  class="solr.DocumentAnalysisRequestHandler" />

  <!--
    RequestHandler that provides much the same functionality as analysis.jsp.
    It can take multiple field types and field names in the same request and
    outputs index-time and query-time analysis for each of them.

    Request parameters are:
    analysis.fieldname  - The field name whose analyzers are to be used
    analysis.fieldtype  - The field type whose analyzers are to be used
    analysis.fieldvalue - The text for index-time analysis
    q (or analysis.q)   - The text for query time analysis
    analysis.showmatch (true|false) - When set to true and when query analysis
                          is performed, the tokens produced by the field value
                          analysis are marked as "matched" for every token
                          that is produced by the query analysis
   -->
  <requestHandler name="/analysis/field"
                  class="solr.FieldAnalysisRequestHandler" />


  <!-- CSV update handler; startup="lazy" means it is loaded on demand -->
  <requestHandler name="/update/csv"
                  class="solr.CSVRequestHandler"
                  startup="lazy" />
  <!-- Admin handlers registered under the /admin/ path -->
  <requestHandler name="/admin/"
                  class="solr.admin.AdminHandlers" />

  <!-- ping/healthcheck.  The defaults select the "standard" handler (qt),
       supply the ping query (q) and echo all params.  The class is written
       with the "solr." alias, as for the other handlers in this file. -->
  <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
    <lst name="defaults">
      <str name="qt">standard</str>
      <str name="q">solrpingquery</str>
      <str name="echoParams">all</str>
    </lst>
  </requestHandler>

  <!-- Echoes the request contents back to the client -->
  <requestHandler name="/debug/dump"
                  class="solr.DumpRequestHandler">
    <lst name="defaults">
      <!-- use 'all' to echo every param, including the defaults -->
      <str name="echoParams">explicit</str>
      <str name="echoHandler">true</str>
    </lst>
  </requestHandler>

  <highlighting>
    <!-- The standard fragmenter.  This could most likely be commented out
         in the "default" case. -->
    <fragmenter name="gap"
                class="org.apache.solr.highlight.GapFragmenter"
                default="true">
      <lst name="defaults">
        <int name="hl.fragsize">100</int>
      </lst>
    </fragmenter>

    <!-- A regular-expression-based fragmenter (e.g. for sentence extraction) -->
    <fragmenter name="regex"
                class="org.apache.solr.highlight.RegexFragmenter">
      <lst name="defaults">
        <!-- slightly smaller fragsizes work better because of slop -->
        <int name="hl.fragsize">70</int>
        <!-- allow 50% slop on fragment sizes -->
        <float name="hl.regex.slop">0.5</float>
        <!-- a basic sentence pattern -->
        <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
      </lst>
    </fragmenter>

    <!-- The standard formatter: wraps highlighted terms in <em> tags -->
    <formatter name="html"
               class="org.apache.solr.highlight.HtmlFormatter"
               default="true">
      <lst name="defaults">
        <str name="hl.simple.pre"><![CDATA[<em>]]></str>
        <str name="hl.simple.post"><![CDATA[</em>]]></str>
      </lst>
    </formatter>
  </highlighting>

  <!-- The XSLT response writer transforms the XML output with any xslt file
       found in Solr's conf/xslt directory.  Changes to xslt files are checked
       for every xsltCacheLifetimeSeconds.
   -->
  <queryResponseWriter name="xslt"
                       class="solr.XSLTResponseWriter">
    <int name="xsltCacheLifetimeSeconds">5</int>
  </queryResponseWriter>

  <!-- Admin interface configuration; the default query is left empty -->
  <admin>
    <defaultQuery></defaultQuery>
  </admin>

</config>

Reply via email to