default-topic-trainingset.conf.solrconfig.xml Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.stanbol.enhancer.engine.topic Show documentation
Implementation of an annotation engine that links the content item to a set of possible categories from a dedicated Solr index using MoreLikeThis queries. The classification can be either applied to a complete document (text in a given language) which is the default behavior or to a specific portion of the text (using a TextAnnotation).
The newest version!
<?xml version="1.0" encoding="UTF-8" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!-- 
     For more details about configurations options that may appear in
     this file, see http://wiki.apache.org/solr/SolrConfigXml. 
-->
<config>
  <!-- In all configuration below, a prefix of "solr." for class names
       is an alias that causes solr to search appropriate packages,
       including org.apache.solr.(search|update|request|core|analysis)

       You may also specify a fully qualified Java classname if you
       have your own custom plugins.
    -->

  <!-- Set this to 'false' if you want solr to continue working after
       it has encountered an severe configuration error.  In a
       production environment, you may want solr to keep working even
       if one handler is mis-configured.

       You may also set this to false using by setting the system
       property:

         -Dsolr.abortOnConfigurationError=false
    -->
  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>
  
  <!-- Controls what version of Lucene various components of Solr
       adhere to.  Generally, you want to use the latest version to
       get all bug fixes and improvements. It is highly recommended
       that you fully re-index after changing this setting as it can
       affect both how text is indexed and queried.
    -->
  <luceneMatchVersion>LUCENE_42</luceneMatchVersion>

  <dataDir>${solr.data.dir:}</dataDir>

  <directoryFactory name="DirectoryFactory" 
                    class="${solr.directoryFactory:solr.NRTCachingDirectoryFactory}"/> 


  <indexConfig>
    <!-- maxFieldLength was removed in 4.0. To get similar behavior, include a 
         LimitTokenCountFilterFactory in your fieldType definition. E.g. 
     <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10000"/>
    -->
    <!-- Maximum time to wait for a write lock (ms) for an IndexWriter. Default: 1000 -->
    <writeLockTimeout>5000</writeLockTimeout>
 </indexConfig>
 
   <!-- The default high-performance update handler -->

  <updateHandler class="solr.DirectUpdateHandler2">
    <!-- Deactivate transaction log 
    <updateLog>
      <str name="dir">${solr.ulog.dir:}</str>
    </updateLog > -->
 
    <!-- no auto commit
     <autoCommit> 
       <maxTime>15000</maxTime> 
       <openSearcher>false</openSearcher> 
     </autoCommit>
     -->
     <!--
       <autoSoftCommit> 
         <maxTime>1000</maxTime> 
       </autoSoftCommit>
      -->
  </updateHandler>
 
  <query>
    <maxBooleanClauses>1024</maxBooleanClauses>

    <filterCache class="solr.FastLRUCache"
                 size="2048"
                 initialSize="1024"
                 autowarmCount="512"/>
    <queryResultCache class="solr.LRUCache"
                     size="2048"
                     initialSize="1024"
                     autowarmCount="512"/>
    <documentCache class="solr.LRUCache"
                   size="4096"
                   initialSize="1024"
                   autowarmCount="0"/>
    
    <!--
       <fieldValueCache class="solr.FastLRUCache"
                        size="512"
                        autowarmCount="128"
                        showItems="32" />
      -->
    <enableLazyFieldLoading>true</enableLazyFieldLoading>

   <!-- Result Window Size

        An optimization for use with the queryResultCache.  When a search
        is requested, a superset of the requested number of document ids
        are collected.  For example, if a search for a particular query
        requests matching documents 10 through 19, and queryWindowSize is 50,
        then documents 0 through 49 will be collected and cached.  Any further
        requests in that range can be satisfied via the cache.  
     -->
   <queryResultWindowSize>20</queryResultWindowSize>

   <!-- Maximum number of documents to cache for any entry in the
        queryResultCache. 
     -->
   <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

    <!-- QuerySenderListener takes an array of NamedList and executes a
         local query request for each NamedList in sequence. 
      -->
    <listener event="newSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <!--
           <lst><str name="q">solr</str><str name="sort">price asc</str></lst>
           <lst><str name="q">rocks</str><str name="sort">weight asc</str></lst>
          -->
      </arr>
    </listener>
    <listener event="firstSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst>
          <str name="q">static firstSearcher warming in solrconfig.xml</str>
        </lst>
      </arr>
    </listener>

    <!-- Use Cold Searcher

         If a search request comes in and there is no current
         registered searcher, then immediately register the still
         warming searcher and use it.  If "false" then all requests
         will block until the first searcher is done warming.
      -->
    <useColdSearcher>false</useColdSearcher>

    <!-- Max Warming Searchers
         
         Maximum number of searchers that may be warming in the
         background concurrently.  An error is returned if this limit
         is exceeded.

         Recommend values of 1-2 for read-only slaves, higher for
         masters w/o cache warming.
      -->
    <maxWarmingSearchers>2</maxWarmingSearchers>

  </query>

  <requestDispatcher handleSelect="false" >
    <requestParsers enableRemoteStreaming="true" 
                    multipartUploadLimitInKB="2048000"
                    formdataUploadLimitInKB="2048"/>
    <httpCaching never304="true" />
  </requestDispatcher>

  <!-- Request Handlers 

       http://wiki.apache.org/solr/SolrRequestHandler
    -->
  <!-- SearchHandler

       http://wiki.apache.org/solr/SearchHandler
    -->
  <requestHandler name="/select" class="solr.SearchHandler">
    <!-- default values for query parameters can be specified, these
         will be overridden by parameters in the request
      -->
     <lst name="defaults">
       <str name="echoParams">explicit</str>
       <int name="rows">10</int>
    </lst>
    </requestHandler>

  <!-- Request Handler for similarity queries and topic classification -->
  <requestHandler name="/mlt" class="solr.MoreLikeThisHandler" startup="lazy" />

  <!-- A request handler that returns indented JSON by default -->
  <requestHandler name="/query" class="solr.SearchHandler">
     <lst name="defaults">
       <str name="echoParams">explicit</str>
       <str name="wt">json</str>
       <str name="indent">true</str>
       <str name="df">text</str>
     </lst>
  </requestHandler>


  <!-- realtime get handler, guaranteed to return the latest stored fields of
       any document, without the need to commit or open a new searcher.  The
       current implementation relies on the updateLog feature being enabled. -->
  <requestHandler name="/get" class="solr.RealTimeGetHandler">
     <lst name="defaults">
       <str name="omitHeader">true</str>
       <str name="wt">json</str>
       <str name="indent">true</str>
     </lst>
  </requestHandler>

 
  <!-- Update Request Handler.  
       
       http://wiki.apache.org/solr/UpdateXmlMessages

    -->
  <requestHandler name="/update" class="solr.UpdateRequestHandler" />

  <!-- for back compat with clients using /update/json and /update/csv -->  
  <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler">
        <lst name="defaults">
         <str name="stream.contentType">application/json</str>
       </lst>
  </requestHandler>
  <requestHandler name="/update/csv" class="solr.CSVRequestHandler">
        <lst name="defaults">
         <str name="stream.contentType">application/csv</str>
       </lst>
  </requestHandler>

  <!-- Solr Cell Update Request Handler

       http://wiki.apache.org/solr/ExtractingRequestHandler 

    -->
  <requestHandler name="/update/extract" 
                  startup="lazy"
                  class="solr.extraction.ExtractingRequestHandler" >
    <lst name="defaults">
      <str name="lowernames">true</str>
      <str name="uprefix">ignored_</str>

      <!-- capture link hrefs but ignore div attributes -->
      <str name="captureAttr">true</str>
      <str name="fmap.a">links</str>
      <str name="fmap.div">ignored_</str>
    </lst>
  </requestHandler>


  <!-- Field Analysis Request Handler

       RequestHandler that provides much the same functionality as
       analysis.jsp. Provides the ability to specify multiple field
       types and field names in the same request and outputs
       index-time and query-time analysis for each of them.

       Request parameters are:
       analysis.fieldname - field name whose analyzers are to be used

       analysis.fieldtype - field type whose analyzers are to be used
       analysis.fieldvalue - text for index-time analysis
       q (or analysis.q) - text for query time analysis
       analysis.showmatch (true|false) - When set to true and when
           query analysis is performed, the produced tokens of the
           field value analysis will be marked as "matched" for every
           token that is produces by the query analysis
   -->
  <requestHandler name="/analysis/field" 
                  startup="lazy"
                  class="solr.FieldAnalysisRequestHandler" />


  <!-- Document Analysis Handler

       http://wiki.apache.org/solr/AnalysisRequestHandler

       An analysis handler that provides a breakdown of the analysis
       process of provided documents. This handler expects a (single)
       content stream with the following format:

       <docs>
         <doc>
           <field name="id">1</field>
           <field name="name">The Name</field>
           <field name="text">The Text Value</field>
         </doc>
         <doc>...</doc>
         <doc>...</doc>
         ...
       </docs>

    Note: Each document must contain a field which serves as the
    unique key. This key is used in the returned response to associate
    an analysis breakdown to the analyzed document.

    Like the FieldAnalysisRequestHandler, this handler also supports
    query analysis by sending either an "analysis.query" or "q"
    request parameter that holds the query text to be analyzed. It
    also supports the "analysis.showmatch" parameter which when set to
    true, all field tokens that match the query tokens will be marked
    as a "match". 
  -->
  <requestHandler name="/analysis/document" 
                  class="solr.DocumentAnalysisRequestHandler" 
                  startup="lazy" />

  <!-- Admin Handlers

       Admin Handlers - This will register all the standard admin
       RequestHandlers.  
    -->
  <requestHandler name="/admin/" 
                  class="solr.admin.AdminHandlers" />

  <!-- ping/healthcheck -->
  <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
    <lst name="invariants">
      <str name="q">solrpingquery</str>
    </lst>
    <lst name="defaults">
      <str name="echoParams">all</str>
    </lst>
    <!-- An optional feature of the PingRequestHandler is to configure the 
         handler with a "healthcheckFile" which can be used to enable/disable 
         the PingRequestHandler.
         relative paths are resolved against the data dir 
      -->
    <!-- <str name="healthcheckFile">server-enabled.txt</str> -->
  </requestHandler>

  <!-- Echo the request contents back to the client -->
  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
    <lst name="defaults">
     <str name="echoParams">explicit</str> 
     <str name="echoHandler">true</str>
    </lst>
  </requestHandler>
  
  <!-- Solr Replication

       The SolrReplicationHandler supports replicating indexes from a
       "master" used for indexing and "slaves" used for queries.

       http://wiki.apache.org/solr/SolrReplication 

       It is also neccessary for SolrCloud to function (in Cloud mode, the 
       replication handler is used to bulk transfer segments when nodes 
       are added or need to recover).

       https://wiki.apache.org/solr/SolrCloud/
    -->
  <requestHandler name="/replication" class="solr.ReplicationHandler" > 
    <!--
       To enable simple master/slave replication, uncomment one of the 
       sections below, depending on wether this solr instance should be 
       the "master" or a "slave".  If this instance is a "slave" you will 
       also need to fill in the masterUrl to point to a real machine.
    -->
    <!--
       <lst name="master">
         <str name="replicateAfter">commit</str>
         <str name="replicateAfter">startup</str>
         <str name="confFiles">schema.xml,stopwords.txt</str>
       </lst>
    -->
    <!--
       <lst name="slave">
         <str name="masterUrl">http://your-master-hostname:8983/solr</str>
         <str name="pollInterval">00:00:60</str>
       </lst>
    -->
  </requestHandler>
   <!-- Spell Check

        The spell check component can return a list of alternative spelling
        suggestions.  

        http://wiki.apache.org/solr/SpellCheckComponent
     -->
  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">

    <str name="queryAnalyzerFieldType">textSpell</str>

    <!-- Multiple "Spell Checkers" can be declared and used by this
         component
      -->

    <!-- a spellchecker built from a field of the main index -->
    <lst name="spellchecker">
      <str name="name">default</str>
      <str name="field">name</str>
      <str name="classname">solr.DirectSolrSpellChecker</str>
      <!-- the spellcheck distance measure used, the default is the internal levenshtein -->
      <str name="distanceMeasure">internal</str>
      <!-- minimum accuracy needed to be considered a valid spellcheck suggestion -->
      <float name="accuracy">0.5</float>
      <!-- the maximum #edits we consider when enumerating terms: can be 1 or 2 -->
      <int name="maxEdits">2</int>
      <!-- the minimum shared prefix when enumerating terms -->
      <int name="minPrefix">1</int>
      <!-- maximum number of inspections per result. -->
      <int name="maxInspections">5</int>
      <!-- minimum length of a query term to be considered for correction -->
      <int name="minQueryLength">4</int>
      <!-- maximum threshold of documents a query term can appear to be considered for correction -->
      <float name="maxQueryFrequency">0.01</float>
      <!-- uncomment this to require suggestions to occur in 1% of the documents
        <float name="thresholdTokenFrequency">.01</float>
      -->
    </lst>
    
    <!-- a spellchecker that can break or combine words.  See "/spell" handler below for usage -->
    <lst name="spellchecker">
      <str name="name">wordbreak</str>
      <str name="classname">solr.WordBreakSolrSpellChecker</str>      
      <str name="field">name</str>
      <str name="combineWords">true</str>
      <str name="breakWords">true</str>
      <int name="maxChanges">10</int>
    </lst>

    <!-- a spellchecker that uses a different distance measure -->
    <!--
       <lst name="spellchecker">
         <str name="name">jarowinkler</str>
         <str name="field">spell</str>
         <str name="classname">solr.DirectSolrSpellChecker</str>
         <str name="distanceMeasure">
           org.apache.lucene.search.spell.JaroWinklerDistance
         </str>
       </lst>
     -->

    <!-- a spellchecker that use an alternate comparator 

         comparatorClass be one of:
          1. score (default)
          2. freq (Frequency first, then score)
          3. A fully qualified class name
      -->
    <!--
       <lst name="spellchecker">
         <str name="name">freq</str>
         <str name="field">lowerfilt</str>
         <str name="classname">solr.DirectSolrSpellChecker</str>
         <str name="comparatorClass">freq</str>
      -->

    <!-- A spellchecker that reads the list of words from a file -->
    <!--
       <lst name="spellchecker">
         <str name="classname">solr.FileBasedSpellChecker</str>
         <str name="name">file</str>
         <str name="sourceLocation">spellings.txt</str>
         <str name="characterEncoding">UTF-8</str>
         <str name="spellcheckIndexDir">spellcheckerFile</str>
       </lst>
      -->
  </searchComponent>

  <!-- A request handler for demonstrating the spellcheck component.  

       NOTE: This is purely as an example.  The whole purpose of the
       SpellCheckComponent is to hook it into the request handler that
       handles your normal user queries so that a separate request is
       not needed to get suggestions.

       IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS
       NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
       
       See http://wiki.apache.org/solr/SpellCheckComponent for details
       on the request parameters.
    -->
  <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">
      <str name="df">text</str>
      <!-- Solr will use suggestions from both the 'default' spellchecker
           and from the 'wordbreak' spellchecker and combine them.
           collations (re-written queries) can include a combination of
           corrections from both spellcheckers -->
      <str name="spellcheck.dictionary">default</str>
      <str name="spellcheck.dictionary">wordbreak</str>
      <str name="spellcheck">on</str>
      <str name="spellcheck.extendedResults">true</str>       
      <str name="spellcheck.count">10</str>
      <str name="spellcheck.alternativeTermCount">5</str>
      <str name="spellcheck.maxResultsForSuggest">5</str>       
      <str name="spellcheck.collate">true</str>
      <str name="spellcheck.collateExtendedResults">true</str>  
      <str name="spellcheck.maxCollationTries">10</str>
      <str name="spellcheck.maxCollations">5</str>         
    </lst>
    <arr name="last-components">
      <str>spellcheck</str>
    </arr>
  </requestHandler>

  <!-- Term Vector Component

       http://wiki.apache.org/solr/TermVectorComponent
    -->
  <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>


  <!-- Clustering Component

       http://wiki.apache.org/solr/ClusteringComponent

       You'll need to set the solr.clustering.enabled system property
       when running solr to run with clustering enabled:

            java -Dsolr.clustering.enabled=true -jar start.jar

    -->
  <searchComponent name="clustering"
                   enable="${solr.clustering.enabled:false}"
                   class="solr.clustering.ClusteringComponent" >
    <!-- Declare an engine -->
    <lst name="engine">
      <!-- The name, only one can be named "default" -->
      <str name="name">default</str>

      <!-- Class name of Carrot2 clustering algorithm.

           Currently available algorithms are:
           
           * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
           * org.carrot2.clustering.stc.STCClusteringAlgorithm
           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
           
           See http://project.carrot2.org/algorithms.html for the
           algorithm's characteristics.
        -->
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>

      <!-- Overriding values for Carrot2 default algorithm attributes.

           For a description of all available attributes, see:
           http://download.carrot2.org/stable/manual/#chapter.components.
           Use attribute key as name attribute of str elements
           below. These can be further overridden for individual
           requests by specifying attribute key as request parameter
           name and attribute value as parameter value.
        -->
      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>

      <!-- Location of Carrot2 lexical resources.

           A directory from which to load Carrot2-specific stop words
           and stop labels. Absolute or relative to Solr config directory.
           If a specific resource (e.g. stopwords.en) is present in the
           specified dir, it will completely override the corresponding
           default one that ships with Carrot2.

           For an overview of Carrot2 lexical resources, see:
           http://download.carrot2.org/head/manual/#chapter.lexical-resources
        -->
      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>

      <!-- The language to assume for the documents.

           For a list of allowed values, see:
           http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
       -->
      <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
    </lst>
    <lst name="engine">
      <str name="name">stc</str>
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
    </lst>
  </searchComponent>

  <!-- A request handler for demonstrating the clustering component

       This is purely as an example.

       In reality you will likely want to add the component to your 
       already specified request handlers. 
    -->
  <requestHandler name="/clustering"
                  startup="lazy"
                  enable="${solr.clustering.enabled:false}"
                  class="solr.SearchHandler">
    <lst name="defaults">
      <bool name="clustering">true</bool>
      <str name="clustering.engine">default</str>
      <bool name="clustering.results">true</bool>
      <!-- The title field -->
      <str name="carrot.title">name</str>
      <str name="carrot.url">id</str>
      <!-- The field to cluster on -->
       <str name="carrot.snippet">features</str>
       <!-- produce summaries -->
       <bool name="carrot.produceSummary">true</bool>
       <!-- the maximum number of labels per cluster -->
       <!--<int name="carrot.numDescriptions">5</int>-->
       <!-- produce sub clusters -->
       <bool name="carrot.outputSubClusters">false</bool>
       
       <str name="defType">edismax</str>
       <str name="qf">
         text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
       </str>
       <str name="q.alt">*:*</str>
       <str name="rows">10</str>
       <str name="fl">*,score</str>
    </lst>     
    <arr name="last-components">
      <str>clustering</str>
    </arr>
  </requestHandler>
  
  <!-- Terms Component

       http://wiki.apache.org/solr/TermsComponent

       A component to return terms and document frequency of those
       terms
    -->
  <searchComponent name="terms" class="solr.TermsComponent"/>

  <!-- A request handler for demonstrating the terms component -->
  <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
     <lst name="defaults">
      <bool name="terms">true</bool>
      <bool name="distrib">false</bool>
    </lst>     
    <arr name="components">
      <str>terms</str>
    </arr>
  </requestHandler>


  <!-- Update Processors

       Chains of Update Processor Factories for dealing with Update
       Requests can be declared, and then used by name in Update
       Request Processors

       http://wiki.apache.org/solr/UpdateRequestProcessor

    --> 

  <queryResponseWriter name="json" class="solr.JSONResponseWriter">
     <!-- For the purposes of the tutorial, JSON responses are written as
      plain text so that they are easy to read in *any* browser.
      If you expect a MIME type of "application/json" just remove this override.
     -->
    <str name="content-type">text/plain; charset=UTF-8</str>
  </queryResponseWriter>
  
  <!--
     Custom response writers can be declared as needed...
    -->
    <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/>
  

  <!-- XSLT response writer transforms the XML output by any xslt file found
       in Solr's conf/xslt directory.  Changes to xslt files are checked for
       every xsltCacheLifetimeSeconds.  
    -->
  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
    <int name="xsltCacheLifetimeSeconds">5</int>
  </queryResponseWriter>

  <!-- Query Parsers

       http://wiki.apache.org/solr/SolrQuerySyntax

       Multiple QParserPlugins can be registered by name, and then
       used in either the "defType" param for the QueryComponent (used
       by SearchHandler) or in LocalParams
    -->
 
  <!-- Legacy config for the admin interface -->
  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>


</config>