solr.WebPages.conf.schema.xml Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of socialsensor-framework-client Show documentation
The project contains a set of convenience methods on top of common data repositories.
The newest version!
<?xml version="1.0" encoding="utf-8"?>
<schema name="webpages" version="1.5">


    <fields>
        <!-- Document key: "url" is the field named by <uniqueKey> and the only one marked required. -->
        <field name="url"            type="string"  indexed="true" stored="true" multiValued="false" required="true"/>

        <!-- SocialSensor web-page fields. Every field is both indexed and stored;
             "mediaIds" is the only one declared multi-valued. "title" and "text" use
             the analyzed text_en type, the others use string, date or int. -->
        <field name="expandedUrl"    type="string"  indexed="true" stored="true" multiValued="false"/>
        <field name="streamId"       type="string"  indexed="true" stored="true" multiValued="false"/>
        <field name="title"          type="text_en" indexed="true" stored="true" multiValued="false"/>
        <field name="text"           type="text_en" indexed="true" stored="true" multiValued="false"/>
        <field name="domain"         type="string"  indexed="true" stored="true" multiValued="false"/>
        <field name="mediaIds"       type="string"  indexed="true" stored="true" multiValued="true"/>
        <field name="mediaThumbnail" type="string"  indexed="true" stored="true" multiValued="false"/>
        <field name="date"           type="date"    indexed="true" stored="true" multiValued="false"/>
        <field name="reference"      type="string"  indexed="true" stored="true" multiValued="false"/>
        <field name="shares"         type="int"     indexed="true" stored="true" multiValued="false"/>
        <field name="media"          type="int"     indexed="true" stored="true" multiValued="false"/>

        <!-- No multiValued attribute here, so the default for this schema version applies. -->
        <field name="_version_"      type="long"    indexed="true" stored="true"/>
    </fields>


   <!-- Field used to determine and enforce document uniqueness.
        Unless this field is marked with required="false", it will be a required field;
        the "url" field definition additionally sets required="true" explicitly. -->
   <uniqueKey>url</uniqueKey>

   <types>
      
        <!-- "string": solr.StrField values are not analyzed; they are indexed/stored verbatim. -->
        <fieldType name="string"  class="solr.StrField"  sortMissingLast="true"/>

        <!-- "boolean": holds "true" or "false". -->
        <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
      
        <!-- Default numeric and date field types, all declared with precisionStep="0".
             When faster range queries are needed, consider the
             tint/tfloat/tlong/tdouble (and tdate) types instead. -->
        <fieldType name="int"    class="solr.TrieIntField"    precisionStep="0" positionIncrementGap="0"/>
        <fieldType name="float"  class="solr.TrieFloatField"  precisionStep="0" positionIncrementGap="0"/>
        <fieldType name="long"   class="solr.TrieLongField"   precisionStep="0" positionIncrementGap="0"/>
        <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
        <fieldType name="date"   class="solr.TrieDateField"   precisionStep="0" positionIncrementGap="0"/>

        <!--
        Trie types with a non-zero precisionStep. Each value is indexed at several
        levels of precision, which accelerates range queries when the number of
        values between the range endpoints is large (see the NumericRangeQuery
        javadoc for the implementation details).

        precisionStep is specified in bits: smaller values lead to more tokens
        indexed per value, a slightly larger index, and faster range queries.
        A precisionStep of 0 disables indexing at different precision levels.
        -->
        <fieldType name="tint"    class="solr.TrieIntField"    precisionStep="8" positionIncrementGap="0"/>
        <fieldType name="tfloat"  class="solr.TrieFloatField"  precisionStep="8" positionIncrementGap="0"/>
        <fieldType name="tlong"   class="solr.TrieLongField"   precisionStep="8" positionIncrementGap="0"/>
        <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>

        <!-- Trie-based date type (precisionStep="6") for faster date range queries and date faceting. -->
        <fieldType name="tdate"   class="solr.TrieDateField"   precisionStep="6" positionIncrementGap="0"/>

        <!-- Binary data type. The data should be sent and retrieved as Base64-encoded strings. -->
        <fieldType name="binary" class="solr.BinaryField" />

        <!-- "random": backed by solr.RandomSortField (presumably intended for
             pseudo-random sort orders; see the Solr documentation for that class). -->
        <fieldType name="random" class="solr.RandomSortField" indexed="true" />

        <!-- "text_ws": tokens come from whitespace splitting alone (no filters are
             configured), for exact matching of words. -->
        <fieldType name="text_ws"
                   class="solr.TextField"
                   positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
            </analyzer>
        </fieldType>
      
        <!-- "text_general": a general text type with generic, cross-language defaults.
             Both analyzers tokenize with StandardTokenizer, remove the stop words listed
             in "stopwords.txt" (matched case-insensitively via ignoreCase="true") and then
             lower-case the tokens. Only the query analyzer additionally applies synonyms
             from "synonyms.txt"; it does so after stop-word removal and before lower-casing. -->
        <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
            <analyzer type="index">
                <tokenizer class="solr.StandardTokenizerFactory" />
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
                <!-- Index-time synonyms are disabled; in this schema synonyms are only used at query time:
                <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
                -->
                <filter class="solr.LowerCaseFilterFactory" />
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.StandardTokenizerFactory" />
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
                <!-- Query-time only: synonym expansion (expand="true"). -->
                <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
                <filter class="solr.LowerCaseFilterFactory" />
            </analyzer>
        </fieldType>
      
        <!-- "text_en": text type with defaults appropriate for English; used by the
             "title" and "text" fields.
             Both analyzers tokenize with StandardTokenizer, remove English stop words
             (lang/stopwords_en.txt, case-insensitive), lower-case, apply
             EnglishPossessiveFilter and finally Porter stemming. The query analyzer
             additionally applies synonyms from synonyms.txt directly after tokenizing.
             NOTE: protection of words via protwords.txt is NOT active for this type,
             because KeywordMarkerFilterFactory is commented out in both analyzers. -->
        <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
            <analyzer type="index">
                <tokenizer class="solr.StandardTokenizerFactory" />
                <!-- Index-time synonyms are disabled; in this schema synonyms are only used at query time:
                <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
                -->
                <!-- Case-insensitive stop word removal. enablePositionIncrements="true" is set in both
                     the index and query analyzers to leave a 'gap' for more accurate phrase queries.
                -->
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true" />
                <filter class="solr.LowerCaseFilterFactory" />
                <filter class="solr.EnglishPossessiveFilterFactory" />
                <!-- Disabled: while this stays commented out, protwords.txt is not consulted for this type.
                <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
                -->
                <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
                <filter class="solr.EnglishMinimalStemFilterFactory"/>
                -->
                <filter class="solr.PorterStemFilterFactory" />
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.StandardTokenizerFactory" />
                <!-- Query-time only: synonym expansion (expand="true"), applied before stop-word removal. -->
                <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true" />
                <filter class="solr.LowerCaseFilterFactory" />
                <filter class="solr.EnglishPossessiveFilterFactory" />
                <!-- Disabled: while this stays commented out, protwords.txt is not consulted for this type.
                <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
                -->
                <!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
                <filter class="solr.EnglishMinimalStemFilterFactory"/>
                -->
                <filter class="solr.PorterStemFilterFactory" />
            </analyzer>
        </fieldType>

        <!-- "text_en_splitting": English text type with aggressive word-splitting and
             auto-generated phrase queries (autoGeneratePhraseQueries="true").
             Compared with text_en it tokenizes on whitespace instead of using
             StandardTokenizer, adds WordDelimiterFilter to enable splitting and matching
             of words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
             enables KeywordMarkerFilter with protwords.txt, and has no
             EnglishPossessiveFilter. This means certain compound word cases will work,
             for example query "wi fi" will match document "WiFi" or "wi-fi".
             The index analyzer also catenates word and number parts
             (catenateWords="1", catenateNumbers="1"); the query analyzer does not ("0").
        -->
        <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
            <analyzer type="index">
                <tokenizer class="solr.WhitespaceTokenizerFactory" />
                <!-- Index-time synonyms are disabled; in this schema synonyms are only used at query time:
                <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
                -->
                <!-- Case-insensitive stop word removal. enablePositionIncrements="true" is set in both
                     the index and query analyzers to leave a 'gap' for more accurate phrase queries.
                -->
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true" />
                <!-- Index side: split into parts and also index the catenated words/numbers. -->
                <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" 
                        catenateAll="0" splitOnCaseChange="1" />
                <filter class="solr.LowerCaseFilterFactory" />
                <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
                <filter class="solr.PorterStemFilterFactory" />
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.WhitespaceTokenizerFactory" />
                <!-- Query-time only: synonym expansion (expand="true"), applied before stop-word removal. -->
                <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true" />
                <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true" />
                <!-- Query side: split into parts only; catenation is switched off. -->
                <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" 
                        catenateAll="0" splitOnCaseChange="1" />
                <filter class="solr.LowerCaseFilterFactory" />
                <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt" />
                <filter class="solr.PorterStemFilterFactory" />
            </analyzer>
        </fieldType>
    </types>
</schema>