All Downloads are FREE. Search and download functionalities are using the official Maven repository.

sar.pulsar-index.1.10.8.source-code.indexer-mapping.xml Maven / Gradle / Ivy

<?xml version="1.0" encoding="UTF-8"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!-- Just copy it from Solr -->
<mapping>
    <fields>

        <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>

        <!-- Common metadata fields, named specifically to match up with
          SolrCell metadata when parsing rich documents such as Word, PDF.
          Some fields are multiValued only because Tika currently may return
          multiple values for them. Some metadata is parsed from the documents,
          but there are some which come from the client context:
            "content_type": From the HTTP headers of incoming stream
        -->

        <!-- ************************************************************  -->
        <!--                                元数据                         -->
        <!-- ************************************************************* -->
        <!-- 站点域名 -->
        <field name="domain" type="text_general" indexed="true" stored="true"/>
        <!-- 站点主机名 -->
        <field name="host" type="text_general" indexed="true" stored="true"/>
        <!-- 站点名称 -->
        <field name="site_name" type="text_zh" indexed="true" stored="true"/>
        <!-- 页面链接 -->
        <field name="url" type="text_zh" indexed="true" stored="true"/>
        <!-- 页面链接 -->
        <field name="seed_url" type="text_zh" indexed="true" stored="true"/>
        <!-- 页面类型,index/detail page -->
        <field name="page_category" type="string" indexed="true" stored="true"/>
        <!-- 页面编码 -->
        <field name="encoding" type="string" indexed="true" stored="true"/>
        <!-- 页面标题(元数据) -->
        <field name="page_title" type="text_zh" indexed="true" stored="true"/>
        <!-- 页面关键词(元数据) -->
        <field name="meta_keywords" type="text_zh" indexed="true" stored="true"/>
        <!-- 页面描述(元数据) -->
        <field name="meta_description" type="text_zh" indexed="true" stored="true"/>
        <!-- MIME type, from HTML header -->
        <field name="content_type" type="string" indexed="true" stored="true"/>
        <!-- MIME type, from HTML header -->
        <field name="mime_type" type="string" indexed="true" stored="true" multiValued="true"/>
        <!-- HTTP content length -->
        <field name="content_length" type="int" indexed="true" stored="true" multiValued="true"/>
        <!-- 正文内链接列表 -->
        <field name="content_links" type="string" indexed="true" stored="true" multiValued="true"/>
        <!-- 外部链接数 -->
        <field name="outlinks_count" type="string" indexed="true" stored="true"/>
        <!-- 首次采集时间 -->
        <field name="first_crawl_time" type="date" indexed="true" stored="true"/>
        <!-- 最近采集时间 -->
        <field name="last_crawl_time" type="date" indexed="true" stored="true"/>
        <!-- 采集时间列表 -->
        <field name="fetch_time_history" type="string" indexed="false" stored="true"/>
        <!-- 首次索引时间 -->
        <field name="first_index_time" type="date" indexed="true" stored="true"/>
        <!-- 最近索引时间 -->
        <field name="last_index_time" type="date" indexed="true" stored="true"/>
        <!-- 最近索引时间,Pulsar兼容 -->
        <field name="last_modified_s" type="string" indexed="true" />
        <!-- 索引时间列表 -->
        <field name="index_time_history" type="string" indexed="false" stored="true"/>
        <!-- 文档最后修改时间(HTTP HEADER) -->
        <field name="header_last_modified" type="date" indexed="true" stored="true"/>

        <!-- ************************************************************  -->
        <!--                        Pulsar元数据                         -->
        <!-- ************************************************************* -->
        <field name="anchor" type="text_zh" indexed="true" stored="true" multiValued="true"/>
        <field name="batchId" type="string" indexed="true" stored="true"/>
        <field name="boost" type="float" indexed="true" stored="true"/>
        <field name="digest" type="string" indexed="true" stored="true"/>

        <!-- ************************************************************  -->
        <!--                        业务数据 - 文章内容                     -->
        <!-- ************************************************************* -->
        <!-- 监控主题名称。由管理员定义,如:微博微信定向监测/预警信息/凉山要闻与自媒体动态/悬崖村,TODO : 这个字段是否需要保存到Solr?还是仅仅保存到网站系统? -->
        <field name="admin_subject" indexed="true" type="text_zh" stored="true"/>
        <!-- 监控标签。由管理员定义,如:其他负面预警。TODO : 这个字段是否需要保存到Solr?还是仅仅保存到网站系统? -->
        <field name="admin_tags" indexed="true" type="text_zh" stored="true" multiValued="true"/>

        <!-- 资源类别。包括博客、论坛、新闻、纸媒、贴吧、资讯、视频、问答、微博、微信 -->
        <field name="resource_category" type="text_zh" indexed="true" stored="true"/>
        <!-- 文章标题,由信息抽取获得 -->
        <field name="title" type="text_zh" indexed="true" stored="true" multiValued="true"/>
        <!-- 文章标题,由信息抽取获得 -->
        <field name="article_title" type="text_zh" indexed="true" stored="true" multiValued="true"/>
        <!-- 实体名称,由信息抽取获得 -->
        <field name="entity_name" type="text_zh" indexed="true" stored="true" multiValued="true"/>
        <!--
             正文全文,由信息抽取获得。拷贝到text字段计算索引; 由于需要存储词频信息,故需要索引; 由html_content存储内容,故不存储
             存储the term vector, the term frequency, inverse document frequency, position and offset information
        -->
        <field name="text_content" type="text_zh" indexed="true" stored="false" />
        <!-- 正文HTML,由信息抽取获得。拷贝到text字段计算索引,故不索引 -->
        <field name="html_content" type="text_zh" indexed="false" stored="true"/>

        <!-- 引用网站,由信息抽取获得 -->
        <field name="reference" type="text_zh" indexed="true" stored="true"/>
        <!-- 原始网站,由传播路径计算获得,默认为采集网站 -->
        <field name="original_site" type="text_zh" indexed="true" stored="true"/>
        <!-- 作者,由信息抽取获得。使用标点分词 -->
        <field name="author" type="text_punctuation" indexed="true" stored="true" multiValued="true"/>
        <!-- 责任编辑,由信息抽取获得。使用标点分词 -->
        <field name="director" type="text_punctuation" indexed="true" stored="true" multiValued="true"/>
        <!-- 摘要,由信息抽取获得 -->
        <field name="abstract" type="text_zh" indexed="true" stored="true"/>
        <!-- 自动摘要,由自然语言处理模块生成 -->
        <!-- @experimental -->
        <field name="auto_abstract" type="text_zh" indexed="true" stored="true"/>
        <!-- 发布时间,由信息抽取获得 -->
        <field name="publish_time" type="date" indexed="true" stored="true"/>
        <!-- 评论数,由信息抽取获得 -->
        <field name="comment_count" type="int" indexed="true" stored="true"/>
        <!-- 点击量,由信息抽取获得 -->
        <field name="click_count" type="int" indexed="true" stored="true"/>
        <!-- 转发量,由信息抽取获得 -->
        <field name="forward_count" type="int" indexed="true" stored="true"/>
        <!-- 情感分析。由情感分析模块计算得出。可存储多个不同情感维度,格式:name:value, name:value, ... -->
        <field name="sentiment" type="text_punctuation" indexed="true" stored="true" multiValued="true"/>
        <!-- 情感分析。由情感分析模块计算得出。-->
        <field name="sentiment_main" type="text_punctuation" indexed="true" stored="true" multiValued="true"/>

        <!-- ************************************************************  -->
        <!--                        业务数据 - 文章评论                     -->
        <!-- ************************************************************* -->
        <field name="comment_article_url" type="text_zh" indexed="true" stored="true"/>
        <field name="comment_author" type="text_zh" indexed="true" stored="true"/>
        <field name="comment_time" type="date" indexed="true" stored="true"/>
        <field name="comment_content" type="text_zh" indexed="true" stored="true"/>
        <field name="comment_ip" type="string" indexed="true" stored="true"/>
        <field name="comment_device" type="text_zh" indexed="true" stored="true"/>

    </fields>
    <copyFields>

    </copyFields>
    <uniqueKey>id</uniqueKey>
</mapping>




© 2015 - 2025 Weber Informatics LLC | Privacy Policy