<!--
  Source listing: edu.emory.mathcs.nlp.configuration.config-train-ner.xml (Maven / Gradle / Ivy)

  Go to download
  Show more of this group | Show more artifacts with this name
  Show all versions of nlp4j-api | Show documentation
  The newest version!
-->
<!-- named entity recognition -->
<configuration>
    <!-- Input column layout: each entry ties a column index of the TSV data
         to the name of the field read from that column. -->
    <tsv>
        <column field="form"   index="1"/>
        <column field="lemma"  index="2"/>
        <column field="pos"    index="3"/>
        <column field="feats"  index="4"/>
        <column field="nament" index="5"/>
    </tsv>

    <!-- Lexicon resources used by the feature template below.
         Each child element's text is the path of one resource file; its "field"
         attribute names a token field.
         NOTE(review): presumably "field" is the token form used as the lookup key
         into that resource, and the paths are resolved by the loader (classpath vs.
         file system is not visible here); confirm against the consumer.
         Do not reformat the element text: whitespace would become part of the path. -->
    <lexica>
        <word_clusters field="word_form_simplified_lowercase">edu/emory/mathcs/nlp/lexica/en-brown-clusters-simplified-lowercase.xz</word_clusters>
        <word_embeddings field="word_form_undigitalized">edu/emory/mathcs/nlp/lexica/en-word-embeddings-undigitalized.xz</word_embeddings>
        <named_entity_gazetteers field="word_form_simplified">edu/emory/mathcs/nlp/lexica/en-named-entity-gazetteers-simplified.xz</named_entity_gazetteers>
    </lexica>

    <!-- Optimizer (training) hyper-parameters.
         The exact semantics of each value are defined by the consumer of this file
         and are not visible here; the notes below are reviewer assumptions. -->
    <optimizer>
        <!-- presumably the L1 regularization strength; confirm -->
        <l1_regularization>0.00001</l1_regularization>
        <!-- name of the optimization algorithm to use -->
        <algorithm>adagrad-mini-batch</algorithm>
        <learning_rate>0.02</learning_rate>
        <!-- presumably a minimum-frequency threshold for features; 0 would keep all; confirm -->
        <feature_cutoff>0</feature_cutoff>
        <!-- NOTE(review): "lols" is not explained in this file; confirm the meaning of
             "fixed" and "decaying" against the trainer before changing them -->
        <lols fixed="0" decaying="0.95"/>
        <!-- presumably the mini-batch size used by the algorithm above; unit not visible here -->
        <batch_size>5</batch_size>
        <!-- upper limit on training epochs -->
        <max_epoch>20</max_epoch>
        <bias>0</bias>
    </optimizer>

    <!-- Reducer settings.
         NOTE(review): this file does not say what is being reduced; presumably a
         post-training model-size reduction that must keep the score at or above
         "lower_bound", searched from "start" in steps of "increment" for "iteration"
         rounds. Confirm every value's meaning against the consumer before tuning. -->
    <reducer>
        <lower_bound>86.98</lower_bound>
        <increment>0.01</increment>
        <iteration>2</iteration>
        <start>0.05</start>
        <range>0.005</range>
    </reducer>

    <!-- Feature template: one <feature> element per feature.
         Notation used in this block:
           * Attributes f0, f1, f2 each hold one component written as
             "offset:field" or "offset:field:arg" (e.g. "i-1:lemma", "i:suffix:3").
           * Offsets range from i-2 to i+2; presumably "i" is the current token and
             the signed number selects a neighbouring token (confirm).
           * A feature with several fN attributes combines its components; the
             section comments below call these 2-gram and 3-gram features.
           * set="true" appears only on gazetteer and word-cluster features;
             presumably it marks fields that yield a set of values (confirm).
         NOTE(review): the order of <feature> elements may determine feature indexing
         in trained models; avoid reordering without checking the consumer. -->
    <feature_template>
        <!-- 1-gram features -->
        <feature f0="i-1:word_form_simplified"/>
        <feature f0="i:word_form_simplified"/>
        <feature f0="i+1:word_form_simplified"/>

        <!-- lowercased simplified forms over a five-token window (i-2 to i+2) -->
        <feature f0="i-2:word_form_simplified_lowercase"/>
        <feature f0="i-1:word_form_simplified_lowercase"/>
        <feature f0="i:word_form_simplified_lowercase"/>
        <feature f0="i+1:word_form_simplified_lowercase"/>
        <feature f0="i+2:word_form_simplified_lowercase"/>

        <feature f0="i-1:word_shape"/>
        <feature f0="i:word_shape"/>
        <feature f0="i+1:word_shape"/>

        <feature f0="i-1:part_of_speech_tag"/>
        <feature f0="i:part_of_speech_tag"/>
        <feature f0="i+1:part_of_speech_tag"/>

        <!-- named entity tags are referenced only at negative offsets (i-2, i-1) -->
        <feature f0="i-2:named_entity_tag"/>
        <feature f0="i-1:named_entity_tag"/>

        <!-- gazetteer features; the resource is declared under <lexica> -->
        <feature set="true" f0="i-1:named_entity_gazetteers"/>
        <feature set="true" f0="i:named_entity_gazetteers"/>
        <feature set="true" f0="i+1:named_entity_gazetteers"/>

        <!-- 2-gram features -->
        <feature f0="i-2:part_of_speech_tag" f1="i-1:part_of_speech_tag"/>
        <feature f0="i+1:part_of_speech_tag" f1="i+2:part_of_speech_tag"/>

        <!-- NOTE(review): the second line pairs the lemma at i+1 with the
             part-of-speech tag at i (not i+1); confirm this is intended -->
        <feature f0="i:lemma"   f1="i:part_of_speech_tag"/>
        <feature f0="i+1:lemma" f1="i:part_of_speech_tag"/>

        <!-- lemma pairs of adjacent tokens -->
        <feature f0="i-1:lemma" f1="i:lemma"/>
        <feature f0="i:lemma"   f1="i+1:lemma"/>
        <feature f0="i+1:lemma" f1="i+2:lemma"/>

        <!-- 3-gram features -->
        <feature f0="i-1:lemma" f1="i-1:part_of_speech_tag" f2="i-1:named_entity_tag"/>
        <feature f0="i:lemma"   f1="i:part_of_speech_tag"   f2="i-1:named_entity_tag"/>

        <!-- affix features -->
        <!-- the trailing ":3" is an argument to the field; presumably the affix length (confirm) -->
        <feature f0="i:suffix:3"/>
        <feature f0="i+1:prefix:3"/>

        <feature f0="i:suffix:3"   f1="i:word_form_simplified_lowercase"/>
        <feature f0="i-1:suffix:3" f1="i:word_form_simplified_lowercase"/>

        <!-- word cluster features -->
        <feature set="true" f0="i-2:word_clusters"/>
        <feature set="true" f0="i-1:word_clusters"/>
        <feature set="true" f0="i:word_clusters"/>
        <feature set="true" f0="i+1:word_clusters"/>
        <feature set="true" f0="i+2:word_clusters"/>

        <!-- word embedding feature for the token at offset i only -->
        <feature f0="i:word_embedding"/>
    </feature_template>
</configuration>