// org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ner.opennlp;
import org.apache.tika.parser.ner.NERecogniser;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
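/**
 * This implementation of {@link NERecogniser} chains a list of
 * {@link OpenNLPNameFinder}s for which NER models are
 * available on the classpath.
 *
 * <p>The following models are looked up via the class loader during
 * initialization. Each model file is resolved relative to
 * {@link #MODELS_DIR}:</p>
 *
 * <table>
 *     <caption>Default NER models</caption>
 *     <tr>
 *         <th>Entity Type</th><th>Path</th>
 *     </tr>
 *     <tr>
 *         <td>{@value PERSON}</td><td>{@value PERSON_FILE}</td>
 *     </tr>
 *     <tr>
 *         <td>{@value LOCATION}</td><td>{@value LOCATION_FILE}</td>
 *     </tr>
 *     <tr>
 *         <td>{@value ORGANIZATION}</td><td>{@value ORGANIZATION_FILE}</td>
 *     </tr>
 *     <tr>
 *         <td>{@value TIME}</td><td>{@value TIME_FILE}</td>
 *     </tr>
 *     <tr>
 *         <td>{@value DATE}</td><td>{@value DATE_FILE}</td>
 *     </tr>
 *     <tr>
 *         <td>{@value PERCENT}</td><td>{@value PERCENT_FILE}</td>
 *     </tr>
 *     <tr>
 *         <td>{@value MONEY}</td><td>{@value MONEY_FILE}</td>
 *     </tr>
 * </table>
 *
 * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL
 */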
public class OpenNLPNERecogniser implements NERecogniser {
public static final String MODELS_DIR = OpenNLPNERecogniser.class
.getPackage().getName().replace(".", "/");
public static final String PERSON_FILE = "ner-person.bin";
public static final String LOCATION_FILE = "ner-location.bin";
public static final String ORGANIZATION_FILE = "ner-organization.bin";
public static final String TIME_FILE = "ner-time.bin";
public static final String DATE_FILE = "ner-date.bin";
public static final String PERCENT_FILE = "ner-percentage.bin";
public static final String MONEY_FILE = "ner-money.bin";
//Default (English) Models for the common 7 classes of named types
public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE;
public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE;
public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE;
public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE;
public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE;
public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE;
public static final String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE;
public static final Map DEFAULT_MODELS =
new HashMap(){{
put(PERSON, NER_PERSON_MODEL);
put(LOCATION, NER_LOCATION_MODEL);
put(ORGANIZATION, NER_ORGANIZATION_MODEL);
put(TIME, NER_TIME_MODEL);
put(DATE, NER_DATE_MODEL);
put(PERCENT, NER_PERCENT_MODEL);
put(MONEY, NER_MONEY_MODEL);
}};
private Set entityTypes;
private List nameFinders;
private boolean available;
/**
* Creates a default chain of Name finders using default OpenNLP recognizers
*/
public OpenNLPNERecogniser(){
this(DEFAULT_MODELS);
}
/**
* Creates a chain of Named Entity recognisers
* @param models map of entityType -> model path
* NOTE: the model path should be known to class loader.
*/
public OpenNLPNERecogniser(Map models){
this.nameFinders = new ArrayList<>();
this.entityTypes = new HashSet<>();
for (Map.Entry entry : models.entrySet()) {
OpenNLPNameFinder finder =
new OpenNLPNameFinder(entry.getKey(), entry.getValue());
if (finder.isAvailable()) {
this.nameFinders.add(finder);
this.entityTypes.add(entry.getKey());
}
}
this.entityTypes = Collections.unmodifiableSet(this.entityTypes);
this.available = nameFinders.size() > 0; //at least one finder is present
}
@Override
public boolean isAvailable() {
return available;
}
@Override
public Set getEntityTypes() {
return entityTypes;
}
@Override
public Map> recognise(String text) {
String[] tokens = OpenNLPNameFinder.tokenize(text);
Map> names = new HashMap<>();
for (OpenNLPNameFinder finder : nameFinders) {
names.putAll(finder.findNames(tokens));
}
return names;
}
}