// org.apache.tika.parser.ner.regex.RegexNERecogniser (source listing header: Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ner.regex;
import org.apache.commons.io.IOUtils;
import org.apache.tika.parser.ner.NERecogniser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This class offers an implementation of {@link NERecogniser} based on
* Regular Expressions.
*
* The default configuration file {@value NER_REGEX_FILE} is used when no
* argument constructor is used to instantiate this class. The regex file is
* loaded via {@link Class#getResourceAsStream(String)}, so the file should be
* placed in the same package path as of this class.
*
* The format of regex configuration as follows:
*
* ENTITY_TYPE1=REGEX1
* ENTITY_TYPE2=REGEX2
*
*
* For example, to extract week day from text:
* WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
*
* @since Nov. 7, 2015
*/
public class RegexNERecogniser implements NERecogniser {
public static final String NER_REGEX_FILE = "ner-regex.txt";
private static Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
public Set entityTypes = new HashSet<>();
public Map patterns;
private boolean available = false;
private static RegexNERecogniser INSTANCE;
public RegexNERecogniser(){
this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
}
public RegexNERecogniser(InputStream stream){
try {
patterns = new HashMap<>();
List lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
IOUtils.closeQuietly(stream);
for (String line : lines) {
line = line.trim();
if (line.isEmpty() || line.startsWith("#")){ //empty or comment
continue; //skip
}
int delim = line.indexOf('=');
if (delim < 0) { //delim not found
//skip
LOG.error("Skipped : Invalid config : {} ", line);
continue;
}
String type = line.substring(0, delim).trim();
String patternStr = line.substring(delim+1, line.length()).trim();
patterns.put(type, Pattern.compile(patternStr));
entityTypes.add(type);
}
} catch (Exception e) {
LOG.error(e.getMessage(), e);
}
available = !entityTypes.isEmpty();
}
public synchronized static RegexNERecogniser getInstance() {
if (INSTANCE == null) {
INSTANCE = new RegexNERecogniser();
}
return INSTANCE;
}
@Override
public boolean isAvailable() {
return available;
}
@Override
public Set getEntityTypes() {
return entityTypes;
}
/**
* finds matching sub groups in text
* @param text text containing interesting sub strings
* @param pattern pattern to find sub strings
* @return set of sub strings if any found, or null if none found
*/
public Set findMatches(String text, Pattern pattern){
Set results = null;
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
results = new HashSet<>();
results.add(matcher.group(0));
while (matcher.find()) {
results.add(matcher.group(0));
}
}
return results;
}
@Override
public Map> recognise(String text) {
Map> result = new HashMap<>();
for (Map.Entry entry : patterns.entrySet()) {
Set names = findMatches(text, entry.getValue());
if (names != null) {
result.put(entry.getKey(), names);
}
}
return result;
}
}