// opennlp.tools.util.featuregen.GeneratorFactory Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.util.featuregen;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.SAXException;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.XmlUtil;
import opennlp.tools.util.ext.ExtensionLoader;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.DictionarySerializer;
import opennlp.tools.util.model.POSModelSerializer;
/**
* Creates a set of feature generators based on a provided XML descriptor.
*
* Example of an XML descriptor:
*
* <featureGenerators name="namefind">
* <generator class="opennlp.tools.util.featuregen.CachedFeatureGeneratorFactory">
* <generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
* <int name="prevLength">2</int>
* <int name="nextLength">2</int>
* <generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/>
* </generator>
* <generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
* <int name="prevLength">2</int>
* <int name="nextLength">2</int>
* <generator class="opennlp.tools.util.featuregen.TokenFeatureGeneratorFactory"/>
* </generator>
* <generator class="opennlp.tools.util.featuregen.DefinitionFeatureGeneratorFactory"/>
* <generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>
* <generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/>
* <generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory">
* <bool name="begin">true</bool>
* <bool name="end">false</bool>
* </generator>
* </generator>
* </featureGenerators>
*
*
* Each XML element is mapped to a {@link GeneratorFactory.XmlFeatureGeneratorFactory} which
* is responsible for processing the element and creating the specified
* {@link AdaptiveFeatureGenerator}. Elements can contain other
* elements; in this case it is the responsibility of the mapped factory to process
* the child elements correctly. In some factories this leads to recursive
* calls to the
* {@link GeneratorFactory.XmlFeatureGeneratorFactory#create(Element, FeatureGeneratorResourceProvider)}
* method.
*
* In the example above the generators element is mapped to the
* {@link AggregatedFeatureGeneratorFactory} which then
* creates all the aggregated {@link AdaptiveFeatureGenerator}s. To
* accomplish this it evaluates the mapping with the same mechanism
* and gives the child elements to the corresponding factories. All
* created generators are added to a new instance of the
* {@link AggregatedFeatureGenerator} which is then returned.
*/
public class GeneratorFactory {
/**
* The {@link XmlFeatureGeneratorFactory} is responsible for constructing
* an {@link AdaptiveFeatureGenerator} from a given XML {@link Element}
* which contains all necessary configuration, if any.
* <p>
* This is the contract of the classic descriptor format, in which the factory
* is looked up by the element's tag name; it is retained for backward
* compatibility only.
*/
@Deprecated // TODO: (OPENNLP-1174) just remove when back-compat is no longer needed
interface XmlFeatureGeneratorFactory {
/**
* Creates an {@link AdaptiveFeatureGenerator} from the describing
* XML element.
*
* @param generatorElement the element which contains the configuration
* @param resourceManager the resource manager which could be used
* to access referenced resources
*
* @return the configured {@link AdaptiveFeatureGenerator}
*
* @throws InvalidFormatException declared so that implementations can reject
* an element whose configuration they cannot use
*/
AdaptiveFeatureGenerator create(Element generatorElement,
FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException;
}
public static abstract class AbstractXmlFeatureGeneratorFactory {
protected Element generatorElement;
protected FeatureGeneratorResourceProvider resourceManager;
// to respect the order in AggregatedFeatureGenerator, let's use LinkedHashMap
protected LinkedHashMap args;
public AbstractXmlFeatureGeneratorFactory() {
args = new LinkedHashMap<>();
}
public Map>
getArtifactSerializerMapping() throws InvalidFormatException {
return null;
}
final void init(Element element, FeatureGeneratorResourceProvider resourceManager)
throws InvalidFormatException {
this.generatorElement = element;
this.resourceManager = resourceManager;
List generators = new ArrayList<>();
NodeList childNodes = generatorElement.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
Node childNode = childNodes.item(i);
if (childNode instanceof Element) {
Element elem = (Element)childNode;
String type = elem.getTagName();
if (type.equals("generator")) {
String key = "generator#" + Integer.toString(generators.size());
AdaptiveFeatureGenerator afg = buildGenerator(elem, resourceManager);
generators.add(afg);
if (afg != null)
args.put(key, afg);
}
else {
String name = elem.getAttribute("name");
Node cn = elem.getFirstChild();
Text text = (Text)cn;
switch (type) {
case "int" :
args.put(name, Integer.parseInt(text.getWholeText()));
break;
case "long" :
args.put(name, Long.parseLong(text.getWholeText()));
break;
case "float" :
args.put(name, Float.parseFloat(text.getWholeText()));
break;
case "double" :
args.put(name, Double.parseDouble(text.getWholeText()));
break;
case "str" :
args.put(name, text.getWholeText());
break;
case "bool" :
args.put(name, Boolean.parseBoolean(text.getWholeText()));
break;
default:
throw new InvalidFormatException(
"child element must be one of generator, int, long, float, double," +
" str or bool");
}
}
}
}
if (generators.size() > 1) {
AdaptiveFeatureGenerator aggregatedFeatureGenerator =
new AggregatedFeatureGenerator(generators.toArray(
new AdaptiveFeatureGenerator[generators.size()]));
args.put("generator#0", aggregatedFeatureGenerator);
}
}
public int getInt(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
}
else if (value instanceof Integer) {
return (Integer)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be integer!");
}
}
public int getInt(String name, int defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
}
else if (value instanceof Integer) {
return (Integer)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be integer!");
}
}
public long getLong(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
}
else if (value instanceof Long) {
return (Long)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be long!");
}
}
public long getLong(String name, long defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
}
else if (value instanceof Long) {
return (Long)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be long!");
}
}
public float getFloat(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
}
else if (value instanceof Float) {
return (Float)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be float!");
}
}
public float getFloat(String name, float defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
}
else if (value instanceof Float) {
return (Float)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be float!");
}
}
public double getDouble(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
}
else if (value instanceof Double) {
return (Double)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be double!");
}
}
public double getDouble(String name, double defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
}
else if (value instanceof Double) {
return (Double)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be double!");
}
}
public String getStr(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
}
else if (value instanceof String) {
return (String)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be double!");
}
}
public String getStr(String name, String defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
}
else if (value instanceof String) {
return (String)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be String!");
}
}
public boolean getBool(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
}
else if (value instanceof Boolean) {
return (Boolean)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be boolean!");
}
}
public boolean getBool(String name, boolean defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
}
else if (value instanceof Boolean) {
return (Boolean)value;
}
else {
throw new InvalidFormatException("parameter " + name + " must be boolean!");
}
}
/**
*
* @return null if the subclass uses {@link #resourceManager} to instantiate
* @throws InvalidFormatException
*/
public abstract AdaptiveFeatureGenerator create() throws InvalidFormatException;
}
// TODO: We have to support custom resources here. How does it work ?!
// Attributes get into a Map properties
// How can serialization be supported ?!
// The model is loaded, and the manifest should contain all serializer classes registered for the
// resources by name.
// When training, the descriptor could be consulted first to register the serializers, and afterwards
// they are stored in the model.
// TODO: (OPENNLP-1174) just remove this class when back-compat is no longer needed
static class CustomFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
public AdaptiveFeatureGenerator create(Element generatorElement,
FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
String featureGeneratorClassName = generatorElement.getAttribute("class");
AdaptiveFeatureGenerator generator =
ExtensionLoader.instantiateExtension(AdaptiveFeatureGenerator.class, featureGeneratorClassName);
if (generator instanceof CustomFeatureGenerator) {
CustomFeatureGenerator customGenerator = (CustomFeatureGenerator) generator;
Map properties = new HashMap<>();
NamedNodeMap attributes = generatorElement.getAttributes();
for (int i = 0; i < attributes.getLength(); i++) {
Node attribute = attributes.item(i);
if (!"class".equals(attribute.getNodeName())) {
properties.put(attribute.getNodeName(), attribute.getNodeValue());
}
}
if (resourceManager != null) {
customGenerator.init(properties, resourceManager);
}
}
return generator;
}
static void register(Map factoryMap) {
factoryMap.put("custom", new CustomFeatureGeneratorFactory());
}
}
// TODO: (OPENNLP-1174) just remove when back-compat is no longer needed
// Classic-format registry: maps an element tag name to the factory handling it.
private static final Map<String, XmlFeatureGeneratorFactory> factories = new HashMap<>();

// TODO: (OPENNLP-1174) just remove when back-compat is no longer needed
static {
  AggregatedFeatureGeneratorFactory.register(factories);
  CachedFeatureGeneratorFactory.register(factories);
  CharacterNgramFeatureGeneratorFactory.register(factories);
  DefinitionFeatureGeneratorFactory.register(factories);
  DictionaryFeatureGeneratorFactory.register(factories);
  DocumentBeginFeatureGeneratorFactory.register(factories);
  PreviousMapFeatureGeneratorFactory.register(factories);
  SentenceFeatureGeneratorFactory.register(factories);
  TokenClassFeatureGeneratorFactory.register(factories);
  TokenFeatureGeneratorFactory.register(factories);
  BigramNameFeatureGeneratorFactory.register(factories);
  TokenPatternFeatureGeneratorFactory.register(factories);
  PosTaggerFeatureGeneratorFactory.register(factories);
  PrefixFeatureGeneratorFactory.register(factories);
  SuffixFeatureGeneratorFactory.register(factories);
  WindowFeatureGeneratorFactory.register(factories);
  WordClusterFeatureGeneratorFactory.register(factories);
  BrownClusterTokenFeatureGeneratorFactory.register(factories);
  BrownClusterTokenClassFeatureGeneratorFactory.register(factories);
  BrownClusterBigramFeatureGeneratorFactory.register(factories);
  CustomFeatureGeneratorFactory.register(factories);
  POSTaggerNameFeatureGeneratorFactory.register(factories);
}
/**
 * Creates an {@link AdaptiveFeatureGenerator} for the provided element, supporting
 * both descriptor formats.
 * <p>
 * If the element is {@code featureGenerators} (new format), every child
 * {@code generator} element is built with
 * {@link #buildGenerator(Element, FeatureGeneratorResourceProvider)}; several
 * generators are combined into an {@link AggregatedFeatureGenerator}, and the result
 * is wrapped in a {@link CachedFeatureGenerator} when the {@code cache} attribute
 * parses as {@code true}. Otherwise (classic format) the factory registered for the
 * element's tag name creates the generator.
 *
 * @param generatorElement the descriptor element to process
 * @param resourceManager the resource provider passed on to the factories
 *
 * @return the created feature generator
 *
 * @throws InvalidFormatException if the element or one of its children is not
 *         expected, if no generator is declared, or if a cached generator is
 *         declared explicitly
 */
@Deprecated // TODO: (OPENNLP-1174) remove back-compat support when it is unnecessary
static AdaptiveFeatureGenerator createGenerator(Element generatorElement,
    FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {

  String elementName = generatorElement.getTagName();

  // support classic format
  if (!elementName.equals("featureGenerators")) {
    XmlFeatureGeneratorFactory generatorFactory = factories.get(elementName);
    if (generatorFactory == null) {
      throw new InvalidFormatException("Unexpected element: " + elementName);
    }
    return generatorFactory.create(generatorElement, resourceManager);
  }

  // new format: collect the generators declared directly below the root
  List<AdaptiveFeatureGenerator> generators = new ArrayList<>();
  NodeList childNodes = generatorElement.getChildNodes();
  for (int i = 0; i < childNodes.getLength(); i++) {
    Node childNode = childNodes.item(i);
    if (childNode instanceof Element) {
      Element elem = (Element) childNode;
      String type = elem.getTagName();
      if (!type.equals("generator")) {
        // report the offending child, not the (always valid) parent tag
        throw new InvalidFormatException("Unexpected element: " + type);
      }
      generators.add(buildGenerator(elem, resourceManager));
    }
  }

  if (generators.isEmpty()) {
    throw new InvalidFormatException("featureGenerators must have one or more generators");
  }

  AdaptiveFeatureGenerator featureGenerator;
  if (generators.size() == 1) {
    featureGenerator = generators.get(0);
  } else {
    featureGenerator = new AggregatedFeatureGenerator(generators.toArray(
        new AdaptiveFeatureGenerator[generators.size()]));
  }

  // disallow manually specifying CachedFeatureGenerator
  if (featureGenerator instanceof CachedFeatureGenerator) {
    throw new InvalidFormatException("CachedFeatureGeneratorFactory cannot be specified manually. " +
        "Use cache=\"true\" attribute in featureGenerators element instead.");
  }

  // check cache usage
  if (Boolean.parseBoolean(generatorElement.getAttribute("cache"))) {
    return new CachedFeatureGenerator(featureGenerator);
  }
  return featureGenerator;
}
/**
 * Finds the first child of {@code elem} that is itself an element, skipping
 * text, comment and other non-element nodes.
 *
 * @param elem the parent element
 * @return the first child element, or {@code null} if there is none
 */
static Element getFirstChild(Element elem) {
  Node candidate = elem.getFirstChild();
  while (candidate != null) {
    if (candidate instanceof Element) {
      return (Element) candidate;
    }
    candidate = candidate.getNextSibling();
  }
  return null;
}
/**
 * Creates an {@link AdaptiveFeatureGenerator} for the provided {@code generator}
 * element. The element's {@code class} attribute names an
 * {@link AbstractXmlFeatureGeneratorFactory} implementation; it is instantiated
 * through its public no-argument constructor, initialized with the element and
 * then asked to create the generator.
 *
 * @param generatorElement the {@code generator} element
 * @param resourceManager the resource provider handed to the factory
 *
 * @return the generator returned by the factory's {@code create()} method
 *
 * @throws InvalidFormatException if the {@code class} attribute is missing or the
 *         factory rejects the element's configuration
 * @throws RuntimeException if the factory class cannot be loaded or instantiated
 */
static AdaptiveFeatureGenerator buildGenerator(Element generatorElement,
    FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
  String className = generatorElement.getAttribute("class");
  // DOM reports a missing attribute as the empty string, never as null.
  if (className == null || className.isEmpty()) {
    throw new InvalidFormatException("generator must have class attribute");
  }

  try {
    Class<?> factoryClass = Class.forName(className);
    Constructor<?> constructor = factoryClass.getConstructor();
    AbstractXmlFeatureGeneratorFactory factory =
        (AbstractXmlFeatureGeneratorFactory) constructor.newInstance();
    factory.init(generatorElement, resourceManager);
    return factory.create();
  } catch (ReflectiveOperationException e) {
    // class not found, no default constructor, constructor failure, ...
    throw new RuntimeException(e);
  }
}
/**
 * Parses the XML descriptor into a DOM document.
 *
 * @param xmlDescriptorIn the stream the descriptor is read from
 * @return the parsed document
 * @throws IOException if reading fails; an {@link InvalidFormatException} is thrown
 *         when the parser reports a {@link SAXException}
 */
private static org.w3c.dom.Document createDOM(InputStream xmlDescriptorIn)
    throws IOException {
  try {
    return XmlUtil.createDocumentBuilder().parse(xmlDescriptorIn);
  } catch (SAXException saxError) {
    throw new InvalidFormatException("Descriptor is not valid XML!", saxError);
  }
}
/**
 * Builds the {@link AdaptiveFeatureGenerator} described by an XML descriptor.
 *
 * The descriptor is parsed and its document element is handed to
 * {@link #createGenerator(Element, FeatureGeneratorResourceProvider)}.
 *
 * @param xmlDescriptorIn the {@link InputStream} from which the descriptor
 *     is read, the stream remains open and must be closed by the caller.
 *
 * @param resourceManager the resource manager which is used to resolve resources
 *     referenced by a key in the descriptor
 *
 * @return created feature generators
 *
 * @throws IOException if an error occurs during reading from the descriptor
 *     {@link InputStream}
 */
public static AdaptiveFeatureGenerator create(InputStream xmlDescriptorIn,
    FeatureGeneratorResourceProvider resourceManager) throws IOException {
  // TODO: (OPENNLP-1174) use #buildGenerator() after back-compat support is gone
  Element root = createDOM(xmlDescriptorIn).getDocumentElement();
  return createGenerator(root, resourceManager);
}
public static Map> extractArtifactSerializerMappings(
InputStream xmlDescriptorIn) throws IOException {
org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
Element element = xmlDescriptorDOM.getDocumentElement();
String elementName = element.getTagName();
// check it is new format?
if (elementName.equals("featureGenerators")) {
Map> mapping = new HashMap<>();
NodeList nodes = element.getChildNodes();
for (int i = 0; i < nodes.getLength(); i++) {
if (nodes.item(i) instanceof Element) {
Element childElem = (Element)nodes.item(i);
if (childElem.getTagName().equals("generator")) {
extractArtifactSerializerMappings(mapping, childElem);
}
}
}
return mapping;
}
else {
return extractArtifactSerializerMappingsClassicFormat(element);
}
}
static void extractArtifactSerializerMappings(Map> mapping, Element element) {
String className = element.getAttribute("class");
if (className != null) {
try {
Class factoryClass = Class.forName(className);
try {
Constructor constructor = factoryClass.getConstructor();
AbstractXmlFeatureGeneratorFactory factory =
(AbstractXmlFeatureGeneratorFactory)constructor.newInstance();
factory.init(element, null);
Map> map = factory.getArtifactSerializerMapping();
if (map != null)
mapping.putAll(map);
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
} catch (InvocationTargetException e) {
throw new RuntimeException(e);
} catch (InstantiationException e) {
throw new RuntimeException(e);
} catch (IllegalAccessException e) {
throw new RuntimeException(e);
} catch (InvalidFormatException ignored) {
}
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
NodeList nodes = element.getChildNodes();
for (int i = 0; i < nodes.getLength(); i++) {
if (nodes.item(i) instanceof Element) {
Element childElem = (Element)nodes.item(i);
if (childElem.getTagName().equals("generator")) {
extractArtifactSerializerMappings(mapping, childElem);
}
}
}
}
@Deprecated // TODO: (OPENNLP-1174) remove back-compat support when it is unnecessary
static Map> extractArtifactSerializerMappingsClassicFormat(
Element elem) throws IOException {
Map> mapping = new HashMap<>();
XPath xPath = XPathFactory.newInstance().newXPath();
NodeList customElements;
try {
XPathExpression exp = xPath.compile("//custom");
customElements = (NodeList) exp.evaluate(elem, XPathConstants.NODESET);
} catch (XPathExpressionException e) {
throw new IllegalStateException("The hard coded XPath expression should always be valid!");
}
for (int i = 0; i < customElements.getLength(); i++) {
if (customElements.item(i) instanceof Element) {
Element customElement = (Element) customElements.item(i);
// Note: The resource provider is not available at that point, to provide
// resources they need to be loaded first!
AdaptiveFeatureGenerator generator = createGenerator(customElement, null);
if (generator instanceof ArtifactToSerializerMapper) {
ArtifactToSerializerMapper mapper = (ArtifactToSerializerMapper) generator;
mapping.putAll(mapper.getArtifactSerializerMapping());
}
}
}
NodeList allElements;
try {
XPathExpression exp = xPath.compile("//*");
allElements = (NodeList) exp.evaluate(elem, XPathConstants.NODESET);
} catch (XPathExpressionException e) {
throw new IllegalStateException("The hard coded XPath expression should always be valid!");
}
for (int i = 0; i < allElements.getLength(); i++) {
if (allElements.item(i) instanceof Element) {
Element xmlElement = (Element) allElements.item(i);
String dictName = xmlElement.getAttribute("dict");
if (dictName != null) {
switch (xmlElement.getTagName()) {
case "wordcluster":
mapping.put(dictName, new WordClusterDictionary.WordClusterDictionarySerializer());
break;
case "brownclustertoken":
mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
break;
case "brownclustertokenclass"://, ;
mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
break;
case "brownclusterbigram": //, ;
mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
break;
case "dictionary":
mapping.put(dictName, new DictionarySerializer());
break;
}
}
String modelName = xmlElement.getAttribute("model");
if (modelName != null) {
switch (xmlElement.getTagName()) {
case "tokenpos":
mapping.put(modelName, new POSModelSerializer());
break;
}
}
}
}
return mapping;
}
/**
 * Provides a list with all the elements in the xml feature descriptor.
 *
 * @param xmlDescriptorIn the xml feature descriptor
 * @return a list containing all elements of the descriptor, in the order the
 *         XPath expression {@code //*} returns them
 * @throws IOException if the input stream cannot be read
 * @throws InvalidFormatException if xml is not well-formed
 */
public static List<Element> getDescriptorElements(InputStream xmlDescriptorIn)
    throws IOException {

  org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
  XPath xPath = XPathFactory.newInstance().newXPath();

  NodeList allElements;
  try {
    XPathExpression exp = xPath.compile("//*");
    allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
  } catch (XPathExpressionException e) {
    throw new IllegalStateException("The hard coded XPath expression should always be valid!", e);
  }

  List<Element> elements = new ArrayList<>();
  for (int i = 0; i < allElements.getLength(); i++) {
    Node node = allElements.item(i);
    if (node instanceof Element) {
      elements.add((Element) node);
    }
  }
  return elements;
}
}