// opennlp.tools.util.featuregen.GeneratorFactory Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.util.featuregen;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.SAXException;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.XmlUtil;
import opennlp.tools.util.model.ArtifactSerializer;
/**
* Creates a set of feature generators based on a provided XML descriptor.
*
* Example of an XML descriptor:
* <pre>{@code
* <featureGenerators cache="true" name="namefind">
*   <generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
*     <int name="prevLength">2</int>
*     <int name="nextLength">2</int>
*     <generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/>
*   </generator>
*   <generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory">
*     <int name="prevLength">2</int>
*     <int name="nextLength">2</int>
*     <generator class="opennlp.tools.util.featuregen.TokenFeatureGeneratorFactory"/>
*   </generator>
*   <generator class="opennlp.tools.util.featuregen.DefinitionFeatureGeneratorFactory"/>
*   <generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/>
*   <generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/>
*   <generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory">
*     <bool name="begin">true</bool>
*     <bool name="end">false</bool>
*   </generator>
* </featureGenerators>
* }</pre>
* Caching is requested with {@code cache="true"} on the {@code featureGenerators}
* element; a descriptor whose top-level generator is itself a
* {@link CachedFeatureGenerator} is rejected.
*
* Each {@code generator} element is mapped, via its {@code class} attribute, to a
* {@link GeneratorFactory.AbstractXmlFeatureGeneratorFactory} which
* is responsible for processing the element and creating the specified
* {@link AdaptiveFeatureGenerator}. Elements can contain other
* elements; in this case it is the responsibility of the mapped factory to process
* the child elements correctly.
*
* When a {@code featureGenerators} or {@code generator} element contains
* several {@code generator} children, each child element is given to its
* corresponding factory using the same mechanism. All
* created generators are then added to a new instance of the
* {@link AggregatedFeatureGenerator}.
*/
public class GeneratorFactory {
/**
* Creates an {@link AdaptiveFeatureGenerator} from an provided XML descriptor.
*
* Usually this XML descriptor contains a set of nested feature generators
* which are then used to generate the features by one of the opennlp
* components.
*
* @param xmlDescriptorIn the {@link InputStream} from which the descriptor
* is read, the stream remains open and must be closed by the caller.
* @param resourceManager the resource manager which is used to resolve resources
* referenced by a key in the descriptor
* @return created feature generators
* @throws IOException if an error occurs during reading from the descriptor
* {@link InputStream}
*/
public static AdaptiveFeatureGenerator create(InputStream xmlDescriptorIn,
FeatureGeneratorResourceProvider resourceManager)
throws IOException {
final org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
final Element generatorElement = xmlDescriptorDOM.getDocumentElement();
return createGenerator(generatorElement, resourceManager);
}
/**
* Creates an {@link AdaptiveFeatureGenerator} for the provided element.
* To accomplish this it looks up the corresponding factory by the
* element tag name. The factory is then responsible for the creation
* of the generator from the element.
*
* @param generatorElement must not be {@code null}
* @param resourceManager may be {@code null}
* @return an {@link AdaptiveFeatureGenerator}
* @throws IllegalArgumentException if the given {@link Element generatorElement} is {@code null}.
* @throws IllegalStateException if the given {@link Element generatorElement} has
* no {@code class} attribute.
*/
private static AdaptiveFeatureGenerator buildGenerator(Element generatorElement,
FeatureGeneratorResourceProvider resourceManager)
throws InvalidFormatException {
if (generatorElement == null) {
throw new IllegalArgumentException("generatorElement must not be NULL");
}
final String className = generatorElement.getAttribute("class");
if (className.isBlank()) {
throw new InvalidFormatException("generator must have class attribute");
} else {
try {
final Class> factoryClass = Class.forName(className);
try {
final Constructor> constructor = factoryClass.getConstructor();
final AbstractXmlFeatureGeneratorFactory factory =
(AbstractXmlFeatureGeneratorFactory) constructor.newInstance();
factory.init(generatorElement, resourceManager);
return factory.create();
} catch (NoSuchMethodException | InvocationTargetException | InstantiationException |
InvalidFormatException | IllegalAccessException e) {
throw new RuntimeException(e);
}
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
}
/**
* Creates an {@link AdaptiveFeatureGenerator} for the provided element.
* To accomplish this it looks up the corresponding factory by the
* element tag name. The factory is then responsible for the creation
* of the generator from the element.
*
* @param generatorElement must not be {@code null}
* @param resourceManager may be {@code null}
* @return an {@link AdaptiveFeatureGenerator}
* @throws IllegalArgumentException if the given {@link Element generatorElement} is {@code null}
*/
private static AdaptiveFeatureGenerator createGenerator(Element generatorElement,
FeatureGeneratorResourceProvider resourceManager)
throws InvalidFormatException {
if (generatorElement == null) {
throw new IllegalArgumentException("generatorElement must not be NULL");
}
final String elementName = generatorElement.getTagName();
// check it is new format?
if ("featureGenerators".equals(elementName)) {
final List generators = new ArrayList<>();
final NodeList childNodes = generatorElement.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
final Node childNode = childNodes.item(i);
if (childNode instanceof Element elem) {
final String type = elem.getTagName();
if ("generator".equals(type)) {
generators.add(buildGenerator(elem, resourceManager));
} else {
throw new InvalidFormatException("Unexpected element: " + elementName);
}
}
}
AdaptiveFeatureGenerator featureGenerator;
if (generators.size() == 1) {
featureGenerator = generators.get(0);
} else if (generators.size() > 1) {
featureGenerator = new AggregatedFeatureGenerator(generators.toArray(
new AdaptiveFeatureGenerator[0]));
} else {
throw new InvalidFormatException("featureGenerators must have one or more generators");
}
// disallow manually specifying CachedFeatureGenerator
if (featureGenerator instanceof CachedFeatureGenerator) {
throw new InvalidFormatException("CachedFeatureGeneratorFactory cannot be specified manually." +
"Use cache=\"true\" attribute in featureGenerators element instead.");
}
// check cache usage
if (Boolean.parseBoolean(generatorElement.getAttribute("cache"))) {
return new CachedFeatureGenerator(featureGenerator);
} else {
return featureGenerator;
}
} else {
throw new IllegalArgumentException(
"[OPENNLP-1174] - Classic configuration format is no longer supported!");
}
}
private static org.w3c.dom.Document createDOM(InputStream xmlDescriptorIn)
throws IOException {
final DocumentBuilder documentBuilder = XmlUtil.createDocumentBuilder();
org.w3c.dom.Document xmlDescriptorDOM;
try {
xmlDescriptorDOM = documentBuilder.parse(xmlDescriptorIn);
} catch (SAXException e) {
throw new InvalidFormatException("Descriptor is not valid XML!", e);
}
return xmlDescriptorDOM;
}
public static Map> extractArtifactSerializerMappings(
InputStream xmlDescriptorIn) throws IOException {
final org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
final Element element = xmlDescriptorDOM.getDocumentElement();
final String elementName = element.getTagName();
// check it is new format?
if ("featureGenerators".equals(elementName)) {
return addMappingsFromXmlChildren(element.getChildNodes(), new HashMap<>());
} else {
throw new IllegalArgumentException(
"[OPENNLP-1174] - Classic configuration format is no longer supported!");
}
}
private static void extractArtifactSerializerMappings(
Map> mapping, Element element) {
final String className = element.getAttribute("class");
if (!className.isBlank()) {
try {
final Class> factoryClass = Class.forName(className);
try {
final Constructor> constructor = factoryClass.getConstructor();
final AbstractXmlFeatureGeneratorFactory factory =
(AbstractXmlFeatureGeneratorFactory) constructor.newInstance();
factory.init(element, null);
final Map> map = factory.getArtifactSerializerMapping();
if (map != null) {
mapping.putAll(map);
}
} catch (NoSuchMethodException | InvocationTargetException | InstantiationException
| IllegalAccessException e) {
throw new RuntimeException(e);
} catch (InvalidFormatException ignored) {
}
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
addMappingsFromXmlChildren(element.getChildNodes(), mapping);
}
private static Map> addMappingsFromXmlChildren(
final NodeList nodes, final Map> mapping) {
for (int i = 0; i < nodes.getLength(); i++) {
if (nodes.item(i) instanceof Element childElem) {
if ("generator".equals(childElem.getTagName())) {
extractArtifactSerializerMappings(mapping, childElem);
}
}
}
return mapping;
}
/**
* Provides a list with all the elements in the xml feature descriptor.
*
* @param xmlDescriptorIn the xml feature descriptor
* @return a list containing all elements
* @throws IOException if the given {@link InputStream} cannot be open
* @throws InvalidFormatException if xml is not well-formed
*/
public static List getDescriptorElements(InputStream xmlDescriptorIn)
throws IOException {
final List elements = new ArrayList<>();
final org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
final XPath xPath = XPathFactory.newInstance().newXPath();
NodeList allElements;
try {
final XPathExpression exp = xPath.compile("//*");
allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
} catch (XPathExpressionException e) {
throw new InvalidFormatException("The hard coded XPath expression should always be valid!");
}
for (int i = 0; i < allElements.getLength(); i++) {
if (allElements.item(i) instanceof Element customElement) {
elements.add(customElement);
}
}
return elements;
}
public static abstract class AbstractXmlFeatureGeneratorFactory {
protected Element generatorElement;
protected FeatureGeneratorResourceProvider resourceManager;
// to respect the order in AggregatedFeatureGenerator, let's use LinkedHashMap
protected LinkedHashMap args;
public AbstractXmlFeatureGeneratorFactory() {
args = new LinkedHashMap<>();
}
public Map> getArtifactSerializerMapping() throws InvalidFormatException {
return null;
}
final void init(Element element, FeatureGeneratorResourceProvider resourceManager)
throws InvalidFormatException {
this.generatorElement = element;
this.resourceManager = resourceManager;
List generators = new ArrayList<>();
NodeList childNodes = generatorElement.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
Node childNode = childNodes.item(i);
if (childNode instanceof Element elem) {
String type = elem.getTagName();
if (type.equals("generator")) {
String key = "generator#" + generators.size();
AdaptiveFeatureGenerator afg = buildGenerator(elem, resourceManager);
if (afg != null) {
generators.add(afg);
args.put(key, afg);
}
} else {
String name = elem.getAttribute("name");
Node cn = elem.getFirstChild();
Text text = (Text) cn;
switch (type) {
case "int" -> args.put(name, Integer.parseInt(text.getWholeText()));
case "long" -> args.put(name, Long.parseLong(text.getWholeText()));
case "float" -> args.put(name, Float.parseFloat(text.getWholeText()));
case "double" -> args.put(name, Double.parseDouble(text.getWholeText()));
case "str" -> args.put(name, text.getWholeText());
case "bool" -> args.put(name, Boolean.parseBoolean(text.getWholeText()));
default -> throw new InvalidFormatException(
"child element must be one of generator, int, long, float, double," +
" str or bool");
}
}
}
}
if (generators.size() > 1) {
AdaptiveFeatureGenerator aggregatedFeatureGenerator =
new AggregatedFeatureGenerator(generators.toArray(
new AdaptiveFeatureGenerator[0]));
args.put("generator#0", aggregatedFeatureGenerator);
}
}
public int getInt(String name) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
} else if (value instanceof Integer) {
return (Integer) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be integer!");
}
}
public int getInt(String name, int defValue) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
return defValue;
} else if (value instanceof Integer) {
return (Integer) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be integer!");
}
}
public long getLong(String name) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
} else if (value instanceof Long) {
return (Long) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be long!");
}
}
public long getLong(String name, long defValue) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
return defValue;
} else if (value instanceof Long) {
return (Long) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be long!");
}
}
public float getFloat(String name) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
} else if (value instanceof Float) {
return (Float) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be float!");
}
}
public float getFloat(String name, float defValue) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
return defValue;
} else if (value instanceof Float) {
return (Float) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be float!");
}
}
public double getDouble(String name) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
} else if (value instanceof Double) {
return (Double) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be double!");
}
}
public double getDouble(String name, double defValue) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
return defValue;
} else if (value instanceof Double) {
return (Double) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be double!");
}
}
public String getStr(String name) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
} else if (value instanceof String) {
return (String) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be double!");
}
}
public String getStr(String name, String defValue) throws InvalidFormatException {
final Object value = args.get(name);
if (value == null) {
return defValue;
} else if (value instanceof String) {
return (String) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be String!");
}
}
public boolean getBool(String name) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
throw new InvalidFormatException("parameter " + name + " must be set!");
} else if (value instanceof Boolean) {
return (Boolean) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be boolean!");
}
}
public boolean getBool(String name, boolean defValue) throws InvalidFormatException {
Object value = args.get(name);
if (value == null) {
return defValue;
} else if (value instanceof Boolean) {
return (Boolean) value;
} else {
throw new InvalidFormatException("parameter " + name + " must be boolean!");
}
}
/**
* @return {@code null} if the subclass uses {@link #resourceManager} to instantiate
* @throws InvalidFormatException
*/
public abstract AdaptiveFeatureGenerator create() throws InvalidFormatException;
}
}