All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.util.featuregen.GeneratorFactory Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.util.featuregen;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.SAXException;

import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.XmlUtil;
import opennlp.tools.util.ext.ExtensionLoader;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.DictionarySerializer;
import opennlp.tools.util.model.POSModelSerializer;

/**
 * Creates a set of feature generators based on a provided XML descriptor.
 *
 * Example of an XML descriptor:
 * 

* <featureGenerators name="namefind"> * <generator class="opennlp.tools.util.featuregen.CachedFeatureGeneratorFactory"> * <generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory"> * <int name="prevLength">2</int> * <int name="nextLength">2</int> * <generator class="opennlp.tools.util.featuregen.TokenClassFeatureGeneratorFactory"/> * </generator> * <generator class="opennlp.tools.util.featuregen.WindowFeatureGeneratorFactory"> * <int name="prevLength">2</int> * <int name="nextLength">2</int> * <generator class="opennlp.tools.util.featuregen.TokenFeatureGeneratorFactory"/> * </generator> * <generator class="opennlp.tools.util.featuregen.DefinitionFeatureGeneratorFactory"/> * <generator class="opennlp.tools.util.featuregen.PreviousMapFeatureGeneratorFactory"/> * <generator class="opennlp.tools.util.featuregen.BigramNameFeatureGeneratorFactory"/> * <generator class="opennlp.tools.util.featuregen.SentenceFeatureGeneratorFactory"> * <bool name="begin">true</bool> * <bool name="end">false</bool> * </generator> * </generator> * </featureGenerators> *

* * Each XML element is mapped to a {@link GeneratorFactory.XmlFeatureGeneratorFactory} which * is responsible to process the element and create the specified * {@link AdaptiveFeatureGenerator}. Elements can contain other * elements in this case it is the responsibility of the mapped factory to process * the child elements correctly. In some factories this leads to recursive * calls the * {@link GeneratorFactory.XmlFeatureGeneratorFactory#create(Element, FeatureGeneratorResourceProvider)} * method. * * In the example above the generators element is mapped to the * {@link AggregatedFeatureGeneratorFactory} which then * creates all the aggregated {@link AdaptiveFeatureGenerator}s to * accomplish this it evaluates the mapping with the same mechanism * and gives the child element to the corresponding factories. All * created generators are added to a new instance of the * {@link AggregatedFeatureGenerator} which is then returned. */ public class GeneratorFactory { /** * The {@link XmlFeatureGeneratorFactory} is responsible to construct * an {@link AdaptiveFeatureGenerator} from an given XML {@link Element} * which contains all necessary configuration if any. */ @Deprecated // TODO: (OPENNLP-1174) just remove when back-compat is no longer needed interface XmlFeatureGeneratorFactory { /** * Creates an {@link AdaptiveFeatureGenerator} from a the describing * XML element. * * @param generatorElement the element which contains the configuration * @param resourceManager the resource manager which could be used * to access referenced resources * * @return the configured {@link AdaptiveFeatureGenerator} */ AdaptiveFeatureGenerator create(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException; } public static abstract class AbstractXmlFeatureGeneratorFactory { protected Element generatorElement; protected FeatureGeneratorResourceProvider resourceManager; // to respect the order in AggregatedFeatureGenerator, let's use LinkedHashMap protected LinkedHashMap args; public AbstractXmlFeatureGeneratorFactory() { args = new LinkedHashMap<>(); } public Map> getArtifactSerializerMapping() throws InvalidFormatException { return null; } final void init(Element element, FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException { this.generatorElement = element; this.resourceManager = resourceManager; List generators = new ArrayList<>(); NodeList childNodes = generatorElement.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node childNode = childNodes.item(i); if (childNode instanceof Element) { Element elem = (Element)childNode; String type = elem.getTagName(); if (type.equals("generator")) { String key = "generator#" + Integer.toString(generators.size()); AdaptiveFeatureGenerator afg = buildGenerator(elem, resourceManager); generators.add(afg); if (afg != null) args.put(key, afg); } else { String name = elem.getAttribute("name"); Node cn = elem.getFirstChild(); Text text = (Text)cn; switch (type) { case "int" : args.put(name, Integer.parseInt(text.getWholeText())); break; case "long" : args.put(name, Long.parseLong(text.getWholeText())); break; case "float" : args.put(name, Float.parseFloat(text.getWholeText())); break; case "double" : args.put(name, Double.parseDouble(text.getWholeText())); break; case "str" : args.put(name, text.getWholeText()); break; case "bool" : args.put(name, Boolean.parseBoolean(text.getWholeText())); break; default: throw new InvalidFormatException( "child element must be one of generator, int, long, float, double," + " str or bool"); } } } } if (generators.size() > 1) { AdaptiveFeatureGenerator aggregatedFeatureGenerator = new AggregatedFeatureGenerator(generators.toArray( new AdaptiveFeatureGenerator[generators.size()])); args.put("generator#0", aggregatedFeatureGenerator); } } public int getInt(String name) throws InvalidFormatException { Object value = args.get(name); if (value == null) { throw new InvalidFormatException("parameter " + name + " must be set!"); } else if (value instanceof Integer) { return (Integer)value; } else { throw new InvalidFormatException("parameter " + name + " must be integer!"); } } public int getInt(String name, int defValue) throws InvalidFormatException { Object value = args.get(name); if (value == null) { return defValue; } else if (value instanceof Integer) { return (Integer)value; } else { throw new InvalidFormatException("parameter " + name + " must be integer!"); } } public long getLong(String name) throws InvalidFormatException { Object value = args.get(name); if (value == null) { throw new InvalidFormatException("parameter " + name + " must be set!"); } else if (value instanceof Long) { return (Long)value; } else { throw new InvalidFormatException("parameter " + name + " must be long!"); } } public long getLong(String name, long defValue) throws InvalidFormatException { Object value = args.get(name); if (value == null) { return defValue; } else if (value instanceof Long) { return (Long)value; } else { throw new InvalidFormatException("parameter " + name + " must be long!"); } } public float getFloat(String name) throws InvalidFormatException { Object value = args.get(name); if (value == null) { throw new InvalidFormatException("parameter " + name + " must be set!"); } else if (value instanceof Float) { return (Float)value; } else { throw new InvalidFormatException("parameter " + name + " must be float!"); } } public float getFloat(String name, float defValue) throws InvalidFormatException { Object value = args.get(name); if (value == null) { return defValue; } else if (value instanceof Float) { return (Float)value; } else { throw new InvalidFormatException("parameter " + name + " must be float!"); } } public double getDouble(String name) throws InvalidFormatException { Object value = args.get(name); if (value == null) { throw new InvalidFormatException("parameter " + name + " must be set!"); } else if (value instanceof Double) { return (Double)value; } else { throw new InvalidFormatException("parameter " + name + " must be double!"); } } public double getDouble(String name, double defValue) throws InvalidFormatException { Object value = args.get(name); if (value == null) { return defValue; } else if (value instanceof Double) { return (Double)value; } else { throw new InvalidFormatException("parameter " + name + " must be double!"); } } public String getStr(String name) throws InvalidFormatException { Object value = args.get(name); if (value == null) { throw new InvalidFormatException("parameter " + name + " must be set!"); } else if (value instanceof String) { return (String)value; } else { throw new InvalidFormatException("parameter " + name + " must be double!"); } } public String getStr(String name, String defValue) throws InvalidFormatException { Object value = args.get(name); if (value == null) { return defValue; } else if (value instanceof String) { return (String)value; } else { throw new InvalidFormatException("parameter " + name + " must be String!"); } } public boolean getBool(String name) throws InvalidFormatException { Object value = args.get(name); if (value == null) { throw new InvalidFormatException("parameter " + name + " must be set!"); } else if (value instanceof Boolean) { return (Boolean)value; } else { throw new InvalidFormatException("parameter " + name + " must be boolean!"); } } public boolean getBool(String name, boolean defValue) throws InvalidFormatException { Object value = args.get(name); if (value == null) { return defValue; } else if (value instanceof Boolean) { return (Boolean)value; } else { throw new InvalidFormatException("parameter " + name + " must be boolean!"); } } /** * * @return null if the subclass uses {@link #resourceManager} to instantiate * @throws InvalidFormatException */ public abstract AdaptiveFeatureGenerator create() throws InvalidFormatException; } // TODO: We have to support custom resources here. How does it work ?! // Attributes get into a Map properties // How can serialization be supported ?! // The model is loaded, and the manifest should contain all serializer classes registered for the // resources by name. // When training, the descriptor could be consulted first to register the serializers, and afterwards // they are stored in the model. // TODO: (OPENNLP-1174) just remove this class when back-compat is no longer needed static class CustomFeatureGeneratorFactory implements XmlFeatureGeneratorFactory { public AdaptiveFeatureGenerator create(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException { String featureGeneratorClassName = generatorElement.getAttribute("class"); AdaptiveFeatureGenerator generator = ExtensionLoader.instantiateExtension(AdaptiveFeatureGenerator.class, featureGeneratorClassName); if (generator instanceof CustomFeatureGenerator) { CustomFeatureGenerator customGenerator = (CustomFeatureGenerator) generator; Map properties = new HashMap<>(); NamedNodeMap attributes = generatorElement.getAttributes(); for (int i = 0; i < attributes.getLength(); i++) { Node attribute = attributes.item(i); if (!"class".equals(attribute.getNodeName())) { properties.put(attribute.getNodeName(), attribute.getNodeValue()); } } if (resourceManager != null) { customGenerator.init(properties, resourceManager); } } return generator; } static void register(Map factoryMap) { factoryMap.put("custom", new CustomFeatureGeneratorFactory()); } } // TODO: (OPENNLP-1174) just remove when back-compat is no longer needed private static Map factories = new HashMap<>(); // TODO: (OPENNLP-1174) just remove when back-compat is no longer needed static { AggregatedFeatureGeneratorFactory.register(factories); CachedFeatureGeneratorFactory.register(factories); CharacterNgramFeatureGeneratorFactory.register(factories); DefinitionFeatureGeneratorFactory.register(factories); DictionaryFeatureGeneratorFactory.register(factories); DocumentBeginFeatureGeneratorFactory.register(factories); PreviousMapFeatureGeneratorFactory.register(factories); SentenceFeatureGeneratorFactory.register(factories); TokenClassFeatureGeneratorFactory.register(factories); TokenFeatureGeneratorFactory.register(factories); BigramNameFeatureGeneratorFactory.register(factories); TokenPatternFeatureGeneratorFactory.register(factories); PosTaggerFeatureGeneratorFactory.register(factories); PrefixFeatureGeneratorFactory.register(factories); SuffixFeatureGeneratorFactory.register(factories); WindowFeatureGeneratorFactory.register(factories); WordClusterFeatureGeneratorFactory.register(factories); BrownClusterTokenFeatureGeneratorFactory.register(factories); BrownClusterTokenClassFeatureGeneratorFactory.register(factories); BrownClusterBigramFeatureGeneratorFactory.register(factories); CustomFeatureGeneratorFactory.register(factories); POSTaggerNameFeatureGeneratorFactory.register(factories); } /** * Creates a {@link AdaptiveFeatureGenerator} for the provided element. * To accomplish this it looks up the corresponding factory by the * element tag name. The factory is then responsible for the creation * of the generator from the element. * * @param generatorElement * @param resourceManager * * @return */ @Deprecated // TODO: (OPENNLP-1174) remove back-compat support when it is unnecessary static AdaptiveFeatureGenerator createGenerator(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException { String elementName = generatorElement.getTagName(); // check it is new format? if (elementName.equals("featureGenerators")) { List generators = new ArrayList<>(); NodeList childNodes = generatorElement.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node childNode = childNodes.item(i); if (childNode instanceof Element) { Element elem = (Element)childNode; String type = elem.getTagName(); if (type.equals("generator")) { generators.add(buildGenerator(elem, resourceManager)); } else throw new InvalidFormatException("Unexpected element: " + elementName); } } AdaptiveFeatureGenerator featureGenerator = null; if (generators.size() == 1) featureGenerator = generators.get(0); else if (generators.size() > 1) featureGenerator = new AggregatedFeatureGenerator(generators.toArray( new AdaptiveFeatureGenerator[generators.size()])); else throw new InvalidFormatException("featureGenerators must have one or more generators"); // disallow manually specifying CachedFeatureGenerator if (featureGenerator instanceof CachedFeatureGenerator) throw new InvalidFormatException("CachedFeatureGeneratorFactory cannot be specified manually." + "Use cache=\"true\" attribute in featureGenerators element instead."); // check cache usage if (Boolean.parseBoolean(generatorElement.getAttribute("cache"))) return new CachedFeatureGenerator(featureGenerator); else return featureGenerator; } else { // support classic format XmlFeatureGeneratorFactory generatorFactory = factories.get(elementName); if (generatorFactory != null) { return generatorFactory.create(generatorElement, resourceManager); } else throw new InvalidFormatException("Unexpected element: " + elementName); } } static Element getFirstChild(Element elem) { NodeList nodes = elem.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { if (nodes.item(i) instanceof Element) { return (Element)nodes.item(i); } } return null; } /** * Creates a {@link AdaptiveFeatureGenerator} for the provided element. * To accomplish this it looks up the corresponding factory by the * element tag name. The factory is then responsible for the creation * of the generator from the element. * * @param generatorElement * @param resourceManager * * @return */ static AdaptiveFeatureGenerator buildGenerator(Element generatorElement, FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException { String className = generatorElement.getAttribute("class"); if (className == null) { throw new InvalidFormatException("generator must have class attribute"); } else { try { Class factoryClass = Class.forName(className); try { Constructor constructor = factoryClass.getConstructor(); AbstractXmlFeatureGeneratorFactory factory = (AbstractXmlFeatureGeneratorFactory)constructor.newInstance(); factory.init(generatorElement, resourceManager); return factory.create(); } catch (NoSuchMethodException e) { throw new RuntimeException(e); } catch (InvocationTargetException e) { throw new RuntimeException(e); } catch (InstantiationException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } } catch (ClassNotFoundException e) { throw new RuntimeException(e); } } } private static org.w3c.dom.Document createDOM(InputStream xmlDescriptorIn) throws IOException { DocumentBuilder documentBuilder = XmlUtil.createDocumentBuilder(); org.w3c.dom.Document xmlDescriptorDOM; try { xmlDescriptorDOM = documentBuilder.parse(xmlDescriptorIn); } catch (SAXException e) { throw new InvalidFormatException("Descriptor is not valid XML!", e); } return xmlDescriptorDOM; } /** * Creates an {@link AdaptiveFeatureGenerator} from an provided XML descriptor. * * Usually this XML descriptor contains a set of nested feature generators * which are then used to generate the features by one of the opennlp * components. * * @param xmlDescriptorIn the {@link InputStream} from which the descriptor * is read, the stream remains open and must be closed by the caller. * * @param resourceManager the resource manager which is used to resolve resources * referenced by a key in the descriptor * * @return created feature generators * * @throws IOException if an error occurs during reading from the descriptor * {@link InputStream} */ public static AdaptiveFeatureGenerator create(InputStream xmlDescriptorIn, FeatureGeneratorResourceProvider resourceManager) throws IOException { org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn); Element generatorElement = xmlDescriptorDOM.getDocumentElement(); // TODO: (OPENNLP-1174) use #buildGenerator() after back-compat support is gone return createGenerator(generatorElement, resourceManager); } public static Map> extractArtifactSerializerMappings( InputStream xmlDescriptorIn) throws IOException { org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn); Element element = xmlDescriptorDOM.getDocumentElement(); String elementName = element.getTagName(); // check it is new format? if (elementName.equals("featureGenerators")) { Map> mapping = new HashMap<>(); NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { if (nodes.item(i) instanceof Element) { Element childElem = (Element)nodes.item(i); if (childElem.getTagName().equals("generator")) { extractArtifactSerializerMappings(mapping, childElem); } } } return mapping; } else { return extractArtifactSerializerMappingsClassicFormat(element); } } static void extractArtifactSerializerMappings(Map> mapping, Element element) { String className = element.getAttribute("class"); if (className != null) { try { Class factoryClass = Class.forName(className); try { Constructor constructor = factoryClass.getConstructor(); AbstractXmlFeatureGeneratorFactory factory = (AbstractXmlFeatureGeneratorFactory)constructor.newInstance(); factory.init(element, null); Map> map = factory.getArtifactSerializerMapping(); if (map != null) mapping.putAll(map); } catch (NoSuchMethodException e) { throw new RuntimeException(e); } catch (InvocationTargetException e) { throw new RuntimeException(e); } catch (InstantiationException e) { throw new RuntimeException(e); } catch (IllegalAccessException e) { throw new RuntimeException(e); } catch (InvalidFormatException ignored) { } } catch (ClassNotFoundException e) { throw new RuntimeException(e); } } NodeList nodes = element.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { if (nodes.item(i) instanceof Element) { Element childElem = (Element)nodes.item(i); if (childElem.getTagName().equals("generator")) { extractArtifactSerializerMappings(mapping, childElem); } } } } @Deprecated // TODO: (OPENNLP-1174) remove back-compat support when it is unnecessary static Map> extractArtifactSerializerMappingsClassicFormat( Element elem) throws IOException { Map> mapping = new HashMap<>(); XPath xPath = XPathFactory.newInstance().newXPath(); NodeList customElements; try { XPathExpression exp = xPath.compile("//custom"); customElements = (NodeList) exp.evaluate(elem, XPathConstants.NODESET); } catch (XPathExpressionException e) { throw new IllegalStateException("The hard coded XPath expression should always be valid!"); } for (int i = 0; i < customElements.getLength(); i++) { if (customElements.item(i) instanceof Element) { Element customElement = (Element) customElements.item(i); // Note: The resource provider is not available at that point, to provide // resources they need to be loaded first! AdaptiveFeatureGenerator generator = createGenerator(customElement, null); if (generator instanceof ArtifactToSerializerMapper) { ArtifactToSerializerMapper mapper = (ArtifactToSerializerMapper) generator; mapping.putAll(mapper.getArtifactSerializerMapping()); } } } NodeList allElements; try { XPathExpression exp = xPath.compile("//*"); allElements = (NodeList) exp.evaluate(elem, XPathConstants.NODESET); } catch (XPathExpressionException e) { throw new IllegalStateException("The hard coded XPath expression should always be valid!"); } for (int i = 0; i < allElements.getLength(); i++) { if (allElements.item(i) instanceof Element) { Element xmlElement = (Element) allElements.item(i); String dictName = xmlElement.getAttribute("dict"); if (dictName != null) { switch (xmlElement.getTagName()) { case "wordcluster": mapping.put(dictName, new WordClusterDictionary.WordClusterDictionarySerializer()); break; case "brownclustertoken": mapping.put(dictName, new BrownCluster.BrownClusterSerializer()); break; case "brownclustertokenclass"://, ; mapping.put(dictName, new BrownCluster.BrownClusterSerializer()); break; case "brownclusterbigram": //, ; mapping.put(dictName, new BrownCluster.BrownClusterSerializer()); break; case "dictionary": mapping.put(dictName, new DictionarySerializer()); break; } } String modelName = xmlElement.getAttribute("model"); if (modelName != null) { switch (xmlElement.getTagName()) { case "tokenpos": mapping.put(modelName, new POSModelSerializer()); break; } } } } return mapping; } /** * Provides a list with all the elements in the xml feature descriptor. * @param xmlDescriptorIn the xml feature descriptor * @return a list containing all elements * @throws IOException if inputstream cannot be open * @throws InvalidFormatException if xml is not well-formed */ public static List getDescriptorElements(InputStream xmlDescriptorIn) throws IOException { List elements = new ArrayList<>(); org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn); XPath xPath = XPathFactory.newInstance().newXPath(); NodeList allElements; try { XPathExpression exp = xPath.compile("//*"); allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET); } catch (XPathExpressionException e) { throw new IllegalStateException("The hard coded XPath expression should always be valid!"); } for (int i = 0; i < allElements.getLength(); i++) { if (allElements.item(i) instanceof Element) { Element customElement = (Element) allElements.item(i); elements.add(customElement); } } return elements; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy