All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.doccat.DoccatFactory Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.doccat;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;

/**
 * The factory that provides Doccat default implementations and resources.
 * <p>
 * A factory holds a {@link Tokenizer} and an array of {@link FeatureGenerator}s.
 * Values that were not set programmatically are resolved lazily by the getters:
 * first from the manifest of the artifact provider (when one is available) and
 * otherwise from built-in defaults ({@link WhitespaceTokenizer#INSTANCE} and a
 * single {@link BagOfWordsFeatureGenerator}).
 */
public class DoccatFactory extends BaseToolFactory {

  /** Manifest key holding the comma separated feature generator class names. */
  private static final String FEATURE_GENERATORS = "doccat.featureGenerators";

  /** Manifest key holding the tokenizer class name. */
  private static final String TOKENIZER_NAME = "doccat.tokenizer";

  private FeatureGenerator[] featureGenerators;
  private Tokenizer tokenizer;

  /**
   * Creates a {@link DoccatFactory} that provides the default implementation of
   * the resources.
   */
  public DoccatFactory() {
  }

  /**
   * Creates a {@link DoccatFactory}. Use this constructor to programmatically
   * create a factory.
   *
   * @param tokenizer         the tokenizer
   * @param featureGenerators the feature generators
   */
  public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) {
    this.init(tokenizer, featureGenerators);
  }

  /**
   * Stores the given tokenizer and feature generators in this factory. Either
   * argument may be {@code null}, in which case the matching getter resolves
   * the value lazily.
   *
   * @param tokenizer         the tokenizer
   * @param featureGenerators the feature generators
   */
  protected void init(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) {
    this.featureGenerators = featureGenerators;
    this.tokenizer = tokenizer;
  }

  /**
   * Extends the manifest entries of the super class with the class name of the
   * tokenizer and the comma separated class names of the feature generators.
   *
   * @return the manifest entries, including the Doccat specific ones
   */
  @Override
  public Map<String, String> createManifestEntries() {
    Map<String, String> manifestEntries = super.createManifestEntries();

    Tokenizer currentTokenizer = getTokenizer();
    if (currentTokenizer != null) {
      // Store the binary name (Class#getName) rather than the canonical name:
      // the canonical name is null for anonymous/local classes and uses '.'
      // for nested classes, which cannot be resolved when the name is later
      // used to look the class up again.
      manifestEntries.put(TOKENIZER_NAME, currentTokenizer.getClass().getName());
    }

    if (getFeatureGenerators() != null) {
      manifestEntries.put(FEATURE_GENERATORS, featureGeneratorsAsString());
    }

    return manifestEntries;
  }

  /**
   * Joins the binary class names of the current feature generators with
   * {@code ','}; this is the format read back by
   * {@link #loadFeatureGenerators(String)}.
   *
   * @return the comma separated class names; empty if there are no generators
   */
  private String featureGeneratorsAsString() {
    List<FeatureGenerator> fgs = Arrays.asList(getFeatureGenerators());
    Iterator<FeatureGenerator> iter = fgs.iterator();
    StringBuilder sb = new StringBuilder();
    while (iter.hasNext()) {
      // Binary name for the same reason as the tokenizer entry.
      sb.append(iter.next().getClass().getName());
      if (iter.hasNext()) {
        sb.append(',');
      }
    }
    return sb.toString();
  }

  @Override
  public void validateArtifactMap() throws InvalidFormatException {
    // nothing to validate
  }

  /**
   * Creates a factory and initializes it with the given tokenizer and feature
   * generators.
   *
   * @param subclassName      the name of the factory class to instantiate through
   *                          the {@link ExtensionLoader}, or {@code null} to
   *                          create a plain {@link DoccatFactory}
   * @param tokenizer         the tokenizer
   * @param featureGenerators the feature generators
   * @return the initialized factory
   * @throws InvalidFormatException if instantiating or initializing the named
   *                                factory throws; the original exception is
   *                                attached as the cause
   */
  public static DoccatFactory create(String subclassName, Tokenizer tokenizer,
                                     FeatureGenerator[] featureGenerators) throws InvalidFormatException {
    if (subclassName == null) {
      // will create the default factory
      return new DoccatFactory(tokenizer, featureGenerators);
    }
    try {
      DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
          DoccatFactory.class, subclassName);
      theFactory.init(tokenizer, featureGenerators);
      return theFactory;
    } catch (Exception e) {
      // The failure is reported solely through the thrown exception (with the
      // cause preserved); the caller decides whether and where to log it.
      String msg = "Could not instantiate the " + subclassName
          + ". The initialization throw an exception.";
      throw new InvalidFormatException(msg, e);
    }
  }

  /**
   * Instantiates the feature generators named in a comma separated list.
   * Whitespace around a name is ignored and blank entries are skipped, so an
   * empty list yields an empty array instead of an attempt to load a class
   * with an empty name.
   *
   * @param classNames the comma separated class names
   * @return the instantiated feature generators, in the listed order
   */
  private FeatureGenerator[] loadFeatureGenerators(String classNames) {
    String[] classes = classNames.split(",");
    FeatureGenerator[] fgs = new FeatureGenerator[classes.length];

    int count = 0;
    for (String rawName : classes) {
      String className = rawName.trim();
      if (className.isEmpty()) {
        continue;
      }
      fgs[count++] = ExtensionLoader.instantiateExtension(FeatureGenerator.class,
          className);
    }
    // Shrink only when blank entries were skipped.
    return count == fgs.length ? fgs : Arrays.copyOf(fgs, count);
  }

  /**
   * Returns the feature generators. If none were set, they are loaded from the
   * {@code doccat.featureGenerators} manifest property of the artifact provider
   * when available; otherwise a single {@link BagOfWordsFeatureGenerator} is
   * used. The resolved array is cached in this factory.
   *
   * @return the feature generators
   */
  public FeatureGenerator[] getFeatureGenerators() {
    if (featureGenerators == null) {
      if (artifactProvider != null) {
        String classNames = artifactProvider
            .getManifestProperty(FEATURE_GENERATORS);
        if (classNames != null) {
          this.featureGenerators = loadFeatureGenerators(classNames);
        }
      }
      if (featureGenerators == null) { // could not load using artifact provider
        // load bag of words as default
        this.featureGenerators = new FeatureGenerator[]{new BagOfWordsFeatureGenerator()};
      }
    }
    return featureGenerators;
  }

  /**
   * Sets the feature generators. Passing {@code null} makes the next call to
   * {@link #getFeatureGenerators()} resolve them again.
   *
   * @param featureGenerators the feature generators
   */
  public void setFeatureGenerators(FeatureGenerator[] featureGenerators) {
    this.featureGenerators = featureGenerators;
  }

  /**
   * Returns the tokenizer. If none was set, it is instantiated from the
   * {@code doccat.tokenizer} manifest property of the artifact provider when
   * available; otherwise {@link WhitespaceTokenizer#INSTANCE} is used. The
   * resolved tokenizer is cached in this factory.
   *
   * @return the tokenizer
   */
  public Tokenizer getTokenizer() {
    if (this.tokenizer == null) {
      if (artifactProvider != null) {
        String className = artifactProvider.getManifestProperty(TOKENIZER_NAME);
        if (className != null) {
          this.tokenizer = ExtensionLoader.instantiateExtension(
              Tokenizer.class, className);
        }
      }
      if (this.tokenizer == null) { // could not load using artifact provider
        this.tokenizer = WhitespaceTokenizer.INSTANCE;
      }
    }
    return tokenizer;
  }

  /**
   * Sets the tokenizer. Passing {@code null} makes the next call to
   * {@link #getTokenizer()} resolve it again.
   *
   * @param tokenizer the tokenizer
   */
  public void setTokenizer(Tokenizer tokenizer) {
    this.tokenizer = tokenizer;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy