// All downloads are free. Search and download functionality uses the official Maven repository.

// org.cogroo.analyzer.ComponentFactory Maven / Gradle / Ivy

/**
 * Copyright (C) 2012 cogroo 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.analyzer;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.FutureTask;

import opennlp.model.AbstractModel;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

import org.apache.log4j.Logger;
import org.cogroo.config.Analyzers;
import org.cogroo.config.LanguageConfiguration;
import org.cogroo.config.LanguageConfigurationUtil;
import org.cogroo.config.Model;
import org.cogroo.dictionary.impl.FSADictionary;
import org.cogroo.tools.chunker2.ChunkerME;
import org.cogroo.tools.chunker2.ChunkerModel;
import org.cogroo.tools.featurizer.FeaturizerME;
import org.cogroo.tools.featurizer.FeaturizerModel;

import com.google.common.io.Closeables;

public class ComponentFactory implements ComponentFactoryI {

  protected static final Logger LOGGER = Logger
      .getLogger(ComponentFactory.class);

  private LanguageConfiguration lc = null;
  private Map modelPathMap;

  private ComponentFactory() {

  }

  private ComponentFactory(LanguageConfiguration lc) {
    this.lc = lc;

    modelPathMap = new HashMap(lc.getModel().size());

    for (Model m : lc.getModel()) {
      modelPathMap.put(m.getType(), m.getValue());
    }
  }

  public static ComponentFactory create(Locale locale) {
    ComponentFactory factory = null;

    factory = new ComponentFactory(LanguageConfigurationUtil.get(locale));

    return factory;
  }

  /**
   * Creates a {@link ComponentFactory} from a configuration file. The stream
   * remains open after execution.
   * 
   * @param configuration
   *          the configuration XML, that conforms with
   *          languageConfiguration.xsd
   * @return a {@link ComponentFactory}
   */
  public static ComponentFactory create(InputStream configuration) {
    ComponentFactory factory = null;
    factory = new ComponentFactory(LanguageConfigurationUtil.get(configuration));
    return factory;
  }

  public Analyzer createSentenceDetector() {
    long start = System.nanoTime();
    SentenceDetectorME sentenceDetector = null;
    InputStream modelIn = null;
    Analyzer analyzer = null; 
    if (modelPathMap.containsKey(Analyzers.SENTENCE_DETECTOR)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.SENTENCE_DETECTOR));
        SentenceModel model = new SentenceModel(modelIn);
        sentenceDetector = new SentenceDetectorME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load sentence model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (sentenceDetector == null)
        throw new InitializationException(
            "Couldn't load SentenceDetectorME class");

      analyzer = new SentenceDetector(sentenceDetector);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized SentenceDetector in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  public Analyzer createTokenizer() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    TokenizerME tokenizer = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.TOKENIZER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.TOKENIZER));
        TokenizerModel model = new TokenizerModel(modelIn);
        tokenizer = new TokenizerME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load tokenizer model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (tokenizer == null)
        throw new InitializationException("Couldn't load TokenizerME class");

      analyzer = new Tokenizer(tokenizer);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized Tokenizer in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  public Analyzer createNameFinder() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    NameFinderME nameFinder = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.NAME_FINDER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.NAME_FINDER));
        TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
        nameFinder = new NameFinderME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load name finder model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (nameFinder == null)
        throw new InitializationException("Couldn't load NameFinderME class");

      analyzer = new NameFinder(nameFinder);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized NameFinder in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  public Analyzer createContractionFinder() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    NameFinderME contractionFinder = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.CONTRACTION_FINDER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.CONTRACTION_FINDER));
        TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
        contractionFinder = new NameFinderME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load contractions finder model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (contractionFinder == null)
        throw new InitializationException("Couldn't load NameFinderME class");

      analyzer = new ContractionFinder(contractionFinder);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized ContractionFinder in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  public Analyzer createPOSTagger() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    POSTaggerME tagger = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.POS_TAGGER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.POS_TAGGER));
        POSModel model = new POSModel(modelIn);
        tagger = new POSTaggerME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load POS-tagger model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (tagger == null)
        throw new InitializationException("Couldn't load POSTaggerME class");

      analyzer = new POSTagger(tagger);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized POSTagger in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  public Analyzer createFeaturizer() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    FeaturizerME featurizer = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.FEATURIZER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.FEATURIZER));
        FeaturizerModel model = new FeaturizerModel(modelIn);
        featurizer = new FeaturizerME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load Featurizer model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (featurizer == null)
        throw new InitializationException("Couldn't load FeaturizerME class");

      analyzer = new Featurizer(featurizer);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized Featurizer in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  public Analyzer createLemmatizer() {
    long start = System.nanoTime();
    Analyzer analyzer = null;

    try {
      FSADictionary dict = FSADictionary
          .createFromResources("/fsa_dictionaries/pos/pt_br_jspell.dict");
      Lemmatizer lemmatizer = new Lemmatizer(dict);

      analyzer = lemmatizer;

    } catch (IllegalArgumentException e) {
      LOGGER.fatal("Couldn't load ");
      throw new InitializationException("Couldn't load", e);
    } catch (IOException e) {
      LOGGER.fatal("Couldn't find the dictionary.");
      throw new InitializationException("Couldn't locate dictionary", e);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized Lemmatizer in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }

    return analyzer;
  }

  public Analyzer createChunker() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    ChunkerME chunker = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.CHUNKER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.CHUNKER));
        ChunkerModel model = new ChunkerModel(modelIn);
        chunker = new ChunkerME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load Chunker model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (chunker == null)
        throw new InitializationException("Couldn't load ChunkerME class");

      analyzer = new Chunker(chunker);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized Chunker in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }
  
  public Analyzer createHeadFinder() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    ChunkerME headFinder = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.HEAD_FINDER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.HEAD_FINDER));
        ChunkerModel model = new ChunkerModel(modelIn);
        headFinder = new ChunkerME(model);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load HeadFinder model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (headFinder == null)
        throw new InitializationException("Couldn't load ChunkerME class");

      analyzer = new HeadFinder(headFinder);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized HeadFinder in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }
  
  public Analyzer createShallowParser() {
    long start = System.nanoTime();
    Analyzer analyzer = null;
    ChunkerME shallowParser = null;
    InputStream modelIn = null;

    if (modelPathMap.containsKey(Analyzers.SHALLOW_PARSER)) {
      try {
        modelIn = ComponentFactory.class.getResourceAsStream(modelPathMap
            .get(Analyzers.SHALLOW_PARSER));
        ChunkerModel model = new ChunkerModel(modelIn);
        logOutcomes(model.getChunkerModel());
        shallowParser = new ChunkerME(model, 20);
      } catch (IOException e) {
        LOGGER.fatal("Couldn't load ShallowParser model!", e);
      } finally {
        Closeables.closeQuietly(modelIn);
      }

      if (shallowParser == null)
        throw new InitializationException("Couldn't load ChunkerME class");

      analyzer = new ShallowParser(shallowParser);
    }
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized ShallowParser in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return analyzer;
  }

  private void logOutcomes(AbstractModel chunkerModel) {
    StringBuilder sb = new StringBuilder("Outcomes: ");
    for (int i = 0; i < chunkerModel.getNumOutcomes(); i++) {
      sb.append(chunkerModel.getOutcome(i)).append(" ");
    }
    LOGGER.info(sb.toString());
  }

  public Analyzer createPipe() {
    long start = System.nanoTime();
    Pipe pipe = new Pipe();

    
    // to accelerate the startup we do it in two steps. First we start initialization with
    // FutureTasks, and finally we wait for the results..
    
    FutureTask future;
    List> initializers = new LinkedList>();
    ExecutorService executor = Executors.newCachedThreadPool();
    
    LOGGER.info("Loading pipe assynchronously...");
    
    for (Analyzers analyzer : lc.getPipe().getAnalyzer()) {
      switch (analyzer) {
      case SENTENCE_DETECTOR:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createSentenceDetector();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case TOKENIZER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createTokenizer();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case NAME_FINDER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createNameFinder();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case CONTRACTION_FINDER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createContractionFinder();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case POS_TAGGER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createPOSTagger();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case FEATURIZER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createFeaturizer();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case LEMMATIZER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createLemmatizer();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case CHUNKER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createChunker();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case HEAD_FINDER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createHeadFinder();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      case SHALLOW_PARSER:
        future = new FutureTask(new Callable() {
          public Analyzer call() {
            return createShallowParser();
          }
        });
        executor.execute(future);
        initializers.add(future);
        break;
      default:
        throw new InitializationException("Unknown analyzer: " + analyzer);
      }
    }
    
    // now we get it...
    for (FutureTask futureTask : initializers) {
      try {
        pipe.add(futureTask.get());
      } catch (InterruptedException e) {
        throw new InitializationException("Failed to load pipe.", e);
      } catch (ExecutionException e) {
        throw new InitializationException("Failed to load pipe.", e);
      }
    }
    
    executor.shutdown();
    
    if (LOGGER.isInfoEnabled()) {
      LOGGER.info("Initialized Pipe and its components in "
          + ((System.nanoTime() - start) / 1000000) + "ms]");
    }
    return pipe;
  }

}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy