
/* LanguageTool, a natural language style checker
 * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package org.languagetool.chunking;

import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.tools.Tools;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * OpenNLP-based chunker. Also uses the OpenNLP tokenizer and POS tagger and
 * maps the result to our own tokens (we have our own tokenizer), as far as trivially possible.
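 *
 * <p>Usage example (an illustrative sketch; it assumes the core {@code JLanguageTool} /
 * {@code AnalyzedSentence} API and an English variant such as {@code AmericanEnglish}):</p>
 * <pre>{@code
 * JLanguageTool lt = new JLanguageTool(new AmericanEnglish());
 * AnalyzedSentence analyzed = lt.getAnalyzedSentence("The quick brown fox jumps over the lazy dog.");
 * List<AnalyzedTokenReadings> readings = Arrays.asList(analyzed.getTokens());
 * new EnglishChunker().addChunkTags(readings);
 * // readings that could be mapped now carry chunk tags (see EnglishChunkFilter for the tag set)
 * }</pre>
 *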
 * @since 2.3
 */
public class EnglishChunker implements Chunker {

  private static final String TOKENIZER_MODEL = "/en-token.bin";
  private static final String POS_TAGGER_MODEL = "/en-pos-maxent.bin";
  private static final String CHUNKER_MODEL = "/en-chunker.bin";

  /**
   * This needs to be static to save memory: as Language.LANGUAGES is static, any language
   * that is once created there will never be released. As English has several variants,
   * we'd have as many posModels etc. as we have variants -> huge waste of memory:
   */
  private static volatile TokenizerModel tokenModel;
  private static volatile POSModel posModel;
  private static volatile ChunkerModel chunkerModel;

  private final EnglishChunkFilter chunkFilter;

  public EnglishChunker() {
    try {
      if (tokenModel == null) {
        tokenModel = new TokenizerModel(Tools.getStream(TOKENIZER_MODEL));
      }
      if (posModel == null) {
        posModel = new POSModel(Tools.getStream(POS_TAGGER_MODEL));
      }
      if (chunkerModel == null) {
        chunkerModel = new ChunkerModel(Tools.getStream(CHUNKER_MODEL));
      }
      chunkFilter = new EnglishChunkFilter();
    } catch (IOException e) {
      throw new RuntimeException("Could not initialize English chunker", e);
    }
  }

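  // Runs the full pipeline: rebuild the sentence text, tokenize/POS-tag/chunk it with OpenNLP,
  // filter the chunk tags, and assign them to the readings that could be mapped by position.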
  @Override
  public void addChunkTags(List<AnalyzedTokenReadings> tokenReadings) {
    List<ChunkTaggedToken> origChunkTags = getChunkTagsForReadings(tokenReadings);
    List<ChunkTaggedToken> chunkTags = chunkFilter.filter(origChunkTags);
    assignChunksToReadings(chunkTags);
  }

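  // Tokenizes, POS-tags and chunks the rebuilt sentence with OpenNLP and pairs the resulting
  // chunk tags with the original readings wherever the character positions line up exactly.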
  private List<ChunkTaggedToken> getChunkTagsForReadings(List<AnalyzedTokenReadings> tokenReadings) {
    // these are not thread-safe, so create them here, not as members:
    String sentence = getSentence(tokenReadings);
    String[] tokens = cleanZeroWidthWhitespaces(tokenize(sentence)).toArray(new String[0]);
    String[] posTags = posTag(tokens);
    String[] chunkTags = chunk(tokens, posTags);
    if (tokens.length != posTags.length || tokens.length != chunkTags.length) {
      throw new RuntimeException("Length of results must be the same: " + tokens.length + ", " + posTags.length + ", " + chunkTags.length);
    }
    return getTokensWithTokenReadings(tokenReadings, tokens, chunkTags);
  }

  // workaround for the add-on, which adds ZERO WIDTH NO-BREAK SPACE, which confuses the chunker:
  @NotNull
  private List<String> cleanZeroWidthWhitespaces(String[] tokens) {
    List<String> cleanTokens = new ArrayList<>();
    for (String token : tokens) {
      String[] splits = token.split("\uFEFF");
      for (String split : splits) {
        if (split.length() == 0) {
          cleanTokens.add("");
        } else {
          cleanTokens.add(split);
        }
      }
    }
    return cleanTokens;
  }

  // non-private for test cases
  String[] tokenize(String sentence) {
    TokenizerME tokenizer = new TokenizerME(tokenModel);
    String cleanString = sentence.replace('’', '\'');  // this is the type of apostrophe that OpenNLP expects
    return tokenizer.tokenize(cleanString);
  }

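  // POS-tags the OpenNLP tokens; POSTaggerME is not thread-safe, so it is created per call.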
  private String[] posTag(String[] tokens) {
    POSTaggerME posTagger = new POSTaggerME(posModel);
    return posTagger.tag(tokens);
  }

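  // Chunks the tokens based on their POS tags; like the tagger, ChunkerME is created per call.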
  private String[] chunk(String[] tokens, String[] posTags) {
    ChunkerME chunker = new ChunkerME(chunkerModel);
    return chunker.chunk(tokens, posTags);
  }

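  // Pairs each OpenNLP token and its chunk tag with the AnalyzedTokenReadings at the same position,
  // using character offsets accumulated over the (whitespace-free) OpenNLP token sequence.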
  private List<ChunkTaggedToken> getTokensWithTokenReadings(List<AnalyzedTokenReadings> tokenReadings, String[] tokens, String[] chunkTags) {
    List<ChunkTaggedToken> result = new ArrayList<>();
    int i = 0;
    int pos = 0;
    for (String chunkTag : chunkTags) {
      int startPos = pos;
      int endPos = startPos + tokens[i].length();
      //System.out.println("OPEN: " + tokens[i]);
      AnalyzedTokenReadings readings = getAnalyzedTokenReadingsFor(startPos, endPos, tokenReadings);
      result.add(new ChunkTaggedToken(tokens[i], Collections.singletonList(new ChunkTag(chunkTag)), readings));
      pos = endPos;
      i++;
    }
    return result;
  }

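  // Copies the chunk tags onto those readings for which an exact position match was found.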
  private void assignChunksToReadings(List<ChunkTaggedToken> chunkTaggedTokens) {
    for (ChunkTaggedToken taggedToken : chunkTaggedTokens) {
      AnalyzedTokenReadings readings = taggedToken.getReadings();
      if (readings != null) {
        readings.setChunkTags(taggedToken.getChunkTags());
      }
    }
  }

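  // Rebuilds the sentence text by concatenating the original token texts (including whitespace).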
  private String getSentence(List<AnalyzedTokenReadings> sentenceTokens) {
    StringBuilder sb = new StringBuilder();
    for (AnalyzedTokenReadings token : sentenceTokens) {
      sb.append(token.getToken());
    }
    return sb.toString();
  }

  // Get only exact position matches - i.e. this can only be used for a trivial mapping
  // where tokens that are not exactly at the same position will be skipped. For example,
  // the tokens of "I'll" ([I] ['ll] vs [I]['][ll]) cannot be mapped with this.
  @Nullable
  private AnalyzedTokenReadings getAnalyzedTokenReadingsFor(int startPos, int endPos, List<AnalyzedTokenReadings> tokenReadings) {
    int pos = 0;
    for (AnalyzedTokenReadings tokenReading : tokenReadings) {
      String token = tokenReading.getToken();
      if (token.trim().isEmpty() ||
          (token.length() == 1 && Character.isSpaceChar(token.charAt(0)))) {  // needed for non-breaking space
        continue;  // the OpenNLP result has no whitespace, so we need to skip it
      }
      int tokenStart = pos;
      int tokenEnd = pos + token.length();
      if (tokenStart == startPos && tokenEnd == endPos) {
        //System.out.println("!!!" + startPos + " " + endPos + " " + tokenReading);
        return tokenReading;
      }
      pos = tokenEnd;
    }
    return null;
  }

}