All downloads are free. The search and download functionality uses the official Maven repository.

org.languagetool.LanguageAnnotator Maven / Gradle / Ivy

Go to download

LanguageTool is open-source proofreading software for English, French, German, Polish, Romanian, and more than 20 other languages. It finds many errors that a simple spell checker cannot detect, such as mixing up there/their, and it also detects some grammar problems.

There is a newer version: 6.5
Show newest version
/* LanguageTool, a natural language style checker
 * Copyright (C) 2020 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool;

import org.languagetool.rules.spelling.VagueSpellChecker;

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Detect language per token and add that to the analysis.
 * @since 4.9
 */
public class LanguageAnnotator {

  private static final int MIN_TOKENS = 4;  // fragments shorter than this will get the previous fragment's language

  public LanguageAnnotator() {
  }

  public List detectLanguages(String input, Language mainLang, List secondLangs) {
    List tokens = getTokensWithPotentialLanguages(input, mainLang, secondLangs);
    List> tokenRanges = getTokenRanges(tokens);   // split at boundaries like "."
    //System.out.println("tokenRanges: " + tokenRanges);
    List tokenRangesWithLang = getTokenRangesWithLang(tokenRanges, mainLang, secondLangs);
    //System.out.println("tokenRangesWithLang: " + tokenRangesWithLang);
    Language curLang;
    Language prevLang = mainLang;
    int curPos = 0;
    int fromPos = 0;
    List result = new ArrayList<>();
    for (TokenRangeWithLanguage tokenRange : tokenRangesWithLang) {
      curLang = tokenRange.lang;
      if (tokenRange.tokens.size() == 1 && isQuote(tokenRange.tokens.get(0))) {
        // assign single quote the language of the previous element, assuming it belongs there
        curLang = prevLang;
      } else if (curLang != prevLang) {
        result.add(new FragmentWithLanguage(prevLang.getShortCodeWithCountryAndVariant(), input.substring(fromPos, curPos)));
        fromPos = curPos;
      }
      prevLang = curLang;
      curPos += tokenRange.tokens.stream().mapToInt(String::length).sum();
    }
    result.add(new FragmentWithLanguage(prevLang.getShortCodeWithCountryAndVariant(), input.substring(fromPos)));
    return result;
  }

  List getTokensWithPotentialLanguages(String input, Language mainLang, List secondLangs) {
    List tokens = new ArrayList<>();
    long t1 = System.nanoTime();
    // TODO: tokenizing might be different for languages...
    VagueSpellChecker speller = new VagueSpellChecker();
    List mainLangTokens = mainLang.getWordTokenizer().tokenize(input);
    for (String token : mainLangTokens) {
      if (isWord(token) && speller.isValidWord(token, mainLang)) {
        tokens.add(new TokenWithLanguages(token, mainLang));
      } else {
        tokens.add(new TokenWithLanguages(token));
      }
    }
    for (Language secondLang : secondLangs) {
      int i = 0;
      for (TokenWithLanguages token : tokens) {
        if (isWord(token.token) && speller.isValidWord(token.token, secondLang)) {
          List langs = new ArrayList<>(token.langs);
          langs.add(secondLang);
          tokens.set(i, new TokenWithLanguages(token.token, langs));
        }
        i++;
      }
    }
    long t2 = System.nanoTime();
    long runTime = t2 - t1;
    //System.out.printf(Locale.ENGLISH, "%.0fms, %.2f ms/lookup, %d lookups [%s, %s]\n", runTime/1000.0/1000.0,
    //  (float)runTime/mainLangTokens.size()/1000.0/1000.0,
    //  mainLangTokens.size(), mainLang.getShortCodeWithCountryAndVariant(), secondLangs);
    return tokens;
  }

  List> getTokenRanges(List tokens) {
    List> result = new ArrayList<>();
    List l = new ArrayList<>();
    boolean inQuote = false;
    for (TokenWithLanguages token : tokens) {
      if (isQuote(token.token) && !inQuote) {
        // this is an opening quote - add to next range
        if (l.size() > 0) {
          result.add(l);
        }
        l = new ArrayList<>();
        l.add(token);
      } else if (isBoundary(token)) {
        l.add(token);
        result.add(l);
        l = new ArrayList<>();
      } else {
        l.add(token);
      }
      if (isQuote(token.token)) {
        inQuote = !inQuote;
      }
    }
    if (l.size() > 0) {
      result.add(l);
    }
    return result;
  }

  // For each range, assign the language that has most tokens there. For ambiguous parts
  // (no winning language), use the top language of the next non-ambiguous part:
  List getTokenRangesWithLang(List> tokenRanges, Language mainLang, List secondLangs) {
    List result = new ArrayList<>();
    int i = 0;
    Language prevTopLang = null;
    for (List tokens : tokenRanges) {
      //System.out.println(">"+tokens);
      Language topLang = null;
      boolean allAmbiguous = tokens.stream().allMatch(k -> k.ambiguous());
      if (allAmbiguous) {
        //System.out.println("ALL AMBIG: " + tokens);
        for (int j = i + 1; j < tokenRanges.size(); j++) {
          //System.out.println("testing " + j);
          List nextTokens = tokenRanges.get(j);
          boolean nextAllAmbiguous = nextTokens.stream().allMatch(k -> k.ambiguous());
          if (!nextAllAmbiguous) {
            topLang = getTopLang(mainLang, secondLangs, nextTokens);
            //System.out.println("not ambig at " + j + " (" + nextTokens + ")" +  ", top lang  + " + topLang);
            break;
          }
        }
      }
      if (topLang == null) {
        if (tokens.size() < MIN_TOKENS && prevTopLang != null) {
          topLang = prevTopLang;
        } else {
          topLang = getTopLang(mainLang, secondLangs, tokens);
        }
      }
      List tokenList = tokens.stream().map(k -> k.token).collect(Collectors.toList());
      result.add(new TokenRangeWithLanguage(tokenList, topLang));
      prevTopLang = topLang;
      i++;
    }
    return result;
  }

  private Language getTopLang(Language mainLang, List secondLangs, List tokens) {
    Map langToCount = new HashMap<>();
    langToCount.put(mainLang, tokens.stream().mapToInt(k -> k.langs.contains(mainLang) ? 1 : 0).sum());
    for (Language lang : secondLangs) {
      int langCount = tokens.stream().mapToInt(k -> k.langs.contains(lang) ? 1 : 0).sum();
      langToCount.put(lang, langCount);
    }
    int max = 0;
    Language topLang = mainLang;
    for (Map.Entry entry : langToCount.entrySet()) {
      if (entry.getValue() > max) {
        topLang = entry.getKey();
        max = entry.getValue();
      }
    }
    return topLang;
  }

  private boolean isBoundary(TokenWithLanguages token) {
    return token.token.matches("[.?!;:\"„“»«()\\[\\]\n]");   // TODO: " - "
  }

  private boolean isQuote(String token) {
    return token.matches("[\"„“”»«]");  }

  private static boolean isWord(String s) {
    return !s.trim().isEmpty() && s.matches("\\w+");
  }

  static class TokenWithLanguages {
    private final String token;
    private final List langs;
    TokenWithLanguages(String token, Language... langs) {
      this.token = Objects.requireNonNull(token);
      this.langs = new ArrayList<>(Arrays.asList(Objects.requireNonNull(langs)));
    }
    TokenWithLanguages(String token, List langs) {
      this.token = Objects.requireNonNull(token);
      this.langs = Objects.requireNonNull(langs);
    }
    boolean ambiguous() {
      return langs.size() != 1;
    }
    @Override
    public String toString() {
      //return token + "/" + langs.stream().map(k -> k.getShortCodeWithCountryAndVariant()).collect(Collectors.joining("/"));
      if (isWord(token)) {
        return token + "/" + langs.stream().map(k -> k.getShortCodeWithCountryAndVariant()).collect(Collectors.joining("/"));
      } else {
        return token;
      }
    }
    String getToken() {
      return token;
    }
    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;
      TokenWithLanguages that = (TokenWithLanguages) o;
      return token.equals(that.token) && langs.equals(that.langs);
    }
    @Override
    public int hashCode() {
      return Objects.hash(token, langs);
    }
  }
  
  static class TokenRangeWithLanguage {
    private final List tokens;
    private final Language lang;
    TokenRangeWithLanguage(List tokens, Language lang) {
      this.tokens = Objects.requireNonNull(tokens);
      this.lang = Objects.requireNonNull(lang);
    }
    @Override
    public String toString() {
      return lang.getShortCodeWithCountryAndVariant() + ": " + tokens;
    }
  }
  
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy