All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.chunking.RussianChunker Maven / Gradle / Ivy

/* LanguageTool, a natural language style checker
 * Copyright (C) 2021 Yakov Reztsov (http://www.languagetool.org)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.Y
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.chunking;

import org.languagetool.Experimental;
import edu.washington.cs.knowitall.regex.Match;
import edu.washington.cs.knowitall.regex.RegularExpression;
import org.languagetool.AnalyzedTokenReadings;

import java.util.*;
import java.util.regex.Pattern;

import static org.languagetool.chunking.RussianChunker.PhraseType.*;

/**
 * A rule-based prototype Russian chunker. Please note that this chunker
 * has not been evaluated as a stand-alone chunker, it has only been used
 * in the context of LanguageTool's error detection rules.
 * @author Yakov Reztsov
 * Based on idea of German LanguageTool Сhunker.
 * @since 5.6
 */
@Experimental
public class RussianChunker implements Chunker {

  private static final Set FILTER_TAGS = new HashSet<>(Arrays.asList("PP", "NPP", "NPS", "MayMissingYO", "VP", "SBAR", "ADJP", "DPT"));
  private static final TokenExpressionFactory FACTORY = new TokenExpressionFactory(false);
 
  private static final Map SYNTAX_EXPANSION = new HashMap<>();
  static {
    SYNTAX_EXPANSION.put("", " *");
    SYNTAX_EXPANSION.put("", " *");
    SYNTAX_EXPANSION.put("", " *");
    SYNTAX_EXPANSION.put("", " *");


  }

  enum PhraseType {
    NP,   // "noun phrase", will be assigned as B-NP for the first token and I-NP for following tokens (like OpenNLP)
    NPS,  // "noun phrase singular"
    NPP,  // "noun phrase plural"
    PP,    // "prepositional phrase" and similar
    MayMissingYO,
    VP,     // verb phrase
    SBAR,
    ADJP,   // participle
    DPT     // adverbial participle
  }

  /** @deprecated for internal use only */
  public static void setDebug(boolean debugMode) {
    debug = debugMode;
  }
  /** @deprecated for internal use only */
  public static boolean isDebug() {
    return debug;
  }
  private static boolean debug = false;

  /*
   * REGEXES1 and REGEXES2 are OpenRegex (https:*github.com/knowitall/openregex) expressions.
   * REGEXES1 roughly emulates the behavior of the OpenNLP chunker by tagging the first
   * token of a noun phrase with B-NP and the remaining ones with I-NP.
   * REGEXES2 builds on those annotations to find complex noun phrases.
   *
   * Syntax:
   *    
   *       string: matches the token itself
   *       regex: matches the token against a regular expression
   *       regexCS: is like regex but case-sensitive
   *       chunk: matches the token's chunk
   *       pos: matches the token's POS tags
   *       posregex: matches the token's POS tags against a regular expression
   *       posre: is a synonym for posregex
   *     is a short form of 
   *     will match tokens with POS tags that contain X as a substring
   *
   * Example to combine two conditions via logical AND:
   *    
   * Example: Quote a regular expression so OpenRegex doesn't get confused:
   *    
   *
   * See SYNTAX_EXPANSION for strings that get expanded before interpreted by OpenRegex.
   * The chunks are added to the existing chunks, unless the last argument of build() is
   * true, in which case existing chunks get overwritten.
   */
  
  private static final List REGEXES1 = Arrays.asList(
      // Иванов Иван Иванович
      build(" + " , NP, true),
      // Иванов И.И.
      build("  <.>  <.> ", NP, true),
      // И.И. Иванов
      build(" <.>  <.>  ", NP, true),
      // verb+verb
      build("* " , VP, false),

      build("<если>", SBAR),  //
      build("<поэтому>", SBAR),  //
      // noun phrase 
      build("  " , NP, true),
     
      build("   " , NP, true),
      // adj -> participle phrase
      build("   " , ADJP, true),
      
      //adverbial participle
      build(" " , DPT),
      build("  " , DPT, true),
      build("   " , DPT, true),
      //participle
      build(" " , ADJP),

      build("  " , ADJP, true),

      build("  " , ADJP, true),
      build("   " , ADJP, true),
      build("    " , ADJP, true),
      build("   " , ADJP, true),      
      build("   " , ADJP, true),
      //
      build("  " , ADJP, false),
      //
      build("<тов>", NP)  // simulate OpenNLP?!
  );

  private static final List REGEXES2 = Arrays.asList(
      // ===== plural and singular noun phrases, based on OpenNLP chunker output ===============
      // "Маша и Миша":
      build(" <и> ", NPP, true),
      build(" <или> ", NPP, true),
      // не + VB
      build("<не> * " , VP, false)
  );

  private static RegularExpressionWithPhraseType build(String expr, PhraseType phraseType) {
    return build(expr, phraseType, false);
  }

  private static RegularExpressionWithPhraseType build(String expr, PhraseType phraseType, boolean overwrite) {
    String expandedExpr = expr;
    for (Map.Entry entry : SYNTAX_EXPANSION.entrySet()) {
      expandedExpr = expandedExpr.replace(entry.getKey(), entry.getValue());
    }
    RegularExpression expression = RegularExpression.compile(expandedExpr, FACTORY);
    return new RegularExpressionWithPhraseType(expression, phraseType, overwrite);
  }

  public RussianChunker() {
  }

  @Override
  public void addChunkTags(List tokenReadings) {
    List chunkTaggedTokens = getBasicChunks(tokenReadings);
    for (RegularExpressionWithPhraseType regex : REGEXES2) {
      apply(regex, chunkTaggedTokens);
    }
    assignChunksToReadings(chunkTaggedTokens);
  }

  List getBasicChunks(List tokenReadings) {
    List chunkTaggedTokens = new ArrayList<>();
    for (AnalyzedTokenReadings tokenReading : tokenReadings) {
      if ((!tokenReading.isWhitespace()) && (!tokenReading.getChunkTags().contains(new ChunkTag("MayMissingYO"))))    {
        List chunkTags = Collections.singletonList(new ChunkTag("O"));
          ChunkTaggedToken chunkTaggedToken = new ChunkTaggedToken(tokenReading.getToken(), chunkTags, tokenReading);
          chunkTaggedTokens.add(chunkTaggedToken);
      }
    }
    if (debug) {
      System.out.println("=============== CHUNKER INPUT ===============");
      System.out.println(getDebugString(chunkTaggedTokens));
    }
    for (RegularExpressionWithPhraseType regex : REGEXES1) {
      apply(regex, chunkTaggedTokens);
    }
    return chunkTaggedTokens;
  }

  private void apply(RegularExpressionWithPhraseType regex, List tokens) {
    String prevDebug = getDebugString(tokens);
    try {
      AffectedSpans affectedSpans = doApplyRegex(regex, tokens);
      String debug = getDebugString(tokens);
      if (!debug.equals(prevDebug)) {
        printDebugInfo(regex, affectedSpans, debug);
      }
    } catch (Exception e) {
      throw new RuntimeException("Could not apply chunk regexp '" + regex + "' to tokens: " + tokens, e);
    }
  }

  private void assignChunksToReadings(List chunkTaggedTokens) {
    for (ChunkTaggedToken taggedToken : chunkTaggedTokens) {
      AnalyzedTokenReadings readings = taggedToken.getReadings();
      if (readings != null) {
        readings.setChunkTags(taggedToken.getChunkTags());
      }
    }
  }

  private AffectedSpans doApplyRegex(RegularExpressionWithPhraseType regex, List tokens) {
    List> matches = regex.expression.findAll(tokens);
    List affectedSpans = new ArrayList<>();
    for (Match match : matches) {
      affectedSpans.add(new Span(match.startIndex(), match.endIndex()));
      for (int i = match.startIndex(); i < match.endIndex(); i++) {
        ChunkTaggedToken token = tokens.get(i);
        List newChunkTags = new ArrayList<>();
        newChunkTags.addAll(token.getChunkTags());
        if (regex.overwrite) {
          List filtered = new ArrayList<>();
          for (ChunkTag newChunkTag : newChunkTags) {
            if (!FILTER_TAGS.contains(newChunkTag.getChunkTag())) {
              filtered.add(newChunkTag);
            }
          }
          newChunkTags = filtered;
        }
        ChunkTag newTag = getChunkTag(regex, match, i);
        if (!newChunkTags.contains(newTag)) {
          newChunkTags.add(newTag);
          newChunkTags.remove(new ChunkTag("O"));
        }
        tokens.set(i, new ChunkTaggedToken(token.getToken(), newChunkTags, token.getReadings()));
      }
    }
    return new AffectedSpans(affectedSpans);
  }

  private ChunkTag getChunkTag(RegularExpressionWithPhraseType regex, Match match, int i) {
    ChunkTag newTag;
    if (regex.phraseType == NP) {
      // we assign the same tags as the OpenNLP chunker, noun
      if (i == match.startIndex()) {
        newTag = new ChunkTag("B-NP");
      } else {
        newTag = new ChunkTag("I-NP");
      }
    } else if (regex.phraseType == NPP) {
      // we assign the same tags as the OpenNLP chunker, plural noun
      if (i == match.startIndex()) {
        newTag = new ChunkTag("B-NP-plural");
      } else {
        newTag = new ChunkTag("I-NP-plural");
      }
    } else if (regex.phraseType == VP) {
      // we assign the same tags as the OpenNLP chunker, verb
      if (i == match.startIndex()) {
        newTag = new ChunkTag("B-VP");
      } else {
        newTag = new ChunkTag("I-VP");
      }
    } else if (regex.phraseType == ADJP) { 
      // 
      if (i == match.startIndex()) {
        newTag = new ChunkTag("B-ADJP");
      } else {
        newTag = new ChunkTag("I-ADJP");
      }
    } else if (regex.phraseType == DPT) {
      // 
      if (i == match.startIndex()) {
        newTag = new ChunkTag("B-DPT");
      } else {
        newTag = new ChunkTag("I-DPT");
      }

    }   else {
      newTag = new ChunkTag(regex.phraseType.name());
    }
    return newTag;
  }

  private void printDebugInfo(RegularExpressionWithPhraseType regex, AffectedSpans affectedSpans, String debug) {
    System.out.println("=== Applied " + regex + " ===");
    if (regex.overwrite) {
      System.out.println("Note: overwrite mode, replacing old " + FILTER_TAGS + " tags");
    }
    String[] debugLines = debug.split("\n");
    int i = 0;
    for (String debugLine : debugLines) {
      if (affectedSpans.isAffected(i)) {
        System.out.println(debugLine.replaceFirst("^  ", " *"));
      } else {
        System.out.println(debugLine);
      }
      i++;
    }
    System.out.println();
  }

  private String getDebugString(List tokens) {
    if (!debug) {
      return "";
    }
    StringBuilder sb = new StringBuilder();
    for (ChunkTaggedToken token : tokens) {
      String tokenReadingStr = token.getReadings().toString().replaceFirst(Pattern.quote(token.getToken()) + "\\[", "[");
      sb.append("  ").append(token).append(" -- ").append(tokenReadingStr).append('\n');
    }
    return sb.toString();
  }

  private static class Span {
    final int startIndex;
    final int endIndex;
    Span(int startIndex, int endIndex) {
      this.startIndex = startIndex;
      this.endIndex = endIndex;
    }
  }

  private static class AffectedSpans {
    final List spans;
    AffectedSpans(List spans) {
      this.spans = spans;
    }
    boolean isAffected(int pos) {
      for (Span span : spans) {
        if (pos >= span.startIndex && pos < span.endIndex) {
          return true;
        }
      }
      return false;
    }
  }

  private static class RegularExpressionWithPhraseType {
    final RegularExpression expression;
    final PhraseType phraseType;
    final boolean overwrite;
    RegularExpressionWithPhraseType(RegularExpression expression, PhraseType phraseType, boolean overwrite) {
      this.expression = expression;
      this.phraseType = phraseType;
      this.overwrite = overwrite;
    }
    @Override
    public String toString() {
      return phraseType + " <= " + expression + " (overwrite: " + overwrite + ")";
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy