org.languagetool.rules.uk.TokenAgreementAdjNounRule Maven / Gradle / Ivy

Go to download
/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Andriy Rysin
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.uk;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.ResourceBundle;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.rules.Categories;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.uk.InflectionHelper.Inflection;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.PosTagHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A rule that checks if adjective and following noun agree on gender and inflection
 * 
 * @author Andriy Rysin
 */
public class TokenAgreementAdjNounRule extends Rule {
  static final List FAKE_FEM_LIST = Arrays.asList("ступінь", "степінь", "продаж", "собака", "дріб", "ярмарок", "нежить", "рукопис", "накип", "насип", "путь");

  private static final Logger logger = LoggerFactory.getLogger(TokenAgreementAdjNounRule.class);

  static final Pattern ADJ_INFLECTION_PATTERN = Pattern.compile(":([mfnp]):(v_...)(:r(in)?anim)?");
  static final Pattern NOUN_INFLECTION_PATTERN = Pattern.compile("((?:[iu]n)?anim):([mfnps]):(v_...)");

  private final Synthesizer synthesizer;

  public TokenAgreementAdjNounRule(ResourceBundle messages, Language ukrainian) throws IOException {
    super.setCategory(Categories.MISC.getCategory(messages));
    this.synthesizer = ukrainian.getSynthesizer();
  }

  @Override
  public final String getId() {
    return "UK_ADJ_NOUN_INFLECTION_AGREEMENT";
  }

  @Override
  public String getDescription() {
    return "Узгодження відмінків, роду і числа прикметника та іменника";
  }

  public String getShort() {
    return "Узгодження прикметника та іменника";
  }

  private static class State {
    int adjPos;
    List adjTokenReadings = new ArrayList<>(); 
    AnalyzedTokenReadings adjAnalyzedTokenReadings = null;
    
    public boolean isEmpty() {
      return adjTokenReadings.isEmpty();
    }
    public void reset() {
      adjTokenReadings.clear();
      adjAnalyzedTokenReadings = null;
    }
  }
  
  @Override
  public final RuleMatch[] match(AnalyzedSentence sentence) {
    List ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();

    State state = new State();

    for (int i = 1; i < tokens.length; i++) {
      AnalyzedTokenReadings tokenReadings = tokens[i];

      String posTag0 = tokenReadings.getAnalyzedToken(0).getPOSTag();

      if( posTag0 == null ) {
        state.reset();
        continue;
      }

      if( state.isEmpty() ) {
        // no need to start checking on last token or if no noun
        if( i == tokens.length - 1 )
          continue;
      }
      else {

        if( (PosTagHelper.hasPosTagPartAll(tokenReadings, "adv")
             || Arrays.asList("дуже", "небагато", "багато").contains(tokenReadings.getCleanToken()) )
            //TODO: temp for adv/prep
            && ! ( i < tokens.length - 1
//                && Arrays.asList("стосовно", "відносно", "усередині", "всередині", "неподалік").contains(tokenReadings.getCleanToken())
//                 && PosTagHelper.hasPosTag(tokens[i+1], Pattern.compile(".*v_rod.*")) )
                && PosTagHelper.hasPosTagStart(tokens[i], "prep")
                && TokenAgreementPrepNounRule.hasVidmPosTag(
                     CaseGovernmentHelper.getCaseGovernments(tokens[i], IPOSTag.prep.name()), tokens[i+1]))
            && PosTagHelper.hasPosTagPart(state.adjTokenReadings, "adjp") ) {
          continue;
        }

      }

      // grab initial adjective inflections

      if( PosTagHelper.hasPosTagStart(tokens[i], "adj") ) {
        state.reset();

        //TODO: nv still can be wrong if :np/:ns is present to it's not much gain for lots of work
        if( PosTagHelper.hasPosTagPart(tokens[i], PosTagHelper.NO_VIDMINOK_SUBSTR)
            //TODO: turn back on when we can handle pron
//            || PosTagHelper.hasPosTagPart(tokens[i], "&pron")
//            || Arrays.asList("сам").contains(tokens[i].getCleanToken())
            || LemmaHelper.hasLemma(tokens[i], Arrays.asList("який", "котрий", "сам"))
            || PosTagHelper.hasPosTagPart(tokens[i], "<") )
          continue;

        //        if( LemmaHelper.hasLemma(tokens[i], Arrays.asList("червоний", "правий", "місцевий", "найсильніший", "найкращі"), ":p:")
        //            || LemmaHelper.hasLemma(tokens[i], Arrays.asList("новенький", "головний", "вибраний", "більший", "побачений", "подібний"), ":n:")
        //            || LemmaHelper.hasLemma(tokens[i], Arrays.asList("державний"), ":f:") ) {
        //          adjTokenReadings.clear();
        //          break;
        //        }

        if ( LemmaHelper.hasLemma(tokens[i], Arrays.asList("подібний"), ":n:") ) {
          state.reset();
          break;
        }

        for (AnalyzedToken token: tokenReadings) {
          String adjPosTag = token.getPOSTag();

          if( adjPosTag == null ) { // can happen for words with \u0301 or \u00AD
            continue;
          }

          if( adjPosTag.startsWith("adj") ) {
            state.adjPos = i;
            state.adjTokenReadings.add(token);
            state.adjAnalyzedTokenReadings = tokenReadings;
          }
          else if( ! LemmaHelper.hasLemma(tokenReadings, Arrays.asList("другий"), "adj:f:")
              || (i + 1 < tokens.length && ! LemmaHelper.hasLemma(tokens[i+1], FAKE_FEM_LIST, "noun:inanim:m:"))
              && ! PosTagHelper.isPredictOrInsert(token) ) {
            state.reset();
            break;
          }
        }

        continue;
      }

      if( state.isEmpty() )
        continue;
      
      if( // ! PosTagHelper.hasPosTagPart(tokens[i+1], "noun:")
          PosTagHelper.hasPosTagPart(tokens[i], PosTagHelper.NO_VIDMINOK_SUBSTR)
          || PosTagHelper.hasPosTagPart(tokens[i], "&pron")
         // || PosTagHelper.hasPosTagPart(tokens[i], "<") 
         ) {
        
        state.reset();
        continue;
      }


      List nounTokenReadings = new ArrayList<>();

      for (AnalyzedToken token: tokenReadings) {
        String nounPosTag = token.getPOSTag();

        if( nounPosTag == null ) { // can happen for words with \u0301 or \u00AD
          continue;
        }

        if( nounPosTag.startsWith("noun") ) { 
//            && ! nounPosTag.contains(PosTagHelper.NO_VIDMINOK_SUBSTR) ) {

          nounTokenReadings.add(token);
        }
        else if ( nounPosTag.equals(JLanguageTool.SENTENCE_END_TAGNAME)
            || nounPosTag.equals(JLanguageTool.PARAGRAPH_END_TAGNAME) ) {
          continue;
        }
        else if( ! PosTagHelper.isPredictOrInsert(token) ) {
          nounTokenReadings.clear();
          break;
        }
      }


      // no slave token - restart

      if( nounTokenReadings.isEmpty() ) {
        state.reset();
        continue;
      }

      logger.debug("=== Checking:\n\t{}\n\t{}", state.adjTokenReadings, nounTokenReadings);

      // perform the check

      List masterInflections = InflectionHelper.getAdjInflections(state.adjTokenReadings);

      List slaveInflections = InflectionHelper.getNounInflections(nounTokenReadings, Pattern.compile("v_zna:var"));

      if( Collections.disjoint(masterInflections, slaveInflections) ) {

        if( TokenAgreementAdjNounExceptionHelper.isException(tokens, state.adjPos, i, masterInflections, slaveInflections, state.adjTokenReadings, nounTokenReadings) ) {
          state.reset();
          continue;
        }

        if( logger.isDebugEnabled()) {
          logger.debug(MessageFormat.format("=== Found:\n\t{0}\n\t",
              state.adjAnalyzedTokenReadings.getToken() + ": " + masterInflections + " // " + state.adjAnalyzedTokenReadings,
            nounTokenReadings.get(0).getToken() + ": " + slaveInflections+ " // " + nounTokenReadings));
        }

        String msg = String.format("Потенційна помилка: прикметник не узгоджений з іменником: \"%s\": [%s] і \"%s\": [%s]", 
            state.adjTokenReadings.get(0).getToken(), formatInflections(masterInflections, true),
            nounTokenReadings.get(0).getToken(), formatInflections(slaveInflections, false));

        if( PosTagHelper.hasPosTagPart(state.adjTokenReadings, ":m:v_rod")
            && tokens[i].getToken().matches(".*[ую]")
            && PosTagHelper.hasPosTag(nounTokenReadings, Pattern.compile("noun.*?:m:v_dav.*")) ) {
          msg += CaseGovernmentHelper.USED_U_INSTEAD_OF_A_MSG;
        }
        else if( state.adjAnalyzedTokenReadings.getToken().contains("-")
            && Pattern.compile(".*([23]-є|[02-9]-а|[0-9]-м[иа])").matcher(state.adjAnalyzedTokenReadings.getToken()).matches() ) {
          msg += ". Можливо, вжито зайве літерне нарощення після кількісного числівника?";
        }
        else if( state.adjAnalyzedTokenReadings.getToken().startsWith("не")
        // TODO: && tag(adjAnalyzedTokenReadings.getToken().substring(2)) has adjp
            && PosTagHelper.hasPosTag(nounTokenReadings, "noun.*?:v_oru.*") ) {
          msg += ". Можливо, тут «не» потрібно писати окремо?";
        }
        else if( ! PosTagHelper.hasPosTag(state.adjTokenReadings, "adj.*?v_mis.*")
            && PosTagHelper.hasPosTag(nounTokenReadings, "noun.*?v_mis.*") ) {
          msg += ". Можливо, пропущено прийменник на/в/у...?";
        }

        RuleMatch potentialRuleMatch = new RuleMatch(this, sentence, state.adjAnalyzedTokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());

        List suggestions = new ArrayList<>();

        try {

        for (Inflection adjInflection : masterInflections) {
          String genderTag = ":"+adjInflection.gender+":";
          String vidmTag = adjInflection._case;


            if( ! adjInflection._case.equals("v_kly")
                && (adjInflection.gender.equals("p")
                || PosTagHelper.hasPosTagPart(nounTokenReadings, genderTag)) ) {
              for(AnalyzedToken nounToken: nounTokenReadings) {

                if( adjInflection.animMatters() ) {
                  if( ! nounToken.getPOSTag().contains(":" + adjInflection.animTag) )
                    continue;
                }

                String newNounPosTag = nounToken.getPOSTag().replaceFirst(":.:v_...", genderTag + vidmTag);

                String[] synthesized = synthesizer.synthesize(nounToken, newNounPosTag, false);

                for (String s : synthesized) {
                  String suggestion = state.adjAnalyzedTokenReadings.getToken() + " " + s;
                  if( ! suggestions.contains(suggestion) ) {
                    suggestions.add(suggestion);
                  }
                }
              }
            }
        }

        for (Inflection nounInflection : slaveInflections) {
          String genderTag = ":"+nounInflection.gender+":";
          String vidmTag = nounInflection._case;

          if( nounInflection.animMatters() ) {
            vidmTag += ":r" + nounInflection.animTag;
          }

          for(AnalyzedToken adjToken: state.adjTokenReadings) {
            String newAdjTag = adjToken.getPOSTag().replaceFirst(":.:v_...(:r(in)?anim)?", genderTag + vidmTag);

            String[] synthesized = synthesizer.synthesize(adjToken, newAdjTag, false);

            for (String s : synthesized) {
              String suggestion = s + " " + tokenReadings.getToken();
              if( ! suggestions.contains(suggestion) ) {
                suggestions.add(suggestion);
              }
            }
          }

        }

        } catch (IOException e) {
          throw new RuntimeException(e);
        }

        if( msg.contains("кількісного числівника") ) {
          String suggNum = state.adjAnalyzedTokenReadings.getCleanToken().replaceFirst("[-–]м[аи]$", "")
              + " " + tokenReadings.getToken();
          suggestions.add(suggNum);
        }
        
        if( suggestions.size() > 0 ) {
            potentialRuleMatch.setSuggestedReplacements(suggestions);
        }

        ruleMatches.add(potentialRuleMatch);
      }

      state.reset();
    }

    return toRuleMatchArray(ruleMatches);
  }

  static String formatInflections(List inflections, boolean adj) {

    Collections.sort(inflections);
    
    Map> map = new LinkedHashMap<>();
    
    for (Inflection inflection : inflections) {
      if( ! map.containsKey(inflection.gender) ) {
        map.put(inflection.gender, new ArrayList<>());
      }
      String caseStr = PosTagHelper.VIDMINKY_MAP.get(inflection._case);
      if( adj && inflection.animTag != null ) {
        caseStr += " (" + (inflection.animTag.equals("anim") ? "іст." : "неіст.") + ")";
      }
      map.get(inflection.gender).add(caseStr);
    }

    
    List list = new ArrayList<>();
    for(Entry> entry : map.entrySet()) {
      String genderStr = PosTagHelper.GENDER_MAP.get(entry.getKey());
      
      List caseValues = entry.getValue();

      list.add(genderStr + ": " + StringUtils.join(caseValues, ", "));
    }
    
    return StringUtils.join(list, ", ");
  }

}