/* LanguageTool, a natural language style checker 
 * Copyright (C) 2013 Andriy Rysin
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.rules.uk;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.Language;
import org.languagetool.rules.Categories;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.uk.LemmaHelper.Dir;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.PosTagHelper;

/**
 * A rule that checks whether a preposition and the following noun agree on case (inflection).
 * 
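 * <p>A minimal usage sketch (assumes the standard JLanguageTool API; the sample
 * sentence is the erroneous «згідно з документа» mentioned in a comment below):</p>
 * <pre>{@code
 * Ukrainian uk = new Ukrainian();
 * Rule rule = new TokenAgreementPrepNounRule(JLanguageTool.getMessageBundle(), uk);
 * JLanguageTool lt = new JLanguageTool(uk);
 * RuleMatch[] matches = rule.match(lt.getAnalyzedSentence("згідно з документа"));
 * }</pre>
 *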
 * @author Andriy Rysin
 */
public class TokenAgreementPrepNounRule extends Rule {
  
  private static final List<String> Z_ZI_IZ = Arrays.asList("з", "зі", "із");
  private static final List<String> Z_ZI_IZ_ZO = Arrays.asList("з", "зі", "із", "зо");
  private static final Pattern NOUN_ANIM_V_NAZ_PATTERN = Pattern.compile("noun:anim:.:v_naz.*");
  private static final String VIDMINOK_SUBSTR = ":v_";
  private static final Pattern VIDMINOK_REGEX = Pattern.compile(":(v_[a-z]+)");
  private static final String reqAnimInanimRegex = ":r(?:in)?anim";
  private static final Pattern REQ_ANIM_INANIM_PATTERN = Pattern.compile(reqAnimInanimRegex);

  private final Synthesizer synthesizer;
  private final Language ukrainian;

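  /**
   * Mutable scan state for the current preposition: its position and readings, the case
   * tags it governs (posTagsToFind), and whether v_zna was dropped for «з»/«зі»/«із».
   */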
  static class State {
    int prepPos;
    AnalyzedTokenReadings prepTokenReadings = null;
    boolean ziZnaRemoved = false;
    Set<String> posTagsToFind;
  }

  
  public TokenAgreementPrepNounRule(ResourceBundle messages, Language ukrainian) throws IOException {
    super.setCategory(Categories.MISC.getCategory(messages));
    this.ukrainian = ukrainian;
    this.synthesizer = ukrainian.getSynthesizer();
  }

  @Override
  public final String getId() {
    return "UK_PREP_NOUN_INFLECTION_AGREEMENT";
  }

  @Override
  public String getDescription() {
    return "Узгодження прийменника та іменника у реченні";
  }

  public String getShort() {
    return "Узгодження прийменника та іменника";
  }
  
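  /**
   * Scans the sentence left to right: remembers the most recent preposition in a {@link State},
   * skips known exceptions and parenthetical insertions, and reports a match when the token
   * governed by that preposition carries none of the required case tags.
   */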
  @Override
  public final RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
    State state = null;
    
    for (int i = 1; i < tokens.length; i++) {
      AnalyzedTokenReadings tokenReadings = tokens[i];

      String posTag = tokenReadings.getAnalyzedToken(0).getPOSTag();
      String thisToken = tokenReadings.getCleanToken();

      // через, м’яко кажучи, невеликої популярності
//      if( prepTokenReadings != null ) {
//        int insertEndPos = findInsertEnd(prepTokenReadings, tokens, i, false);
//        if( insertEndPos > 0 ) {
//          i=insertEndPos;
//          continue;
//        }
//      }

      if (posTag == null
          || posTag.contains(IPOSTag.unknown.getText()) ){
        state = null;
        continue;
      }

      // Cyrillic "В" is often typed instead of Latin "B", e.g. "гепатит В"
      // first token is always SENT_START
      if( i > 1
          && thisToken.length() == 1 
          && Character.isUpperCase(thisToken.charAt(0)) 
          && tokenReadings.isWhitespaceBefore() 
          && tokens[i-1].getToken().matches(".*[а-яіїєґ0-9]")) {
        state = null;
        continue;
      }

      AnalyzedToken multiwordReqToken = getMultiwordToken(tokenReadings);
      if( multiwordReqToken != null ) {

        if (Z_ZI_IZ.contains(tokenReadings.getCleanToken().toLowerCase()) 
            && multiwordReqToken.getLemma().startsWith("згідно ") ) { // напр. "згідно з"
          posTag = multiwordReqToken.getPOSTag(); // "rv_oru";
          state = new State();
          state.prepTokenReadings = tokenReadings;
          state.prepPos = i;
          continue;
        }
        else {
          if( posTag.startsWith(IPOSTag.prep.name()) ) {
            state = null;
            continue;
          }

          String mwPosTag = multiwordReqToken.getPOSTag();
          if( ! mwPosTag.contains("adv") && ! mwPosTag.contains("insert") ) {
            state = null;
          }
        }

        continue;
      }


      String token = tokenReadings.getCleanToken();
      if( posTag.startsWith(IPOSTag.prep.name()) ) {
        String prep = token.toLowerCase();

        // що то була за людина
        if( prep.equals("за") && LemmaHelper.reverseSearch(tokens, i, 4, Pattern.compile("що"), null) ) {
          state = null;
          continue;
        }

        // з понад тисячі
        if( prep.equals("понад") )
          continue;

        if( prep.equals("шляхом") || prep.equals("од") || prep.equals("поруч") ) {
          state = null;
          continue;
        }

        state = new State();
        state.prepTokenReadings = tokenReadings;
        state.prepPos = i;
        continue;
      }

      if( state == null )
        continue;

      // з Ван Дамом
      if( Arrays.asList("ван").contains(tokens[i].getCleanToken().toLowerCase()) ) {
        // prepTokenReadings = null;
        continue;
      }
      if( Arrays.asList("Фон").contains(tokens[i].getCleanToken()) ) {
        // prepTokenReadings = null;
        continue;
      }


      // Do actual check

      state.posTagsToFind = new LinkedHashSet<>();
      String prep = state.prepTokenReadings.getAnalyzedToken(0).getLemma();
//      state.prepTokenReadings = state.prepAnalyzedTokenReadings.getReadings();

      // замість Андрій вибрали Федір
      if( prep.equals("замість") ) {
        state.posTagsToFind.add("v_naz");
      }

      Set<String> expectedCases = CaseGovernmentHelper.getCaseGovernments(state.prepTokenReadings, IPOSTag.prep.name());

      // згідно з документа
      if( Z_ZI_IZ_ZO.contains(prep) ) {
        if( "нізвідки".equalsIgnoreCase(tokens[i].getCleanToken()) ) {
          state = null;
          continue;
        }
      
        if( Z_ZI_IZ.contains(prep)
            && i >= 3 
            && tokens[i-2].getCleanToken().equalsIgnoreCase("згідно")) {
          expectedCases = new HashSet<>(Arrays.asList("v_oru"));
        } else if( ! isLikelyApproxWithZi(tokens, i, state) ) {
          expectedCases.remove("v_zna");
          state.ziZnaRemoved = true;
        }
      }

      // we want to ignore «залежно» + noun, but we want to catch «незважаючи» без «на»
//      if( expectedCases.isEmpty() ) {
//        prepTokenReadings = null;
//        continue;
//      }

      expectedCases.remove("v_inf"); // we don't care about rv_inf here
      state.posTagsToFind.addAll(expectedCases);

      RuleException exception = TokenAgreementPrepNounExceptionHelper.getExceptionStrong(tokens, i, state.prepTokenReadings);
      switch( exception.type ) {
      case exception:
        state = null;
        continue;
      case skip:
        i += exception.skip;
        continue;
      case none:
        break;
      }

      
      if( PosTagHelper.hasPosTagPart(tokenReadings, ":v_") ) {

        // "домовився за їх." - non-normative form
        List<AnalyzedToken> pronPosNounReadings = tokenReadings.getReadings().stream()
            .filter(r -> PosTagHelper.hasPosTag(r, Pattern.compile("noun:unanim:.:v_rod.*pron.*")) 
                && Arrays.asList("вони", "він", "вона", "воно").contains(r.getLemma()))
            .collect(Collectors.toList());

        // нього-таки тощо
        //TODO: |його|її
        if( pronPosNounReadings.size() > 0 && ! thisToken.toLowerCase().matches("(них|нього|неї)(-[а-я]+)?") ) {
          if( i < tokens.length - 1 
              && (PosTagHelper.hasPosTag(tokens[i+1], Pattern.compile("(noun|adj|adv|part|num|conj:coord|noninfl).*"))
                  || StringUtils.defaultIfBlank(tokens[i+1].getCleanToken(), "").matches("[\"«„“/$€…]|[a-zA-Z'-]+") ) ) {
            // test next
            // при його ділянці 
            continue;
          }
          else {
            int insertEndPos = findInsertEnd(state.prepTokenReadings, tokens, i+1, true);
            if( insertEndPos > 0 ) {
              i=insertEndPos;
              continue;
            }
            
            RuleMatch potentialRuleMatch = createRuleMatch(state, sentence, tokens, i);
            ruleMatches.add(potentialRuleMatch);
            state = null;
            continue;
          }
        }

        List<AnalyzedToken> pronPosAdjReadings = tokenReadings.getReadings().stream()
            .filter(r -> PosTagHelper.hasPosTag(r, Pattern.compile("adj.*pron:pos(?!:bad).*")) 
                && Arrays.asList("їх", "його", "її").contains(r.getLemma()))
            .collect(Collectors.toList());

        // to detect: завдяки його зусиллі
        if( pronPosAdjReadings.size() > 0 ) {

          if (! TokenAgreementPrepNounRule.hasVidmPosTag(state.posTagsToFind, pronPosAdjReadings)) {
            RuleMatch potentialRuleMatch = createRuleMatch(state, sentence, tokens, i);
            ruleMatches.add(potentialRuleMatch);
            state = null;
            continue;
          }

          if( i < tokens.length - 1 ) {
            // test next
            // при їхній ділянці 
            continue;
          }
        }
        else if ( thisToken.equals("їх") ) {
          RuleMatch potentialRuleMatch = createRuleMatch(state, sentence, tokens, i);
          ruleMatches.add(potentialRuleMatch);
          state = null;
          continue;
        }

        if( hasVidmPosTag(state.posTagsToFind, tokenReadings) ) {
          state = null;
          continue;
        }

        exception = TokenAgreementPrepNounExceptionHelper.getExceptionNonInfl(tokens, i, state);
        switch( exception.type ) {
        case exception:
          state = null;
          continue;
        case skip:
          i += exception.skip;
          continue;
        case none:
          break;
        }

        exception = TokenAgreementPrepNounExceptionHelper.getExceptionInfl(tokens, i, state);
        switch( exception.type ) {
        case exception:
          state = null;
          continue;
        case skip:
          i += exception.skip;
          continue;
        case none:
          break;
        }

        RuleMatch potentialRuleMatch = createRuleMatch(state, sentence, tokens, i);
        ruleMatches.add(potentialRuleMatch);
      }
      else { // no _v found

        exception = TokenAgreementPrepNounExceptionHelper.getExceptionNonInfl(tokens, i, state);
        switch( exception.type ) {
        case exception:
          state = null;
          continue;
        case skip:
          i += exception.skip;
          continue;
        case none:
          break;
        }

      }
      state = null;
    }

    return toRuleMatchArray(ruleMatches);
  }

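  // Lemmas/tags that mark an approximate-measurement context (e.g. "приблизно", "завбільшки",
  // measurement nouns) and the measured-unit lemmas (time, distance, pseudo-numerals, "ложка")
  // used by isLikelyApproxWithZi() below.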
  private static final List<String> approxLemmas = Arrays.asList(
      "розмір", "величина", "товщина", "вартість", "ріст", "зріст", "висота", "глибина", "діаметр", "вага", "обсяг", "площа",
      "приблизно", "десь", "завбільшки", "завширшки", "завдовжки", "завтовшки", "заввишки", "завглибшки");
  private static final Pattern approxTag = Pattern.compile("noun.*v_oru.*|adv.*|part.*");
  private static final Set<String> lemmas = new HashSet<>(LemmaHelper.TIME_LEMMAS);
  static {
    lemmas.addAll(LemmaHelper.DISTANCE_LEMMAS);
    lemmas.addAll(LemmaHelper.PSEUDO_NUM_LEMMAS);
    lemmas.addAll(Arrays.asList("ложка", "ложечка"));
  }

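  /**
   * Heuristic for approximate-quantity phrases with «з»/«зі»/«із» (e.g. «з годину» = "about an hour"),
   * where an accusative after the preposition is legitimate and should not be flagged.
   */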
  private boolean isLikelyApproxWithZi(AnalyzedTokenReadings[] tokens, int i, State state) {
    // TODO: ледь не

    // з 2-поверховий, з 10-поверхівку
    if( tokens[i].getCleanToken().matches(".*поверх(ов|ів).*") )
      return true;

    return PosTagHelper.hasPosTag(tokens[i], Pattern.compile("noun:inanim:[fnm]:v_zna.*num.*|num.*"))
        || LemmaHelper.hasLemma(tokens[i], lemmas, Pattern.compile("noun:inanim:[mnf]:v_zna.*"))
        || (i < tokens.length - 1 
            && PosTagHelper.hasPosTag(tokens[i], Pattern.compile("adj:[mnf]:v_zna.*"))
            && LemmaHelper.hasLemma(tokens[i+1], lemmas, Pattern.compile("noun:inanim:[mnf]:v_zna.*")))
        || LemmaHelper.hasLemma(tokens[state.prepPos-1], approxLemmas, approxTag)
        || (i < tokens.length - 1 
            && LemmaHelper.hasLemma(tokens[i+1], approxLemmas, approxTag));
  }

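  /**
   * Returns the index the caller should fast-forward to in order to skip a particle («ж»/«же»)
   * or a short parenthetical delimited by commas/brackets, or -1 if there is nothing to skip.
   */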
  private static int findInsertEnd(AnalyzedTokenReadings prepTokenReadings, AnalyzedTokenReadings[] tokens, int i, boolean lookForPart) {
    if( i >= tokens.length - 2 )
      return -1;
    
    int nextPos = i;
    AnalyzedTokenReadings tokenReadings = tokens[i];
    
    if( i > tokens.length - 2 )
      return -1;

    if( tokenReadings.getCleanToken().matches("же?") ) {
      nextPos = i+1;
    }

    if( nextPos > tokens.length - 3 )
      return nextPos==i ? -1 : nextPos-1;

    if( tokenReadings.isPosTagUnknown() && tokenReadings.getCleanToken().matches("[,(]") ) {
      int commaPos = LemmaHelper.tokenSearch(tokens, i+1, (String)null, Pattern.compile("[,)]"), null, Dir.FORWARD);
      if( commaPos > i+1 && commaPos < i+6 && commaPos < tokens.length-1 && ! tokens[commaPos+1].getCleanToken().equals("що") ) {
        if( tokenReadings.getCleanToken().replace('(', ')').equals(tokens[commaPos].getCleanToken()) )
          return commaPos;
      }
    }
    return nextPos==i ? -1 : nextPos-1;
  }

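  /**
   * Returns true if any reading carries one of the required case tags (v_*), if the token is
   * uninflected (nv), or if no reading carries case information at all - the POS dictionary
   * is incomplete, so missing case tags are not treated as an error.
   */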
  static boolean hasVidmPosTag(Collection<String> posTagsToFind, AnalyzedTokenReadings tokenReadings) {
    return hasVidmPosTag(posTagsToFind, tokenReadings.getReadings());
  }

  static boolean hasVidmPosTag(Collection<String> posTagsToFind, List<AnalyzedToken> tokenReadings) {
    boolean vidminokFound = false;  // because POS dictionary is not complete

    for(AnalyzedToken token: tokenReadings) {
      String posTag = token.getPOSTag();

      if( posTag == null ) { // && ! ".".equals(tokenReadings.get(0).getToken()) ) {
        if( tokenReadings.size() == 1)
          return true;

        continue;
      }

      // shortcut
      if( posTag.contains(PosTagHelper.NO_VIDMINOK_SUBSTR) )
        return true;

      if( posTag.contains(VIDMINOK_SUBSTR) ) {
        vidminokFound = true;

        for(String posTagToFind: posTagsToFind) {
          if ( posTag.contains(posTagToFind) )
            return true;
        }
      }
    }

    return ! vidminokFound; //false;
  }

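  /**
   * Builds the rule match: the message names the case(s) the preposition requires versus the
   * case(s) found, suggestions are synthesized in the required case(s), and special hints cover
   * «їх»/«його»/«її», the vocative after «о», and preposition+adverb merges.
   */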
  private RuleMatch createRuleMatch(State state, AnalyzedSentence sentence, AnalyzedTokenReadings[] tokens, int i) throws IOException {
    AnalyzedTokenReadings tokenReadings = tokens[i];
    String tokenString = tokenReadings.getCleanToken().toLowerCase();
    
    List<String> suggestions = new ArrayList<>();
    
    String requiredPostTagsRegEx = ":(" + String.join("|", state.posTagsToFind) + ")";
    for (AnalyzedToken analyzedToken: tokenReadings.getReadings()) {
    
      String oldPosTag = analyzedToken.getPOSTag();
      
      if( oldPosTag == null )
        continue;
      
      String requiredPostTagsRegExToApply = requiredPostTagsRegEx;

      Matcher matcher = REQ_ANIM_INANIM_PATTERN.matcher(oldPosTag);
      if( matcher.find() ) {
        requiredPostTagsRegExToApply += matcher.group(0);
      }
      else {
        requiredPostTagsRegExToApply += "(?:" + reqAnimInanimRegex + ")?";
      }

      String posTag = oldPosTag.replaceFirst(":v_[a-z]+", requiredPostTagsRegExToApply);

      try {
        String[] synthesized = synthesizer.synthesize(analyzedToken, posTag, true);

        suggestions.addAll( Arrays.asList(synthesized) );
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    
    if( suggestions.size() > 0 ) {  // remove duplicates
      suggestions = new ArrayList<>(new LinkedHashSet<>(suggestions));
    }

    List<String> reqVidminkyNames = new ArrayList<>();
    for (String vidm: state.posTagsToFind) {
      reqVidminkyNames.add(PosTagHelper.VIDMINKY_MAP.get(vidm));
    }

    List<String> foundVidminkyNames = new ArrayList<>();
    for (AnalyzedToken token: tokenReadings) {
      String posTag2 = token.getPOSTag();
      if( posTag2 != null && posTag2.contains(VIDMINOK_SUBSTR) ) {
        String vidmName = PosTagHelper.VIDMINKY_MAP.get(posTag2.replaceFirst("^.*"+VIDMINOK_REGEX+".*$", "$1"));
        if( foundVidminkyNames.contains(vidmName) ) {
          if (posTag2.contains(":p:")) {
            vidmName = vidmName + " (мн.)";
            foundVidminkyNames.add(vidmName);
          }
          // else skip dup
        }
        else {
          foundVidminkyNames.add(vidmName);
        }
      }
    }

    String msg = MessageFormat.format("Прийменник «{0}» вимагає іншого відмінка: {1}, а знайдено: {2}", 
        state.prepTokenReadings.getToken(), String.join(", ", reqVidminkyNames), String.join(", ", foundVidminkyNames));

    if( state.ziZnaRemoved ) {
      msg += ". Але з.в. вимагається у випадках порівнянн предметів.";
    }

    if( state.posTagsToFind.contains("v_rod")
        && tokens[i].getToken().matches(".*[ую]")
        && PosTagHelper.hasPosTag(tokenReadings.getReadings(), Pattern.compile("noun.*?:m:v_dav.*")) ) {
      msg += CaseGovernmentHelper.USED_U_INSTEAD_OF_A_MSG;
    }
    else if( tokenString.equals("їх") && requiredPostTagsRegEx != null ) {
      msg += ". Можливо, тут потрібно присвійний займенник «їхній» або нормативна форма р.в. «них»?";
      try {
        String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
        String[] synthesized = synthesizer.synthesize(new AnalyzedToken("їхній", "adj:m:v_naz:&pron:pos", "їхній"), newYihPostag, true);
        suggestions.addAll( Arrays.asList(synthesized) );
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    else if( (tokenString.equals("його") || tokenString.equals("її")) && requiredPostTagsRegEx != null ) {
      String repl = tokenString.equals("його") ? "нього" : "неї";
      msg += ". Можливо, тут потрібно присвійний займенник «" + repl + "»?";
      try {
        String newYihPostag = "adj:p" + requiredPostTagsRegEx + ".*";
        String[] synthesized = synthesizer.synthesize(new AnalyzedToken("їхній", "adj:m:v_naz:&pron:pos", "їхній"), newYihPostag, true);
        suggestions.addAll( Arrays.asList(synthesized) );
        suggestions.add(repl);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    else if( state.prepTokenReadings.getCleanToken().equalsIgnoreCase("о") ) {
      for(AnalyzedToken token: tokenReadings.getReadings()) {
        if( PosTagHelper.hasPosTag(token, NOUN_ANIM_V_NAZ_PATTERN) ) {
          msg += ". Можливо, тут «о» — це вигук і потрібно кличний відмінок?";
          try {
            String newPostag = token.getPOSTag().replace("v_naz", "v_kly");
            String[] synthesized = synthesizer.synthesize(token, newPostag, false);
            for (String string : synthesized) {
              if( ! string.equals(token.getToken()) && ! suggestions.contains(string) ) {
                suggestions.add( string );
              }
            }
            break;
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        }
      }
    }
    else if( PosTagHelper.hasPosTagStart(tokens[i-1], "adv")) {
      String mergedToken = state.prepTokenReadings.getCleanToken() + tokens[i-1].getCleanToken();
      List<AnalyzedTokenReadings> mergedTagged = ukrainian.getTagger().tag(Arrays.asList(mergedToken));
      if( PosTagHelper.hasPosTagStart(mergedTagged.get(0), "adv") ) {
        msg += ". Можливо, прийменник і прислівник мають бути одним словом?";
//        suggestions.add(mergedToken);
      }
      
    }

    RuleMatch potentialRuleMatch = new RuleMatch(this, sentence, tokenReadings.getStartPos(), tokenReadings.getEndPos(), msg, getShort());

    potentialRuleMatch.setSuggestedReplacements(suggestions);

    return potentialRuleMatch;
  }

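  /**
   * Returns the reading tagged as part of a multiword expression (POS tag starting with "<",
   * e.g. for «згідно з»), or null if there is none.
   */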
  @Nullable
  private static AnalyzedToken getMultiwordToken(AnalyzedTokenReadings analyzedTokenReadings) {
      for(AnalyzedToken analyzedToken: analyzedTokenReadings) {
        String posTag = analyzedToken.getPOSTag();
        if( posTag != null && posTag.startsWith("<") )
          return analyzedToken;
      }
      return null;
  }

}