All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.languagetool.rules.uk.UkrainianWordRepeatRule Maven / Gradle / Ivy

The newest version!
package org.languagetool.rules.uk;

import java.util.*;
import java.util.regex.Pattern;

import org.languagetool.*;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.WordRepeatRule;
import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.PosTagHelper;

/**
 * @since 2.9
 */
public class UkrainianWordRepeatRule extends WordRepeatRule {
  private static final Pattern DATE_TIME_NUM_PATTERN = Pattern.compile("date|time|number.*");
  private static final HashSet REPEAT_ALLOWED_SET = new HashSet<>(
      Arrays.asList("ст.")
  );
  private static final HashSet REPEAT_ALLOWED_CAPS_SET = new HashSet<>(
      Arrays.asList("Джей", "Бі", "Сі", "Ла")
  );

  public UkrainianWordRepeatRule(ResourceBundle messages, Language language) {
    super(messages, language);
  }

  @Override
  public String getId() {
    return "UKRAINIAN_WORD_REPEAT_RULE";
  }

  @Override
  public boolean ignore(AnalyzedTokenReadings[] tokens, int position) {
    AnalyzedTokenReadings analyzedTokenReadings = tokens[position];
    String token = analyzedTokenReadings.getToken();

    // від добра добра не шукають
    if( position > 2 
        && token.equals("добра")
        && tokens[position-2].getToken().equalsIgnoreCase("від") )
      return true;

    // Тому що що?
    if( position > 1 
        && token.equals("що")
        && tokens[position-2].getToken().equalsIgnoreCase("тому") )
      return true;

    // ні так, ні ні
    if( position > 3
        && token.equals("ні")
        && tokens[position-2].getToken().equals(",")
        && tokens[position-3].getToken().equalsIgnoreCase("так") )
      return true;

    if( REPEAT_ALLOWED_SET.contains(token.toLowerCase()) )
      return true;

    if( REPEAT_ALLOWED_CAPS_SET.contains(token) )
      return true;
    
    if( PosTagHelper.hasPosTag(analyzedTokenReadings, DATE_TIME_NUM_PATTERN) )
      return true;
    
    for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) {
      String posTag = analyzedToken.getPOSTag();
      if( posTag != null ) {
        if ( ! isInitial(analyzedToken, tokens, position)
//            && ! posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME)
            && ! posTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) )
          return false;
      }
    }
    return true;
  }

  private boolean isInitial(AnalyzedToken analyzedToken, AnalyzedTokenReadings[] tokens, int position) {
    return analyzedToken.getPOSTag().contains(IPOSTag.abbr.getText())
        || (analyzedToken.getToken().length() == 1 
        && Character.isUpperCase(analyzedToken.getToken().charAt(0))
        && position < tokens.length-1 && tokens[position+1].getToken().equals("."));
  }

  @Override
  protected RuleMatch createRuleMatch(String prevToken, String token, int prevPos, int pos, String msg, AnalyzedSentence sentence) {
    boolean doubleI = prevToken.equals("І") && token.equals("і");
    if( doubleI ) {
      msg += " або, можливо, перша І має бути латинською.";
    }
    
    RuleMatch ruleMatch = super.createRuleMatch(prevToken, token, prevPos, pos, msg, sentence);

    if( doubleI ) {
      List replacements = new ArrayList<>(ruleMatch.getSuggestedReplacements());
      replacements.add("I і");
      ruleMatch.setSuggestedReplacements(replacements);
    }
    return ruleMatch;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy