org.apache.lucene.analysis.hunspell.AffixCondition Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers-common Show documentation
Additional Analyzers
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;

import java.util.regex.PatternSyntaxException;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

/**
 * Checks the "condition" part of an affix definition, as in
 *
 * <pre>PFX flag stripping prefix [condition [morphological_fields...]]</pre>
 *
 * <p>A condition is a sequence of "char patterns": a single character, a dot, or a bracketed
 * {@code [...]} group, each of which stands for exactly one character of the word. For prefixes
 * the condition is aligned with the start of the original word (strip + stem), for suffixes with
 * its end (stem + strip). A dash has no special meaning inside a condition.
 */
abstract class AffixCondition {
  /** The key returned by {@link #uniqueKey} for conditions that need no check at all. */
  public static final String ALWAYS_TRUE_KEY = ".*";

  /** A condition that accepts every stem. */
  public static final AffixCondition ALWAYS_TRUE = new AffixCondition() {
    @Override
    public boolean acceptsStem(char[] word, int offset, int length) {
      return true;
    }
  };

  /** A condition that rejects every stem. */
  public static final AffixCondition ALWAYS_FALSE = new AffixCondition() {
    @Override
    public boolean acceptsStem(char[] word, int offset, int length) {
      return false;
    }
  };

  /**
   * Convenience overload of {@link #acceptsStem(char[], int, int)} that checks a whole string.
   *
   * @param stem the stem to check
   * @return whether the stem satisfies this condition
   */
  public boolean acceptsStem(String stem) {
    return acceptsStem(stem.toCharArray(), 0, stem.length());
  }

  /**
   * @param word the buffer holding the stem
   * @param offset the index of the first stem character in {@code word}
   * @param length the number of stem characters
   * @return whether the given word matches this condition as a stem with both "strip" and "affix"
   *     removed
   */
  public abstract boolean acceptsStem(char[] word, int offset, int length);

  /**
   * @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
   *     that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
   */
  public static String uniqueKey(AffixKind kind, String strip, String condition) {
    // Parentheses only spell out the existing precedence: the isRegexp check guards the
    // suffix clause alone.
    if (".".equals(condition)
        || (kind == PREFIX && strip.startsWith(condition))
        || (kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition))) {
      return ALWAYS_TRUE_KEY;
    }
    return condition + " " + kind + " " + strip;
  }

  /**
   * Analyzes the given affix kind, strip and condition and returns an object able to efficiently
   * check that condition.
   *
   * <p>The part of the condition that overlaps the strip is verified right here, once; only the
   * remainder (if any) is left to be checked against each stem.
   *
   * @param kind whether the affix is a prefix or a suffix
   * @param strip the characters the affix removes from the word (may be empty)
   * @param condition the condition text from the affix file
   * @param line the source line, used only in the error message
   * @return the compiled condition, {@link #ALWAYS_TRUE} or {@link #ALWAYS_FALSE} when the outcome
   *     does not depend on the stem
   * @throws IllegalArgumentException if the condition cannot be processed for any reason other
   *     than a {@link PatternSyntaxException} (which yields {@link #ALWAYS_FALSE})
   */
  public static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
    if (!isRegexp(condition)) {
      // Plain text: the condition must contain the strip at the word edge; what is left of it
      // has to be matched literally against the stem.
      if (kind == SUFFIX && condition.endsWith(strip)) {
        return substringCondition(
            kind, condition.substring(0, condition.length() - strip.length()));
      }
      if (kind == PREFIX && condition.startsWith(strip)) {
        return substringCondition(kind, condition.substring(strip.length()));
      }
      return ALWAYS_FALSE;
    }

    int lastBracket = condition.lastIndexOf('[');
    if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
      // unclosed [ is tolerated by Hunspell and occurs in some dictionaries
      condition = condition + "]";
    }

    try {
      int conditionChars = countCharPatterns(condition);
      int stemChars = conditionChars - strip.length();
      if (stemChars <= 0) {
        // The whole condition falls inside the strip, so the stem is irrelevant. A prefix
        // condition is anchored at the start of the strip, a suffix condition at its end.
        String regex = kind == PREFIX ? condition + ".*" : ".*" + condition;
        return stripMatches(strip, regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
      }

      if (kind == PREFIX) {
        // The leading char patterns cover the strip, the trailing ones apply to the stem.
        int split = skipCharPatterns(condition, strip.length());
        if (!stripMatches(strip, condition.substring(0, split))) {
          return ALWAYS_FALSE;
        }
        return regexpCondition(kind, condition.substring(split), stemChars);
      }

      // Suffix: the leading char patterns apply to the stem, the trailing ones cover the strip.
      int split = skipCharPatterns(condition, stemChars);
      if (!stripMatches(strip, condition.substring(split))) {
        return ALWAYS_FALSE;
      }
      return regexpCondition(kind, condition.substring(0, split), stemChars);
    } catch (
        @SuppressWarnings("unused")
        PatternSyntaxException e) {
      // a condition that is not a valid pattern can never be satisfied
      return ALWAYS_FALSE;
    } catch (Throwable e) {
      // deliberately broad: any failure is reported together with the offending line
      throw new IllegalArgumentException("On line: " + line, e);
    }
  }

  /**
   * Checks the strip against a part of the condition, treating dashes literally, exactly as
   * {@link #regexpCondition} does for the stem part.
   */
  private static boolean stripMatches(String strip, String pattern) {
    return strip.matches(escapeDash(pattern));
  }

  /**
   * @param condition the condition text
   * @param count the number of char patterns to skip from the beginning
   * @return the index in {@code condition} right after the first {@code count} char patterns
   */
  public static int skipCharPatterns(String condition, int count) {
    int pos = 0;
    for (int i = 0; i < count; i++) {
      pos = skipCharPattern(condition, pos);
    }
    return pos;
  }

  /**
   * @param condition the condition text
   * @return the number of char patterns in the condition, where each bracketed group counts as one
   */
  public static int countCharPatterns(String condition) {
    int conditionChars = 0;
    for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) {
      conditionChars++;
    }
    return conditionChars;
  }

  /**
   * @param condition the condition text
   * @param pos the index where a char pattern starts
   * @return the index right after the char pattern starting at {@code pos}: past the closing
   *     bracket for a {@code [...]} group, or past the single character otherwise
   * @throws AssertionError if a group opened at {@code pos} is never closed
   */
  public static int skipCharPattern(String condition, int pos) {
    if (condition.charAt(pos) == '[') {
      pos = condition.indexOf(']', pos + 1);
      if (pos < 0) {
        throw new AssertionError("Malformed condition " + condition);
      }
    }
    return pos + 1;
  }

  /**
   * @param condition the condition text
   * @return whether the condition contains a bracket, a dot or a dash and thus cannot be handled
   *     as a plain substring
   */
  public static boolean isRegexp(String condition) {
    return condition.contains("[") || condition.contains(".") || condition.contains("-");
  }

  /**
   * Creates a condition that compares the edge of the stem with a literal string.
   *
   * @param kind for suffixes the end of the stem is compared, otherwise its start
   * @param stemCondition the text the stem must end (or start) with
   * @return the condition object
   */
  public static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
    boolean forSuffix = kind == AffixKind.SUFFIX;
    int condLength = stemCondition.length();
    return new AffixCondition() {
      @Override
      public boolean acceptsStem(char[] word, int offset, int length) {
        if (length < condLength) {
          // the stem is too short to contain the required text
          return false;
        }
        int matchStart = forSuffix ? offset + length - condLength : offset;
        for (int i = 0; i < condLength; i++) {
          if (stemCondition.charAt(i) != word[matchStart + i]) {
            return false;
          }
        }
        return true;
      }
    };
  }

  /**
   * Creates a condition that runs an automaton built from the given pattern over the edge of the
   * stem.
   *
   * @param kind for suffixes the last {@code charCount} stem characters are checked, otherwise the
   *     first ones
   * @param condition the pattern to check; dashes in it are escaped before compilation
   * @param charCount the number of stem characters the pattern is run over
   * @return the condition object; it rejects stems shorter than {@code charCount}
   */
  public static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
    boolean forSuffix = kind == AffixKind.SUFFIX;
    CharacterRunAutomaton automaton =
        new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
    return new AffixCondition() {
      @Override
      public boolean acceptsStem(char[] word, int offset, int length) {
        return length >= charCount
            && automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
      }
    };
  }

  /**
   * Escapes every unescaped dash with a backslash: "dash hasn't got special meaning" in
   * conditions, so it must not be interpreted as a range operator.
   *
   * @param re the condition text
   * @return the text with each dash preceded by a backslash; sequences that already start with a
   *     backslash are copied unchanged
   */
  public static String escapeDash(String re) {
    if (!re.contains("-")) {
      return re;
    }

    // we have to be careful, even though dash doesn't have a special meaning,
    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
    StringBuilder escaped = new StringBuilder();
    for (int i = 0; i < re.length(); i++) {
      char c = re.charAt(i);
      if (c == '-') {
        escaped.append("\\-");
      } else {
        escaped.append(c);
        if (c == '\\' && i + 1 < re.length()) {
          // copy the escaped character as is and step over it
          escaped.append(re.charAt(i + 1));
          i++;
        }
      }
    }
    return escaped.toString();
  }
}