org.apache.lucene.analysis.hunspell.HunspellDictionary Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analyzers Show documentation
Additional Analyzers
The newest version!
package org.apache.lucene.analysis.hunspell;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.util.Version;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

/**
 * In-memory structure for the dictionary (.dic) and affix (.aff)
 * data of a hunspell dictionary.
 */
public class HunspellDictionary {

  static final HunspellWord NOFLAGS = new HunspellWord();
  
  private static final String ALIAS_KEY = "AF";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";

  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";
  
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  private static final boolean IGNORE_CASE_DEFAULT = false;

  private CharArrayMap> words;
  private CharArrayMap> prefixes;
  private CharArrayMap> suffixes;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
  private boolean ignoreCase = IGNORE_CASE_DEFAULT;

  private final Version version;

  private String[] aliases;
  private int aliasCount = 0;

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
    this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
  }

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
    this(affix, Arrays.asList(dictionary), version, ignoreCase);
  }

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
    this.version = version;
    this.ignoreCase = ignoreCase;
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
    readAffixFile(affix, decoder);
    words = new CharArrayMap>(version, 65535 /* guess */, this.ignoreCase);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
    }
  }

  /**
   * Looks up HunspellWords that match the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellWords that match the generated String, or {@code null} if none are found
   */
  public List lookupWord(char word[], int offset, int length) {
    return words.get(word, offset, length);
  }

  /**
   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
   */
  public List lookupPrefix(char word[], int offset, int length) {
    return prefixes.get(word, offset, length);
  }

  /**
   * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
   */
  public List lookupSuffix(char word[], int offset, int length) {
    return suffixes.get(word, offset, length);
  }

  /**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
    prefixes = new CharArrayMap>(version, 8, ignoreCase);
    suffixes = new CharArrayMap>(version, 8, ignoreCase);
    
    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
      } else if (line.startsWith(SUFFIX_KEY)) {
        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line);
      }
    }
  }

  /**
   * Parses a specific affix rule putting the result into the provided affix map
   * 
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(CharArrayMap> affixes,
                          String header,
                          BufferedReader reader,
                          String conditionPattern) throws IOException {
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    
    int numLines = Integer.parseInt(args[3]);
    for (int i = 0; i < numLines; i++) {
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

      HunspellAffix affix = new HunspellAffix();
      
      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);

      String affixArg = ruleArgs[3];
      
      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        String flagPart = affixArg.substring(flagSep + 1);
        
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        } 
        
        char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(appendFlags);
        affix.setAppendFlags(appendFlags);
        affix.setAppend(affixArg.substring(0, flagSep));
      } else {
        affix.setAppend(affixArg);
      }

      String condition = ruleArgs[4];
      affix.setCondition(condition, String.format(conditionPattern, condition));
      affix.setCrossProduct(crossProduct);
      
      List list = affixes.get(affix.getAppend());
      if (list == null) {
        list = new ArrayList();
        affixes.put(affix.getAppend(), list);
      }
      
      list.add(affix);
    }
  }

  /**
   * Parses the encoding specificed in the affix file readable through the provided InputStream
   *
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET }
   */
  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
    final StringBuilder encoding = new StringBuilder();
    for (;;) {
      encoding.setLength(0);
      int ch;
      while ((ch = affix.read()) >= 0) {
        if (ch == '\n') {
          break;
        }
        if (ch != '\r') {
          encoding.append((char)ch);
        }
      }
      if (
          encoding.length() == 0 || encoding.charAt(0) == '#' ||
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0
      ) {
        if (ch < 0) {
          throw new ParseException("Unexpected end of affix file.", 0);
        }
        continue;
      }
      if ("SET ".equals(encoding.substring(0, 4))) {
        // cleanup the encoding string, too (whitespace)
        return encoding.substring(4).trim();
      }
      throw new ParseException("The first non-comment line in the affix file must "+
          "be a 'SET charset', was: '" + encoding +"'", 0);
    }
  }

  /**
   * Retrieves the CharsetDecoder for the given encoding.  Note, This isn't perfect as I think ISCII-DEVANAGARI and
   * MICROSOFT-CP1251 etc are allowed...
   *
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharSetDecoder for the given encoding
   */
  private CharsetDecoder getJavaEncoding(String encoding) {
    Charset charset = Charset.forName(encoding);
    return charset.newDecoder();
  }

  /**
   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definiton line taken from the affix file
   *
   * @param flagLine Line containing the flag information
   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definiton
   */
  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
    String flagType = flagLine.substring(5);

    if (NUM_FLAG_TYPE.equals(flagType)) {
      return new NumFlagParsingStrategy();
    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
      return new SimpleFlagParsingStrategy();
    } else if (LONG_FLAG_TYPE.equals(flagType)) {
      return new DoubleASCIIFlagParsingStrategy();
    }

    throw new IllegalArgumentException("Unknown flag type: " + flagType);
  }

  /**
   * Reads the dictionary file through the provided InputStream, building up the words map
   *
   * @param dictionary InputStream to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
    // TODO: don't create millions of strings.
    String line = reader.readLine(); // first line is number of entries
    int numEntries = Integer.parseInt(line);
    
    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    while ((line = reader.readLine()) != null) {
      String entry;
      HunspellWord wordForm;
      
      int flagSep = line.lastIndexOf('/');
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        entry = line;
      } else {
        // note, there can be comments (morph description) after a flag.
        // we should really look for any whitespace
        int end = line.indexOf('\t', flagSep);
        if (end == -1)
          end = line.length();
        
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        } 
        
        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
        Arrays.sort(wordForm.getFlags());
        entry = line.substring(0, flagSep);
        if(ignoreCase) {
          entry = entry.toLowerCase(Locale.ENGLISH);
        }
      }
      
      List entries = words.get(entry);
      if (entries == null) {
        entries = new ArrayList();
        words.put(entry, entries);
      }
      entries.add(wordForm);
    }
  }

  public Version getVersion() {
    return version;
  }

  private void parseAlias(String line) {
    String ruleArgs[] = line.split("\\s+");
    if (aliases == null) {
      //first line should be the aliases count
      final int count = Integer.parseInt(ruleArgs[1]);
      aliases = new String[count];
    } else {
      aliases[aliasCount++] = ruleArgs[1];
    }
  }
  
  private String getAliasValue(int id) {
    try {
      return aliases[id - 1];
    } catch (IndexOutOfBoundsException ex) {
      throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
    }
  }

  /**
   * Abstraction of the process of parsing flags taken from the affix and dic files
   */
  private static abstract class FlagParsingStrategy {

    /**
     * Parses the given String into a single flag
     *
     * @param rawFlag String to parse into a flag
     * @return Parsed flag
     */
    char parseFlag(String rawFlag) {
      return parseFlags(rawFlag)[0];
    }

    /**
     * Parses the given String into multiple flags
     *
     * @param rawFlags String to parse into flags
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
  }

  /**
   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
   * Can be used with both the ASCII and UTF-8 flag types.
   */
  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
   * of multiple flags, each number is separated by a comma.
   */
  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    public char[] parseFlags(String rawFlags) {
      String[] rawFlagParts = rawFlags.trim().split(",");
      char[] flags = new char[rawFlagParts.length];

      for (int i = 0; i < rawFlagParts.length; i++) {
        // note, removing the trailing X/leading I for nepali... what is the rule here?! 
        flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
      }

      return flags;
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
   * must be combined into a single character.
   *
   * TODO (rmuir) test
   */
  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

    /**
     * {@inheritDoc}
     */
    public char[] parseFlags(String rawFlags) {
      if (rawFlags.length() == 0) {
        return new char[0];
      }

      StringBuilder builder = new StringBuilder();
      for (int i = 0; i < rawFlags.length(); i+=2) {
        char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
        builder.append(cookedFlag);
      }
      
      char flags[] = new char[builder.length()];
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }
}