All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.analysis.hunspell.Dictionary Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;


import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * In-memory structure for the dictionary (.dic) and affix (.aff)
 * data of a hunspell dictionary.
 */
public class Dictionary {

  static final char[] NOFLAGS = new char[0];
  
  private static final String ALIAS_KEY = "AF";
  private static final String MORPH_ALIAS_KEY = "AM";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";
  private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
  private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
  private static final String IGNORE_KEY = "IGNORE";
  private static final String ICONV_KEY = "ICONV";
  private static final String OCONV_KEY = "OCONV";
  private static final String FULLSTRIP_KEY = "FULLSTRIP";
  private static final String LANG_KEY = "LANG";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
  private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";

  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";
  
  // TODO: really for suffixes we should reverse the automaton and run them backwards
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  FST prefixes;
  FST suffixes;
  
  // all condition checks used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.
  ArrayList patterns = new ArrayList<>();
  
  // the entries in the .dic file, mapping to their set of flags.
  // the fst output is the ordinal list for flagLookup
  FST words;
  // the list of unique flagsets (wordforms). theoretically huge, but practically
  // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
  BytesRefHash flagLookup = new BytesRefHash();
  
  // the list of unique strip affixes.
  char[] stripData;
  int[] stripOffsets;
  
  // 8 bytes per affix
  byte[] affixData = new byte[64];
  private int currentAffix = 0;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy

  // AF entries
  private String[] aliases;
  private int aliasCount = 0;
  
  // AM entries
  private String[] morphAliases;
  private int morphAliasCount = 0;
  
  // st: morphological entries (either directly, or aliased from AM)
  private String[] stemExceptions = new String[8];
  private int stemExceptionCount = 0;
  // we set this during sorting, so we know to add an extra FST output.
  // when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions
  boolean hasStemExceptions;
  
  private final Path tempDir = OfflineSorter.getDefaultTempDir(); // TODO: make this configurable?
  
  boolean ignoreCase;
  boolean complexPrefixes;
  boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
  
  int circumfix = -1; // circumfix flag, or -1 if one is not defined
  int keepcase = -1;  // keepcase flag, or -1 if one is not defined
  int needaffix = -1; // needaffix flag, or -1 if one is not defined
  int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
  
  // ignored characters (dictionary, affix, inputs)
  private char[] ignore;
  
  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
  FST iconv;
  FST oconv;
  
  boolean needsInputCleaning;
  boolean needsOutputCleaning;
  
  // true if we can strip suffixes "down to nothing"
  boolean fullStrip;
  
  // language declaration of the dictionary
  String language;
  // true if case algorithms should use alternate (Turkish/Azeri) mapping
  boolean alternateCasing;
  
  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
    this(affix, Collections.singletonList(dictionary), false);
  }

  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, List dictionaries, boolean ignoreCase) throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    Path aff = Files.createTempFile(tempDir, "affix", "aff");
    OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    boolean success = false;
    try {
      // copy contents of affix stream to temp file
      final byte [] buffer = new byte [1024 * 8];
      int len;
      while ((len = affix.read(buffer)) > 0) {
        out.write(buffer, 0, len);
      }
      out.close();
      
      // pass 1: get encoding
      aff1 = new BufferedInputStream(Files.newInputStream(aff));
      String encoding = getDictionaryEncoding(aff1);
      
      // pass 2: parse affixes
      CharsetDecoder decoder = getJavaEncoding(encoding);
      aff2 = new BufferedInputStream(Files.newInputStream(aff));
      readAffixFile(aff2, decoder);
      
      // read dictionary entries
      IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
      Builder b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
      readDictionaryFiles(dictionaries, decoder, b);
      words = b.finish();
      aliases = null; // no longer needed
      morphAliases = null; // no longer needed
      success = true;
    } finally {
      IOUtils.closeWhileHandlingException(out, aff1, aff2);
      if (success) {
        Files.delete(aff);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(aff);
      }
    }
  }

  /**
   * Looks up Hunspell word forms from the dictionary
   */
  IntsRef lookupWord(char word[], int offset, int length) {
    return lookup(words, word, offset, length);
  }

  // only for testing
  IntsRef lookupPrefix(char word[], int offset, int length) {
    return lookup(prefixes, word, offset, length);
  }

  // only for testing
  IntsRef lookupSuffix(char word[], int offset, int length) {
    return lookup(suffixes, word, offset, length);
  }
  
  IntsRef lookup(FST fst, char word[], int offset, int length) {
    if (fst == null) {
      return null;
    }
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc arc = fst.getFirstArc(new FST.Arc());
    // Accumulate output as we go
    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
    IntsRef output = NO_OUTPUT;
    
    int l = offset + length;
    try {
      for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
        cp = Character.codePointAt(word, i, l);
        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
          return null;
        } else if (arc.output != NO_OUTPUT) {
          output = fst.outputs.add(output, arc.output);
        }
      }
      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
        return null;
      } else if (arc.output != NO_OUTPUT) {
        return fst.outputs.add(output, arc.output);
      } else {
        return output;
      }
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  /**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
    TreeMap> prefixes = new TreeMap<>();
    TreeMap> suffixes = new TreeMap<>();
    Map seenPatterns = new HashMap<>();
    
    // zero condition -> 0 ord
    seenPatterns.put(".*", 0);
    patterns.add(null);
    
    // zero strip -> 0 ord
    Map seenStrips = new LinkedHashMap<>();
    seenStrips.put("", 0);

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      // ignore any BOM marker on first line
      if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
        line = line.substring(1);
      }
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(MORPH_ALIAS_KEY)) {
        parseMorphAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
      } else if (line.startsWith(SUFFIX_KEY)) {
        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line);
      } else if (line.equals(COMPLEXPREFIXES_KEY)) {
        complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
      } else if (line.startsWith(CIRCUMFIX_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
        }
        circumfix = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(KEEPCASE_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
        }
        keepcase = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
        }
        needaffix = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
        }
        onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(IGNORE_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
        }
        ignore = parts[1].toCharArray();
        Arrays.sort(ignore);
        needsInputCleaning = true;
      } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
        String parts[] = line.split("\\s+");
        String type = parts[0];
        if (parts.length != 2) {
          throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
        }
        int num = Integer.parseInt(parts[1]);
        FST res = parseConversions(reader, num);
        if (type.equals("ICONV")) {
          iconv = res;
          needsInputCleaning |= iconv != null;
        } else {
          oconv = res;
          needsOutputCleaning |= oconv != null;
        }
      } else if (line.startsWith(FULLSTRIP_KEY)) {
        fullStrip = true;
      } else if (line.startsWith(LANG_KEY)) {
        language = line.substring(LANG_KEY.length()).trim();
        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
      }
    }
    
    this.prefixes = affixFST(prefixes);
    this.suffixes = affixFST(suffixes);
    
    int totalChars = 0;
    for (String strip : seenStrips.keySet()) {
      totalChars += strip.length();
    }
    stripData = new char[totalChars];
    stripOffsets = new int[seenStrips.size()+1];
    int currentOffset = 0;
    int currentIndex = 0;
    for (String strip : seenStrips.keySet()) {
      stripOffsets[currentIndex++] = currentOffset;
      strip.getChars(0, strip.length(), stripData, currentOffset);
      currentOffset += strip.length();
    }
    assert currentIndex == seenStrips.size();
    stripOffsets[currentIndex] = currentOffset;
  }
  
  private FST affixFST(TreeMap> affixes) throws IOException {
    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
    IntsRefBuilder scratch = new IntsRefBuilder();
    for (Map.Entry> entry : affixes.entrySet()) {
      Util.toUTF32(entry.getKey(), scratch);
      List entries = entry.getValue();
      IntsRef output = new IntsRef(entries.size());
      for (Integer c : entries) {
        output.ints[output.length++] = c;
      }
      builder.add(scratch.get(), output);
    }
    return builder.finish();
  }
  
  static String escapeDash(String re) {
    // we have to be careful, even though dash doesn't have a special meaning,
    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
    StringBuilder escaped = new StringBuilder();
    for (int i = 0; i < re.length(); i++) {
      char c = re.charAt(i);
      if (c == '-') {
        escaped.append("\\-");
      } else {
        escaped.append(c);
        if (c == '\\' && i + 1 < re.length()) {
          escaped.append(re.charAt(i+1));
          i++;
        }
      }
    }
    return escaped.toString();
  }

  /**
   * Parses a specific affix rule putting the result into the provided affix map
   * 
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(TreeMap> affixes,
                          String header,
                          LineNumberReader reader,
                          String conditionPattern,
                          Map seenPatterns,
                          Map seenStrips) throws IOException, ParseException {
    
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
    
    int numLines = Integer.parseInt(args[3]);
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
    
    for (int i = 0; i < numLines; i++) {
      assert affixWriter.getPosition() == currentAffix << 3;
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

      // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
      // condition is optional
      if (ruleArgs.length < 4) {
          throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
      }
      
      char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
      String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
      String affixArg = ruleArgs[3];
      char appendFlags[] = null;
      
      // first: parse continuation classes out of affix
      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        String flagPart = affixArg.substring(flagSep + 1);
        affixArg = affixArg.substring(0, flagSep);

        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        } 
        
        appendFlags = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(appendFlags);
        twoStageAffix = true;
      }
      // zero affix -> empty string
      if ("0".equals(affixArg)) {
        affixArg = "";
      }
      
      String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
      // at least the gascon affix file has this issue
      if (condition.startsWith("[") && condition.indexOf(']') == -1) {
        condition = condition + "]";
      }
      // "dash hasn't got special meaning" (we must escape it)
      if (condition.indexOf('-') >= 0) {
        condition = escapeDash(condition);
      }

      final String regex;
      if (".".equals(condition)) {
        regex = ".*"; // Zero condition is indicated by dot
      } else if (condition.equals(strip)) {
        regex = ".*"; // TODO: optimize this better:
                      // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                      // but this is complicated...
      } else {
        regex = String.format(Locale.ROOT, conditionPattern, condition);
      }
      
      // deduplicate patterns
      Integer patternIndex = seenPatterns.get(regex);
      if (patternIndex == null) {
        patternIndex = patterns.size();
        if (patternIndex > Short.MAX_VALUE) {
          throw new UnsupportedOperationException("Too many patterns, please report this to [email protected]");          
        }
        seenPatterns.put(regex, patternIndex);
        CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
        patterns.add(pattern);
      }
      
      Integer stripOrd = seenStrips.get(strip);
      if (stripOrd == null) {
        stripOrd = seenStrips.size();
        seenStrips.put(strip, stripOrd);
        if (stripOrd > Character.MAX_VALUE) {
          throw new UnsupportedOperationException("Too many unique strips, please report this to [email protected]");
        }
      }

      if (appendFlags == null) {
        appendFlags = NOFLAGS;
      }
      
      encodeFlags(scratch, appendFlags);
      int appendFlagsOrd = flagLookup.add(scratch.get());
      if (appendFlagsOrd < 0) {
        // already exists in our hash
        appendFlagsOrd = (-appendFlagsOrd)-1;
      } else if (appendFlagsOrd > Short.MAX_VALUE) {
        // this limit is probably flexible, but it's a good sanity check too
        throw new UnsupportedOperationException("Too many unique append flags, please report this to [email protected]");
      }
      
      affixWriter.writeShort((short)flag);
      affixWriter.writeShort((short)stripOrd.intValue());
      // encode crossProduct into patternIndex
      int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
      affixWriter.writeShort((short)patternOrd);
      affixWriter.writeShort((short)appendFlagsOrd);
      
      if (needsInputCleaning) {
        CharSequence cleaned = cleanInput(affixArg, sb);
        affixArg = cleaned.toString();
      }
      
      if (isSuffix) {
        affixArg = new StringBuilder(affixArg).reverse().toString();
      }
      
      List list = affixes.get(affixArg);
      if (list == null) {
        list = new ArrayList<>();
        affixes.put(affixArg, list);
      }
      list.add(currentAffix);
      currentAffix++;
    }
  }
  
  private FST parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
    Map mappings = new TreeMap<>();
    
    for (int i = 0; i < num; i++) {
      String line = reader.readLine();
      String parts[] = line.split("\\s+");
      if (parts.length != 3) {
        throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
      }
      if (mappings.put(parts[1], parts[2]) != null) {
        throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
      }
    }
    
    Outputs outputs = CharSequenceOutputs.getSingleton();
    Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry entry : mappings.entrySet()) {
      Util.toUTF16(entry.getKey(), scratchInts);
      builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
    }
    
    return builder.finish();
  }
  
  /** pattern accepts optional BOM + SET + any whitespace */
  final static Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");

  /**
   * Parses the encoding specified in the affix file readable through the provided InputStream
   *
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET }
   */
  static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
    final StringBuilder encoding = new StringBuilder();
    for (;;) {
      encoding.setLength(0);
      int ch;
      while ((ch = affix.read()) >= 0) {
        if (ch == '\n') {
          break;
        }
        if (ch != '\r') {
          encoding.append((char)ch);
        }
      }
      if (
          encoding.length() == 0 || encoding.charAt(0) == '#' ||
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0
      ) {
        if (ch < 0) {
          throw new ParseException("Unexpected end of affix file.", 0);
        }
        continue;
      }
      Matcher matcher = ENCODING_PATTERN.matcher(encoding);
      if (matcher.find()) {
        int last = matcher.end();
        return encoding.substring(last).trim();
      }
    }
  }

  static final Map CHARSET_ALIASES;
  static {
    Map m = new HashMap<>();
    m.put("microsoft-cp1251", "windows-1251");
    m.put("TIS620-2533", "TIS-620");
    CHARSET_ALIASES = Collections.unmodifiableMap(m);
  }
  
  /**
   * Retrieves the CharsetDecoder for the given encoding.  Note, This isn't perfect as I think ISCII-DEVANAGARI and
   * MICROSOFT-CP1251 etc are allowed...
   *
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharSetDecoder for the given encoding
   */
  private CharsetDecoder getJavaEncoding(String encoding) {
    if ("ISO8859-14".equals(encoding)) {
      return new ISO8859_14Decoder();
    }
    String canon = CHARSET_ALIASES.get(encoding);
    if (canon != null) {
      encoding = canon;
    }
    Charset charset = Charset.forName(encoding);
    return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
  }

  /**
   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
   *
   * @param flagLine Line containing the flag information
   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
   */
  static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
    String parts[] = flagLine.split("\\s+");
    if (parts.length != 2) {
      throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
    }
    String flagType = parts[1];

    if (NUM_FLAG_TYPE.equals(flagType)) {
      return new NumFlagParsingStrategy();
    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
      return new SimpleFlagParsingStrategy();
    } else if (LONG_FLAG_TYPE.equals(flagType)) {
      return new DoubleASCIIFlagParsingStrategy();
    }

    throw new IllegalArgumentException("Unknown flag type: " + flagType);
  }

  final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
  final char MORPH_SEPARATOR = 0x1e; // separator for boundary of entry (may be followed by morph data)
  
  String unescapeEntry(String entry) {
    StringBuilder sb = new StringBuilder();
    int end = morphBoundary(entry);
    for (int i = 0; i < end; i++) {
      char ch = entry.charAt(i);
      if (ch == '\\' && i+1 < entry.length()) {
        sb.append(entry.charAt(i+1));
        i++;
      } else if (ch == '/') {
        sb.append(FLAG_SEPARATOR);
      } else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) {
        // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
      } else {
        sb.append(ch);
      }
    }
    sb.append(MORPH_SEPARATOR);
    if (end < entry.length()) {
      for (int i = end; i < entry.length(); i++) {
        char c = entry.charAt(i);
        if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) {
          // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
        } else {
          sb.append(c);
        }
      }
    }
    return sb.toString();
  }
  
  static int morphBoundary(String line) {
    int end = indexOfSpaceOrTab(line, 0);
    if (end == -1) {
      return line.length();
    }
    while (end >= 0 && end < line.length()) {
      if (line.charAt(end) == '\t' ||
          end+3 < line.length() && 
          Character.isLetter(line.charAt(end+1)) && 
          Character.isLetter(line.charAt(end+2)) &&
          line.charAt(end+3) == ':') {
        break;
      }
      end = indexOfSpaceOrTab(line, end+1);
    }
    if (end == -1) {
      return line.length();
    }
    return end;
  }
  
  static int indexOfSpaceOrTab(String text, int start) {
    int pos1 = text.indexOf('\t', start);
    int pos2 = text.indexOf(' ', start);
    if (pos1 >= 0 && pos2 >= 0) {
      return Math.min(pos1, pos2);
    } else {
      return Math.max(pos1, pos2);
    }
  }
  
  /**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List dictionaries, CharsetDecoder decoder, Builder words) throws IOException {
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    
    StringBuilder sb = new StringBuilder();
    
    Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat");
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
      for (InputStream dictionary : dictionaries) {
        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
        String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
        
        while ((line = lines.readLine()) != null) {
          // wild and unpredictable code comment rules
          if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
            continue;
          }
          line = unescapeEntry(line);
          // if we havent seen any stem exceptions, try to parse one
          if (hasStemExceptions == false) {
            int morphStart = line.indexOf(MORPH_SEPARATOR);
            if (morphStart >= 0 && morphStart < line.length()) {
              hasStemExceptions = parseStemException(line.substring(morphStart+1)) != null;
            }
          }
          if (needsInputCleaning) {
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
              flagSep = line.indexOf(MORPH_SEPARATOR);
            }
            if (flagSep == -1) {
              CharSequence cleansed = cleanInput(line, sb);
              writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
            } else {
              String text = line.substring(0, flagSep);
              CharSequence cleansed = cleanInput(text, sb);
              if (cleansed != sb) {
                sb.setLength(0);
                sb.append(cleansed);
              }
              sb.append(line.substring(flagSep));
              writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
            }
          } else {
            writer.write(line.getBytes(StandardCharsets.UTF_8));
          }
        }
      }
    }
    Path sorted = Files.createTempFile(tempDir, "sorted", "dat");
    
    OfflineSorter sorter = new OfflineSorter(new Comparator() {
      BytesRef scratch1 = new BytesRef();
      BytesRef scratch2 = new BytesRef();
      
      @Override
      public int compare(BytesRef o1, BytesRef o2) {
        scratch1.bytes = o1.bytes;
        scratch1.offset = o1.offset;
        scratch1.length = o1.length;
        
        for (int i = scratch1.length - 1; i >= 0; i--) {
          if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
            scratch1.length = i;
            break;
          }
        }
        
        scratch2.bytes = o2.bytes;
        scratch2.offset = o2.offset;
        scratch2.length = o2.length;
        
        for (int i = scratch2.length - 1; i >= 0; i--) {
          if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
            scratch2.length = i;
            break;
          }
        }
        
        int cmp = scratch1.compareTo(scratch2);
        if (cmp == 0) {
          // tie break on whole row
          return o1.compareTo(o2);
        } else {
          return cmp;
        }
      }
    });
    boolean success = false;
    try {
      sorter.sort(unsorted, sorted);
      success = true;
    } finally {
      if (success) {
        Files.delete(unsorted);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(unsorted);
      }
    }
    
    boolean success2 = false;
    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    try {
      BytesRefBuilder scratchLine = new BytesRefBuilder();
    
      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently
    
      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();
    
      String line;
      while (reader.read(scratchLine)) {
        line = scratchLine.get().utf8ToString();
        String entry;
        char wordForm[];
        int end;

        int flagSep = line.indexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
          wordForm = NOFLAGS;
          end = line.indexOf(MORPH_SEPARATOR);
          entry = line.substring(0, end);
        } else {
          end = line.indexOf(MORPH_SEPARATOR);
          String flagPart = line.substring(flagSep + 1, end);
          if (aliasCount > 0) {
            flagPart = getAliasValue(Integer.parseInt(flagPart));
          } 
        
          wordForm = flagParsingStrategy.parseFlags(flagPart);
          Arrays.sort(wordForm);
          entry = line.substring(0, flagSep);
        }
        // we possibly have morphological data
        int stemExceptionID = 0;
        if (hasStemExceptions && end+1 < line.length()) {
          String stemException = parseStemException(line.substring(end+1));
          if (stemException != null) {
            if (stemExceptionCount == stemExceptions.length) {
              int newSize = ArrayUtil.oversize(stemExceptionCount+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
              stemExceptions = Arrays.copyOf(stemExceptions, newSize);
            }
            stemExceptionID = stemExceptionCount+1; // we use '0' to indicate no exception for the form
            stemExceptions[stemExceptionCount++] = stemException;
          }
        }

        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
        if (cmp < 0) {
          throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
          encodeFlags(flagsScratch, wordForm);
          int ord = flagLookup.add(flagsScratch.get());
          if (ord < 0) {
            // already exists in our hash
            ord = (-ord)-1;
          }
          // finalize current entry, and switch "current" if necessary
          if (cmp > 0 && currentEntry != null) {
            Util.toUTF32(currentEntry, scratchInts);
            words.add(scratchInts.get(), currentOrds.get());
          }
          // swap current
          if (cmp > 0 || currentEntry == null) {
            currentEntry = entry;
            currentOrds = new IntsRefBuilder(); // must be this way
          }
          if (hasStemExceptions) {
            currentOrds.append(ord);
            currentOrds.append(stemExceptionID);
          } else {
            currentOrds.append(ord);
          }
        }
      }
    
      // finalize last entry
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());
      success2 = true;
    } finally {
      IOUtils.closeWhileHandlingException(reader);
      if (success2) {
        Files.delete(sorted);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(sorted);
      }
    }
  }
  
  static char[] decodeFlags(BytesRef b) {
    if (b.length == 0) {
      return CharsRef.EMPTY_CHARS;
    }
    int len = b.length >>> 1;
    char flags[] = new char[len];
    int upto = 0;
    int end = b.offset + b.length;
    for (int i = b.offset; i < end; i += 2) {
      flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff));
    }
    return flags;
  }
  
  static void encodeFlags(BytesRefBuilder b, char flags[]) {
    int len = flags.length << 1;
    b.grow(len);
    b.clear();
    for (int i = 0; i < flags.length; i++) {
      int flag = flags[i];
      b.append((byte) ((flag >> 8) & 0xff));
      b.append((byte) (flag & 0xff));
    }
  }

  private void parseAlias(String line) {
    String ruleArgs[] = line.split("\\s+");
    if (aliases == null) {
      //first line should be the aliases count
      final int count = Integer.parseInt(ruleArgs[1]);
      aliases = new String[count];
    } else {
      // an alias can map to no flags
      String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1];
      aliases[aliasCount++] = aliasValue;
    }
  }
  
  private String getAliasValue(int id) {
    try {
      return aliases[id - 1];
    } catch (IndexOutOfBoundsException ex) {
      throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
    }
  }
  
  String getStemException(int id) {
    return stemExceptions[id-1];
  }
  
  private void parseMorphAlias(String line) {
    if (morphAliases == null) {
      //first line should be the aliases count
      final int count = Integer.parseInt(line.substring(3));
      morphAliases = new String[count];
    } else {
      String arg = line.substring(2); // leave the space
      morphAliases[morphAliasCount++] = arg;
    }
  }
  
  private String parseStemException(String morphData) {
    // first see if it's an alias
    if (morphAliasCount > 0) {
      try {
        int alias = Integer.parseInt(morphData.trim());
        morphData = morphAliases[alias-1];
      } catch (NumberFormatException e) {  
        // fine
      }
    }
    // try to parse morph entry
    int index = morphData.indexOf(" st:");
    if (index < 0) {
      index = morphData.indexOf("\tst:");
    }
    if (index >= 0) {
      int endIndex = indexOfSpaceOrTab(morphData, index+1);
      if (endIndex < 0) {
        endIndex = morphData.length();
      }
      return morphData.substring(index+4, endIndex);
    }
    return null;
  }

  /**
   * Abstraction of the process of parsing flags taken from the affix and dic files
   */
  static abstract class FlagParsingStrategy {

    /**
     * Parses the given String into a single flag
     *
     * @param rawFlag String to parse into a flag
     * @return Parsed flag
     */
    char parseFlag(String rawFlag) {
      char flags[] = parseFlags(rawFlag);
      if (flags.length != 1) {
        throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
      }
      return flags[0];
    }

    /**
     * Parses the given String into multiple flags
     *
     * @param rawFlags String to parse into flags
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
  }

  /**
   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
   * Can be used with both the ASCII and UTF-8 flag types.
   */
  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
    @Override
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
   * of multiple flags, each number is separated by a comma.
   */
  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
    @Override
    public char[] parseFlags(String rawFlags) {
      String[] rawFlagParts = rawFlags.trim().split(",");
      char[] flags = new char[rawFlagParts.length];
      int upto = 0;
      
      for (int i = 0; i < rawFlagParts.length; i++) {
        // note, removing the trailing X/leading I for nepali... what is the rule here?! 
        String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
        // note, ignoring empty flags (this happens in danish, for example)
        if (replacement.isEmpty()) {
          continue;
        }
        flags[upto++] = (char) Integer.parseInt(replacement);
      }

      if (upto < flags.length) {
        flags = Arrays.copyOf(flags, upto);
      }
      return flags;
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
   * must be combined into a single character.
   */
  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

    @Override
    public char[] parseFlags(String rawFlags) {
      if (rawFlags.length() == 0) {
        return new char[0];
      }

      StringBuilder builder = new StringBuilder();
      if (rawFlags.length() % 2 == 1) {
        throw new IllegalArgumentException("Invalid flags (should be even number of characters): " + rawFlags);
      }
      for (int i = 0; i < rawFlags.length(); i+=2) {
        char f1 = rawFlags.charAt(i);
        char f2 = rawFlags.charAt(i+1);
        if (f1 >= 256 || f2 >= 256) {
          throw new IllegalArgumentException("Invalid flags (LONG flags must be double ASCII): " + rawFlags);
        }
        char combined = (char) (f1 << 8 | f2);
        builder.append(combined);
      }
      
      char flags[] = new char[builder.length()];
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
  }
  
  static boolean hasFlag(char flags[], char flag) {
    return Arrays.binarySearch(flags, flag) >= 0;
  }
  
  CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
    reuse.setLength(0);
    
    for (int i = 0; i < input.length(); i++) {
      char ch = input.charAt(i);
      
      if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
        continue;
      }
      
      if (ignoreCase && iconv == null) {
        // if we have no input conversion mappings, do this on-the-fly
        ch = caseFold(ch);
      }
      
      reuse.append(ch);
    }
    
    if (iconv != null) {
      try {
        applyMappings(iconv, reuse);
      } catch (IOException bogus) {
        throw new RuntimeException(bogus);
      }
      if (ignoreCase) {
        for (int i = 0; i < reuse.length(); i++) {
          reuse.setCharAt(i, caseFold(reuse.charAt(i)));
        }
      }
    }
    
    return reuse;
  }
  
  /** folds single character (according to LANG if present) */
  char caseFold(char c) {
    if (alternateCasing) {
      if (c == 'I') {
        return 'ı';
      } else if (c == 'İ') {
        return 'i';
      } else {
        return Character.toLowerCase(c);
      }
    } else {
      return Character.toLowerCase(c);
    }
  }
  
  // TODO: this could be more efficient!
  static void applyMappings(FST fst, StringBuilder sb) throws IOException {
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc firstArc = fst.getFirstArc(new FST.Arc());
    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
    
    // temporary stuff
    final FST.Arc arc = new FST.Arc<>();
    int longestMatch;
    CharsRef longestOutput;
    
    for (int i = 0; i < sb.length(); i++) {
      arc.copyFrom(firstArc);
      CharsRef output = NO_OUTPUT;
      longestMatch = -1;
      longestOutput = null;
      
      for (int j = i; j < sb.length(); j++) {
        char ch = sb.charAt(j);
        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
          break;
        } else {
          output = fst.outputs.add(output, arc.output);
        }
        if (arc.isFinal()) {
          longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
          longestMatch = j;
        }
      }
      
      if (longestMatch >= 0) {
        sb.delete(i, longestMatch+1);
        sb.insert(i, longestOutput);
        i += (longestOutput.length - 1);
      }
    }
  }
  
  /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
  public boolean getIgnoreCase() {
    return ignoreCase;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy