// morfologik.stemming.DictionaryMetadata (Maven / Gradle / Ivy)
//
// Go to download
package morfologik.stemming;

import static morfologik.stemming.DictionaryAttribute.*;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.file.Path;
import java.util.Collections;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;

/**
 * Description of attributes, their types and default values.
 */
public final class DictionaryMetadata {
  /**
   * Default attribute values.
   */
  private static Map DEFAULT_ATTRIBUTES = new DictionaryMetadataBuilder()
    .frequencyIncluded(false)
    .ignorePunctuation()
    .ignoreNumbers()
    .ignoreCamelCase()
    .ignoreAllUppercase()
    .ignoreDiacritics()
    .convertCase()
    .supportRunOnWords()
    .toMap();

  /**
   * Required attributes.
   */
  private static EnumSet REQUIRED_ATTRIBUTES = EnumSet.of(
      SEPARATOR,
      ENCODER,
      ENCODING);

  /**
   * A separator character between fields (stem, lemma, form). The character
   * must be within byte range (FSA uses bytes internally).
   */
  private byte separator;
  private char separatorChar;

  /**
   * Encoding used for converting bytes to characters and vice versa.
   */
  private String encoding;

  private Charset charset;
  private Locale locale = Locale.getDefault();

  /**
   * Replacement pairs for non-obvious candidate search in a speller dictionary.
   */
  private LinkedHashMap> replacementPairs = new LinkedHashMap<>();

  /**
   * Conversion pairs for input conversion, for example to replace ligatures.
   */
  private LinkedHashMap inputConversion = new LinkedHashMap<>();

  /**
   * Conversion pairs for output conversion, for example to replace ligatures.
   */
  private LinkedHashMap outputConversion = new LinkedHashMap<>();

  /**
   * Equivalent characters (treated similarly as equivalent chars with and without
   * diacritics). For example, Polish ł can be specified as equivalent to l.
   * 
   * This implements a feature similar to hunspell MAP in the affix file.
   */
  private LinkedHashMap> equivalentChars = new LinkedHashMap<>();

  /**
   * All attributes.
   */
  private final EnumMap attributes;

  /**
   * All "enabled" boolean attributes.
   */
  private final EnumMap boolAttributes;

  /**
   * Sequence encoder.
   */
  private EncoderType encoderType;

  /**
   * Expected metadata file extension.
   */
  public final static String METADATA_FILE_EXTENSION = "info";

  /**
   * @return Return all metadata attributes.  
   */
  public Map getAttributes() {
    return Collections.unmodifiableMap(attributes);
  }

  // Cached attrs.
  public String getEncoding()      { return encoding; }
  public byte getSeparator()       { return separator; }
  public Locale getLocale()        { return locale; }

  public LinkedHashMap getInputConversionPairs() { return inputConversion; }
  public LinkedHashMap getOutputConversionPairs() { return outputConversion; }

  public LinkedHashMap> getReplacementPairs() { return replacementPairs; }
  public LinkedHashMap> getEquivalentChars() { return equivalentChars; }

  // Dynamically fetched.
  public boolean isFrequencyIncluded()    { return boolAttributes.get(FREQUENCY_INCLUDED); }
  public boolean isIgnoringPunctuation()  { return boolAttributes.get(IGNORE_PUNCTUATION); }
  public boolean isIgnoringNumbers()      { return boolAttributes.get(IGNORE_NUMBERS); }
  public boolean isIgnoringCamelCase()    { return boolAttributes.get(IGNORE_CAMEL_CASE); }
  public boolean isIgnoringAllUppercase() { return boolAttributes.get(IGNORE_ALL_UPPERCASE); }
  public boolean isIgnoringDiacritics()   { return boolAttributes.get(IGNORE_DIACRITICS); }
  public boolean isConvertingCase()       { return boolAttributes.get(CONVERT_CASE); }
  public boolean isSupportingRunOnWords() { return boolAttributes.get(RUN_ON_WORDS); }

  /**
   * Create an instance from an attribute map.
   * 
   * @param attrs A set of {@link DictionaryAttribute} keys and their associated values.
   * @see DictionaryMetadataBuilder
   */
  public DictionaryMetadata(Map attrs) {
    this.boolAttributes = new EnumMap(DictionaryAttribute.class);
    this.attributes = new EnumMap(DictionaryAttribute.class);
    this.attributes.putAll(attrs);

    EnumMap attributeMap = new EnumMap(DEFAULT_ATTRIBUTES);
    attributeMap.putAll(attrs);

    // Convert some attrs from the map to local fields for performance reasons.
    EnumSet requiredAttributes = EnumSet.copyOf(REQUIRED_ATTRIBUTES);

    for (Map.Entry e : attributeMap.entrySet()) {
      requiredAttributes.remove(e.getKey());

      // Run validation and conversion on all of them.
      Object value = e.getKey().fromString(e.getValue());
      switch (e.getKey()) {
        case ENCODING:
          this.encoding = e.getValue();
          if (!Charset.isSupported(encoding)) {
            throw new IllegalArgumentException("Encoding not supported on this JVM: "
                + encoding);
          }
          this.charset = (Charset) value;
          break;
  
        case SEPARATOR:
          this.separatorChar = (Character) value;
          break;
  
        case LOCALE:
          this.locale = (Locale) value;
          break;
  
        case ENCODER:
          this.encoderType = (EncoderType) value;
          break;
  
        case INPUT_CONVERSION:
        {
          @SuppressWarnings("unchecked")
          LinkedHashMap gvalue = (LinkedHashMap) value;
          this.inputConversion = gvalue;
        }
        break;
  
        case OUTPUT_CONVERSION:
        {
          @SuppressWarnings("unchecked")
          LinkedHashMap gvalue = (LinkedHashMap) value;
          this.outputConversion = gvalue;
        }
        break;
  
        case REPLACEMENT_PAIRS:
        {
          @SuppressWarnings("unchecked")
          LinkedHashMap> gvalue = (LinkedHashMap>) value;
          this.replacementPairs = gvalue;
        }
        break;
  
        case EQUIVALENT_CHARS:
        {
          @SuppressWarnings("unchecked")
          LinkedHashMap> gvalue = (LinkedHashMap>) value;
          this.equivalentChars = gvalue;
        }
        break;
  
        case IGNORE_PUNCTUATION:
        case IGNORE_NUMBERS:
        case IGNORE_CAMEL_CASE:
        case IGNORE_ALL_UPPERCASE:
        case IGNORE_DIACRITICS:
        case CONVERT_CASE:
        case RUN_ON_WORDS:
        case FREQUENCY_INCLUDED:
          this.boolAttributes.put(e.getKey(), (Boolean) value);
          break;
  
        case AUTHOR:
        case LICENSE:
        case CREATION_DATE:
          // Just run validation.
          e.getKey().fromString(e.getValue());
          break;
  
        default:
          throw new RuntimeException("Unexpected code path (attribute should be handled but is not): " + e.getKey());
      }
    }

    if (!requiredAttributes.isEmpty()) {
      throw new IllegalArgumentException("At least one the required attributes was not provided: "
          + requiredAttributes.toString());
    }

    // Sanity check.
    CharsetEncoder encoder = getEncoder();
    try {
      ByteBuffer encoded = encoder.encode(CharBuffer.wrap(new char [] { separatorChar }));
      if (encoded.remaining() > 1) {
        throw new IllegalArgumentException("Separator character is not a single byte in encoding "
            + encoding + ": " + separatorChar);
      }
      this.separator = encoded.get();
    } catch (CharacterCodingException e) {
      throw new IllegalArgumentException("Separator character cannot be converted to a byte in "
          + encoding + ": " + separatorChar, e);
    }
  }

  /**
   * @return Returns a new {@link CharsetDecoder} for the {@link #encoding}. 
   */
  public CharsetDecoder getDecoder() {
    try {
      return charset.newDecoder()
              .onMalformedInput(CodingErrorAction.REPORT)
              .onUnmappableCharacter(CodingErrorAction.REPORT);
    } catch (UnsupportedCharsetException e) {
      throw new RuntimeException(
          "FSA's encoding charset is not supported: " + encoding);
    }
  }

  /**
   * @return Returns a new {@link CharsetEncoder} for the {@link #encoding}.
   */
  public CharsetEncoder getEncoder() {
    try {
      return charset.newEncoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    } catch (UnsupportedCharsetException e) {
      throw new RuntimeException(
          "FSA's encoding charset is not supported: " + encoding);
    }
  }

  /**
   * @return Return sequence encoder type.
   */
  public EncoderType getSequenceEncoderType() {
    return encoderType;
  }

  /**
   * @return Returns the {@link #separator} byte converted to a single
   *         char.
   * @throws RuntimeException
   *           if this conversion is for some reason impossible (the byte is a
   *           surrogate pair, FSA's {@link #encoding} is not available).
   */
  public char getSeparatorAsChar() {
    return separatorChar;
  }

  /**
   * @return A shortcut returning {@link DictionaryMetadataBuilder}.
   */
  public static DictionaryMetadataBuilder builder() {
    return new DictionaryMetadataBuilder();
  }
  
  /**
   * Returns the expected name of the metadata file, based on the name of the
   * dictionary file. The expected name is resolved by truncating any
   * file extension of name and appending
   * {@link DictionaryMetadata#METADATA_FILE_EXTENSION}.
   * 
   * @param dictionaryFile The name of the dictionary (*.dict) file. 
   * @return Returns the expected name of the metadata file. 
   */
  public static String getExpectedMetadataFileName(String dictionaryFile) {
    final int dotIndex = dictionaryFile.lastIndexOf('.');
    final String featuresName;
    if (dotIndex >= 0) {
      featuresName = dictionaryFile.substring(0, dotIndex) + "." + METADATA_FILE_EXTENSION;
    } else {
      featuresName = dictionaryFile + "." + METADATA_FILE_EXTENSION;
    }
  
    return featuresName;
  }

  /**
   * @param dictionary The location of the dictionary file.
   * @return Returns the expected location of a metadata file.
   */
  public static Path getExpectedMetadataLocation(Path dictionary) {
    return dictionary.resolveSibling(
        getExpectedMetadataFileName(dictionary.getFileName().toString()));
  }

  /**
   * Read dictionary metadata from a property file (stream).
   * 
   * @param metadataStream The stream with metadata. 
   * @return Returns {@link DictionaryMetadata} read from a the stream (property file).
   * @throws IOException Thrown if an I/O exception occurs. 
   */
  public static DictionaryMetadata read(InputStream metadataStream) throws IOException {
    Map map = new HashMap();
    final Properties properties = new Properties();
    properties.load(new InputStreamReader(metadataStream, "UTF-8"));

    // Handle back-compatibility for encoder specification.
    if (!properties.containsKey(DictionaryAttribute.ENCODER.propertyName)) {
      boolean hasDeprecated = properties.containsKey("fsa.dict.uses-suffixes") ||
                              properties.containsKey("fsa.dict.uses-infixes") ||
                              properties.containsKey("fsa.dict.uses-prefixes");

      boolean usesSuffixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-suffixes", "true"));
      boolean usesPrefixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-prefixes", "false"));
      boolean usesInfixes  = Boolean.valueOf(properties.getProperty("fsa.dict.uses-infixes",  "false"));

      final EncoderType encoder;
      if (usesInfixes) {
        encoder = EncoderType.INFIX;
      } else if (usesPrefixes) {
        encoder = EncoderType.PREFIX;
      } else if (usesSuffixes) {
        encoder = EncoderType.SUFFIX;
      } else {
        encoder = EncoderType.NONE;
      }

      if (!hasDeprecated) {
        throw new IOException("Use an explicit " +
            DictionaryAttribute.ENCODER.propertyName + "=" + encoder.name() +
            " metadata key: ");
      }

      throw new IOException("Deprecated encoder keys in metadata. Use " +
          DictionaryAttribute.ENCODER.propertyName + "=" + encoder.name());
    }

    for (Enumeration e = properties.propertyNames(); e.hasMoreElements();) {
      String key = (String) e.nextElement();
      map.put(DictionaryAttribute.fromPropertyName(key), properties.getProperty(key));
    }

    return new DictionaryMetadata(map);
  }

  /**
   * Write dictionary attributes (metadata).
   * 
   * @param writer The writer to write to.
   * @throws IOException Thrown when an I/O error occurs.
   */
  public void write(Writer writer) throws IOException {
    final Properties properties = new Properties();

    for (Map.Entry e : getAttributes().entrySet()) {
      properties.setProperty(e.getKey().propertyName, e.getValue());
    }

    properties.store(writer, "# " + getClass().getName());
  }
}