opennlp.tools.dictionary.Dictionary Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.dictionary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.util.AbstractSet;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import opennlp.tools.dictionary.serializer.Attributes;
import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor;
import opennlp.tools.dictionary.serializer.Entry;
import opennlp.tools.util.StringList;
import opennlp.tools.util.model.DictionarySerializer;
import opennlp.tools.util.model.SerializableArtifact;

/**
 * An iterable and serializable dictionary implementation.
 * <p>
 * Entries are {@link StringList token lists} kept in a hash-based {@link Set}.
 * Every entry and every query is normalised via
 * {@link StringList#toCaseSensitive()} or {@link StringList#toCaseInsensitive()},
 * depending on the case sensitivity chosen for the instance.
 *
 * @see SerializableArtifact
 * @see Iterable
 */
public class Dictionary implements Iterable<StringList>, SerializableArtifact {

  /** Value reported by {@link #getMinTokenCount()} as long as no entry was ever added. */
  private static final int INITIAL_MIN_TOKEN_COUNT = 99999;

  private final Set<StringList> entrySet = new HashSet<>();
  private final boolean isCaseSensitive;
  private int minTokenCount = INITIAL_MIN_TOKEN_COUNT;
  private int maxTokenCount = 0;

  /**
   * Initializes an empty {@link Dictionary}.
   * By default, the resulting instance will not be case-sensitive.
   */
  public Dictionary() {
    this(false);
  }

  /**
   * Initializes an empty {@link Dictionary}.
   *
   * @param caseSensitive Whether the new instance will operate case-sensitive, or not.
   */
  public Dictionary(boolean caseSensitive) {
    isCaseSensitive = caseSensitive;
  }

  /**
   * Initializes the {@link Dictionary} from an existing dictionary resource.
   * The case sensitivity of the new instance is the value reported by
   * {@link DictionaryEntryPersistor} for that resource.
   *
   * @param in The {@link InputStream} that references the dictionary content.
   *
   * @throws IOException Thrown if IO errors occurred.
   */
  public Dictionary(InputStream in) throws IOException {
    // The case-sensitivity flag is only known once the persistor returns. Entries are
    // therefore buffered first: calling put(..) from inside the callback would read the
    // not-yet-assigned final field (always 'false') and normalise every entry as
    // case-insensitive, even for a case-sensitive resource.
    final List<StringList> loaded = new ArrayList<>();
    isCaseSensitive = DictionaryEntryPersistor.create(in, entry -> loaded.add(entry.tokens()));
    for (StringList tokens : loaded) {
      put(tokens);
    }
  }

  /**
   * Adds the tokens to the dictionary as one new entry and widens the
   * tracked minimum / maximum token counts accordingly.
   *
   * @param tokens the new entry
   */
  public void put(StringList tokens) {
    entrySet.add(applyCaseSensitivity(tokens));
    minTokenCount = StrictMath.min(minTokenCount, tokens.size());
    maxTokenCount = StrictMath.max(maxTokenCount, tokens.size());
  }

  /**
   * @return The smallest token count of any entry passed to {@link #put(StringList)},
   *         or {@code 99999} if nothing was ever added. The value is not adjusted
   *         when entries are removed.
   */
  public int getMinTokenCount() {
    return minTokenCount;
  }

  /**
   * @return The largest token count of any entry passed to {@link #put(StringList)},
   *         or {@code 0} if nothing was ever added. The value is not adjusted
   *         when entries are removed.
   */
  public int getMaxTokenCount() {
    return maxTokenCount;
  }

  /**
   * Checks if this dictionary has the given entry.
   *
   * @param tokens The query of tokens to be checked for.
   * @return {@code true} if it contains the entry, {@code false} otherwise.
   */
  public boolean contains(StringList tokens) {
    return entrySet.contains(applyCaseSensitivity(tokens));
  }

  /**
   * Removes the given tokens from the current instance.
   * The tracked minimum / maximum token counts are left untouched.
   *
   * @param tokens The tokens to be filtered out (= removed).
   */
  public void remove(StringList tokens) {
    entrySet.remove(applyCaseSensitivity(tokens));
  }

  /**
   * @return Retrieves an {@link Iterator} over all entries. {@link Iterator#remove()}
   *         is supported and removes the entry from this dictionary.
   */
  @Override
  public Iterator<StringList> iterator() {
    final Iterator<StringList> entries = entrySet.iterator();

    return new Iterator<>() {

      @Override
      public boolean hasNext() {
        return entries.hasNext();
      }

      @Override
      public StringList next() {
        return entries.next();
      }

      @Override
      public void remove() {
        entries.remove();
      }
    };
  }

  /**
   * @return Retrieves the number of entries in the current instance.
   */
  public int size() {
    return entrySet.size();
  }

  /**
   * Writes the current instance to the given {@link OutputStream}.
   * Each entry is written with an empty set of {@link Attributes}.
   *
   * @param out A valid {@link OutputStream}, ready for serialization.
   * @throws IOException Thrown if IO errors occurred.
   */
  public void serialize(OutputStream out) throws IOException {

    // Adapts the StringList iterator to the Entry iterator expected by the persistor.
    Iterator<Entry> entryIterator = new Iterator<>() {
      private final Iterator<StringList> dictionaryIterator = Dictionary.this.iterator();

      @Override
      public boolean hasNext() {
        return dictionaryIterator.hasNext();
      }

      @Override
      public Entry next() {
        return new Entry(dictionaryIterator.next(), new Attributes());
      }

      @Override
      public void remove() {
        // Serialization must never modify the dictionary.
        throw new UnsupportedOperationException();
      }
    };

    DictionaryEntryPersistor.serialize(out, entryIterator, isCaseSensitive);
  }

  /**
   * Two dictionaries are equal if their entry sets are equal.
   */
  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    return obj instanceof Dictionary dictionary && entrySet.equals(dictionary.entrySet);
  }

  @Override
  public int hashCode() {
    return entrySet.hashCode();
  }

  @Override
  public String toString() {
    return entrySet.toString();
  }

  /**
   * Reads a {@link Dictionary} which has one entry per line.
   * The tokens inside an entry are delimited by space characters ({@code ' '});
   * lines that yield no token are skipped. The resulting dictionary is not
   * case-sensitive. The given {@link Reader} is not closed by this method.
   *
   * @param in A {@link Reader} instance used to parse the dictionary from.
   * @return The parsed {@link Dictionary} instance; guaranteed to be non-{@code null}.
   * @throws IOException Thrown if IO errors occurred during read and parse operations.
   */
  public static Dictionary parseOneEntryPerLine(Reader in) throws IOException {
    BufferedReader lineReader = new BufferedReader(in);

    final Dictionary dictionary = new Dictionary();

    String line;

    while ((line = lineReader.readLine()) != null) {
      StringTokenizer whiteSpaceTokenizer = new StringTokenizer(line, " ");

      String[] tokens = new String[whiteSpaceTokenizer.countTokens()];

      if (tokens.length > 0) {
        int tokenIndex = 0;
        while (whiteSpaceTokenizer.hasMoreTokens()) {
          tokens[tokenIndex++] = whiteSpaceTokenizer.nextToken();
        }

        dictionary.put(new StringList(tokens));
      }
    }

    return dictionary;
  }

  /**
   * Converts this {@link Dictionary} to a {@link Set} view backed by this instance.
   * <p>
   * Note: Only {@link AbstractSet#iterator()}, {@link AbstractSet#size()} and
   * {@link AbstractSet#contains(Object)} methods are implemented.
   * <p>
   * If this dictionary entries are multi tokens only the first token of the
   * entry will be part of the {@link Set}. {@code contains(..)} only matches
   * entries that consist of exactly the queried single token.
   *
   * @return A {@link Set} containing all entries of this {@link Dictionary}.
   */
  public Set<String> asStringSet() {
    return new AbstractSet<>() {

      @Override
      public Iterator<String> iterator() {
        final Iterator<StringList> entries = entrySet.iterator();

        return new Iterator<>() {
          @Override
          public boolean hasNext() {
            return entries.hasNext();
          }
          @Override
          public String next() {
            return entries.next().getToken(0);
          }
          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };
      }

      @Override
      public int size() {
        return entrySet.size();
      }

      @Override
      public boolean contains(Object obj) {
        boolean result = false;

        if (obj instanceof String str) {

          result = entrySet.contains(new StringList(isCaseSensitive, str));

        }
        return result;
      }

      /**
       * Compares by membership rather than by iteration order: the other set is equal
       * if it has the same size and its elements, wrapped as single-token lists with
       * this dictionary's case sensitivity, are exactly the entries of this dictionary.
       */
      @Override
      public boolean equals(Object o) {
        if (o == this) {
          return true;
        }
        if (!(o instanceof Set<?> toCheck)) {
          return false;
        }
        if (entrySet.size() != toCheck.size()) {
          return false;
        }
        // Normalise first, so that elements which collapse into one entry
        // (e.g. "A" and "a" when case-insensitive) cannot fake a full match.
        final Set<StringList> normalized = new HashSet<>();
        for (Object element : toCheck) {
          if (!(element instanceof String str)) {
            return false;
          }
          normalized.add(new StringList(isCaseSensitive, str));
        }
        return normalized.size() == entrySet.size() && entrySet.containsAll(normalized);
      }

      @Override
      public int hashCode() {
        return entrySet.hashCode();
      }
    };
  }

  /**
   * @return Retrieves the serializer class for {@link Dictionary}
   *
   * @see DictionarySerializer
   */
  @Override
  public Class<?> getArtifactSerializerClass() {
    return DictionarySerializer.class;
  }

  /**
   * @return {@code true}, if this {@link Dictionary} is case-sensitive.
   */
  public boolean isCaseSensitive() {
    return isCaseSensitive;
  }

  /**
   * Normalises a token list to the case sensitivity of this dictionary, so
   * that stored entries and lookup queries are compared consistently.
   */
  private StringList applyCaseSensitivity(StringList list) {
    if (isCaseSensitive) {
      return list.toCaseSensitive();
    } else {
      return list.toCaseInsensitive();
    }
  }
}