opennlp.tools.dictionary.Dictionary Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opennlp-tools Show documentation
There is a newer version: 2.5.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreemnets.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.dictionary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.StringTokenizer;

import opennlp.tools.dictionary.serializer.Attributes;
import opennlp.tools.dictionary.serializer.DictionarySerializer;
import opennlp.tools.dictionary.serializer.Entry;
import opennlp.tools.dictionary.serializer.EntryInserter;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.StringList;

/**
 * This class is a dictionary.
 */
public class Dictionary implements Iterable {

  private static class StringListWrapper {

    private final StringList stringList;
    private final boolean isCaseSensitive;

    private StringListWrapper(StringList stringList, boolean isCaseSensitive) {
      this.stringList = stringList;
      this.isCaseSensitive = isCaseSensitive;
    }

    private StringList getStringList() {
      return stringList;
    }

    public boolean equals(Object obj) {

      boolean result;

      if (obj == this) {
        result = true;
      }
      else if (obj instanceof StringListWrapper) {
        StringListWrapper other = (StringListWrapper) obj;

        if (isCaseSensitive) {
          result = this.stringList.equals(other.getStringList());
        }
        else {
          result = this.stringList.compareToIgnoreCase(other.getStringList());
        }
       }
      else {
        result = false;
      }

      return result;
    }

    public int hashCode() {
      // if lookup is too slow optimize this
      return this.stringList.toString().toLowerCase().hashCode();
    }

    public String toString() {
      return this.stringList.toString();
    }
  }

  private Set entrySet = new HashSet();
  private boolean caseSensitive;

  /**
   * Initializes an empty {@link Dictionary}.
   */
  public Dictionary() {
    this(false);
  }

  public Dictionary(boolean caseSensitive) {
    this.caseSensitive = caseSensitive;
  }

  /**
   * Initializes the {@link Dictionary} from an existing dictionary resource.
   *
   * @param in
   * @throws IOException
   * @throws InvalidFormatException
   */
  public Dictionary(InputStream in) throws IOException, InvalidFormatException {
    this(in,false);
  }

  public Dictionary(InputStream in, boolean caseSensitive) throws IOException, InvalidFormatException {
    this.caseSensitive = caseSensitive;
    DictionarySerializer.create(in, new EntryInserter()
    {
      public void insert(Entry entry) {
        put(entry.getTokens());
      }
    });
  }

  /**
   * Adds the tokens to the dictionary as one new entry.
   *
   * @param tokens the new entry
   */
  public void put(StringList tokens) {
      entrySet.add(new StringListWrapper(tokens, caseSensitive));
  }

  /**
   * Checks if this dictionary has the given entry.
   *
   * @param tokens
   *
   * @return true if it contains the entry otherwise false
   */
  public boolean contains(StringList tokens) {
      return entrySet.contains(new StringListWrapper(tokens, caseSensitive));
  }

  /**
   * Removes the given tokens form the current instance.
   *
   * @param tokens
   */
  public void remove(StringList tokens) {
      entrySet.remove(new StringListWrapper(tokens, caseSensitive));
  }

  /**
   * Retrieves an Iterator over all tokens.
   *
   * @return token-{@link Iterator}
   */
  public Iterator iterator() {
    final Iterator entries = entrySet.iterator();

    return new Iterator() {

      public boolean hasNext() {
        return entries.hasNext();
      }

      public StringList next() {
        return entries.next().getStringList();
      }

      public void remove() {
        entries.remove();
      }};
  }

  /**
   * Retrieves the number of tokens in the current instance.
   *
   * @return number of tokens
   */
  public int size() {
    return entrySet.size();
  }

  /**
   * Writes the current instance to the given {@link OutputStream}.
   *
   * @param out
   * @throws IOException
   */
  public void serialize(OutputStream out) throws IOException {

    Iterator entryIterator = new Iterator()
      {
        private Iterator dictionaryIterator = Dictionary.this.iterator();

        public boolean hasNext() {
          return dictionaryIterator.hasNext();
        }

        public Entry next() {

          StringList tokens = (StringList)
              dictionaryIterator.next();

          return new Entry(tokens, new Attributes());
        }

        public void remove() {
          throw new UnsupportedOperationException();
        }

      };

    DictionarySerializer.serialize(out, entryIterator);
  }

  public boolean equals(Object obj) {

    boolean result;

    if (obj == this) {
      result = true;
    }
    else if (obj != null && obj instanceof Dictionary) {
      Dictionary dictionary  = (Dictionary) obj;

      result = entrySet.equals(dictionary.entrySet);
    }
    else {
      result = false;
    }

    return result;
  }

  public int hashCode() {
    return entrySet.hashCode();
  }

  public String toString() {
    return entrySet.toString();
  }

  /**
   * Reads a dictionary which has one entry per line. The tokens inside an
   * entry are whitespace delimited.
   *
   * @param in
   *
   * @return the parsed dictionary
   *
   * @throws IOException
   */
  public static Dictionary parseOneEntryPerLine(Reader in) throws IOException {
    BufferedReader lineReader = new BufferedReader(in);

    Dictionary dictionary = new Dictionary();

    String line;

    while ((line = lineReader.readLine()) != null) {
      StringTokenizer whiteSpaceTokenizer = new StringTokenizer(line, " ");

      String tokens[] = new String[whiteSpaceTokenizer.countTokens()];

      if (tokens.length > 0) {
        int tokenIndex = 0;
        while (whiteSpaceTokenizer.hasMoreTokens()) {
          tokens[tokenIndex++] = whiteSpaceTokenizer.nextToken();
        }

        dictionary.put(new StringList(tokens));
      }
    }

    return dictionary;
  }
}