All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.gwt.dev.util.editdistance.CharIndex Maven / Gradle / Ivy

There is a newer version: 2.10.0
Show newest version
/*
 * Copyright 2010 Google Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.gwt.dev.util.editdistance;

/**
 * A performance-oriented character-indexed map.
 * Provides a mapping from characters in a limited alphabet
 * (defined by a CharSequence) to integer values (for array indexing).
 * Emphasis is on performance (e.g., compared to using a generic HashMap),
 * in terms of speed of (post-construction) lookup and space required.
 *
 * Maps are constructed using a static getInstance method that
 * chooses an efficient implementation for a given string.
 * The map interface consists of:
 *    a size method for determining the range of indices used;
 *    a lookup method for translating a character to its index;
 *    a map method for translating a whole character sequence; and,
 *    a nullElement method that returns the index that is not
 *     returned by lookup for any character actually in the map.
 *
 * The target application is the mapping of natural-language strings.
 * Most strings include characters from a single language, and thus
 * fall into a small slice of the Unicode space.  In particular,
 * many strings fall into just the Latin series.
 */
public abstract class CharIndex {
  /**
   * An index based on a garden-variety hash map.
   * This is the implementation of last recourse (from a
   * performance perspective).
   *
   * Index 0 is reserved as the null element; the distinct characters
   * of the pattern are numbered 1, 2, 3, ... in order of first appearance.
   *
   * TODO(mwyoung): there may be value in an implementation
   * that falls in between the masked and fullhash variants;
   * for example, one using a closed hash table with rehashing.
   */
  public static class FullHash extends CharIndex {
    /**
     * Mutable holder for a character, used as the hash key.
     * A holder must not be modified once it has been stored in the map.
     */
    static class Char {
      char c;

      @Override
      public boolean equals(Object x) {
        /* instanceof rejects null and foreign types without throwing */
        return (x instanceof Char) && (((Char) x).c == this.c);
      }

      @Override
      public int hashCode() {
        return c;
      }

      @Override
      public String toString() {
        return "'" + c + "'";
      }
    }

    /**
     * Index returned for characters that are not in the pattern.
     */
    static final int NULL_ELEMENT = 0;

    /**
     * Number of distinct values a char can take; an upper bound on
     * the number of entries the map can ever hold.
     */
    private static final int MAX_DISTINCT_CHARS = Character.MAX_VALUE + 1;

    /**
     * Highest index handed out so far (NULL_ELEMENT when the map is empty).
     */
    protected int lastUsed = NULL_ELEMENT;

    /**
     * Mapping from pattern characters to their integer index values.
     */
    final java.util.HashMap<Char, Integer> map;

    /**
     * Constructs a full hash-based mapping.
     *
     * @param s pattern string; repeated characters share a single index
     */
    FullHash(CharSequence s) {
      int len = s.length();

      /*
       * Choose a hash size that is a power of two and at least twice
       * the string length.  The length is clamped to the number of
       * distinct chars: the map can never hold more entries than that,
       * and clamping keeps the shift below from overflowing into a
       * negative capacity for very long inputs.
       */
      int bounded = Math.min(len, MAX_DISTINCT_CHARS);
      int power = Integer.highestOneBit(bounded);

      /* Create the map */
      map = new java.util.HashMap<Char, Integer>(
          power << ((power == bounded) ? 1 : 2));

      /*
       * Add characters one at a time to the map
       */

      Char test = new Char();                   /* (re)used for lookup */
      for (int i = 0; i < len; i++) {
        test.c = s.charAt(i);
        if (map.get(test) == null) {
          /* Not there... add it */
          map.put(test, Integer.valueOf(++lastUsed));

          /* The holder is now owned by the map; grow a new one */
          test = new Char();
        }
      }
    }

    @Override
    public int lookup(char c) {
      final Char lookupTest = new Char();
      lookupTest.c = c;
      Integer result = map.get(lookupTest);
      return (result != null) ? result.intValue() : NULL_ELEMENT;
    }

    @Override
    public int[] map(CharSequence s, int[] mapped) {
      /* Create one mutable Char, and reuse it for all lookups */
      final Char lookupTest = new Char();

      int len = s.length();
      mapped = ensureCapacity(mapped, len);

      for (int i = 0; i < len; i++) {
        lookupTest.c = s.charAt(i);
        Integer result = map.get(lookupTest);
        mapped[i] = (result != null) ? result.intValue() : NULL_ELEMENT;
      }
      return mapped;
    }

    @Override
    public int nullElement() {
      return NULL_ELEMENT;
    }

    @Override
    public int size() {
      return lastUsed + 1;
    }
  }

  /**
   * An index based on a simple mask: index(c) = (c & MASK).
   * This allows for a very fast index function for many
   * languages outside of the ASCII spectrum.  Most languages
   * fit in a single 8-bit Unicode slice.  Even languages
   * that cross slices for special characters (such as extended
   * Latin) often have no collisions under simple masking.
   */
  public static class Masked extends CharIndex {
    /**
     * Hash table size.
     */
    static final int SIZE = 0x100;

    /**
     * Mask used for hashing.
     */
    static final int MASK = (SIZE - 1);

    /**
     * Where we map invalid characters: beyond the hash table.
     */
    static final int NULL_ELEMENT = SIZE;

    /**
     * Generates an instance of this implementation if possible.
     * @param s pattern string
     * @return an instance of this CharIndex if the pattern satisfies
     *         the constraints for this implementation; otherwise, null
     */
    static Masked generate(CharSequence s) {
      /*
       * This implementation requires that there be no hash collisions
       * among the pattern characters, using a simple mask-based hash.
       */
      char [] contains = new char[SIZE];

      /*
       * Ensure that for all x, hash(contains[x]) != x initially.
       * Every cell starts as 0, which hashes to 0, so only cell 0
       * needs a different filler value.
       */
      contains[0] = (char) 1;

      /* Hash characters, recording values seen, rejecting collisions */
      for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        int index = c & MASK;
        if (contains[index] != c) {
          /* A cell whose content hashes to its own index is occupied */
          if ((contains[index] & MASK) == index) {
            return null;
          }
          contains[index] = c;
        }
      }
      return new Masked(contains);
    }

    /**
     * Closed hash table: characters actually indexed.
     * Unused cells contain values inappropriate to their index.
     */
    final char[] contains;

    /**
     * Constructor based on hash table built by generate().
     */
    private Masked(char [] contains) {
      this.contains = contains;
    }

    @Override
    public int lookup(char c) {
      int index = c & MASK;
      return (c == contains[index]) ? index : NULL_ELEMENT;
    }

    @Override
    public int[] map(CharSequence s, int[] mapped) {
      int len = s.length();
      mapped = ensureCapacity(mapped, len);
      for (int i = 0; i < len; i++) {
        char c = s.charAt(i);
        int index = c & MASK;
        mapped[i] = (c == contains[index]) ? index : NULL_ELEMENT;
      }
      return mapped;
    }

    @Override
    public int nullElement() {
      return NULL_ELEMENT;
    }

    @Override
    public int size() {
      return NULL_ELEMENT + 1;
    }
  }

  /**
   * An index based on the identity mapping: index(c) = c.
   * This requires minimal computation and no additional space
   * for the basic ASCII character set.
   */
  public static class Straight extends CharIndex {
    /**
     * One past the largest character we will map (directly).
     */
    static final int MAX = 0x80;

    /**
     * A mask used to find characters that fall outside.
     */
    static final int MASK = ~(MAX - 1);

    /**
     * A map result we never generate for valid characters.
     */
    static final int NULL_ELEMENT = MAX;

    /**
     * Generates an instance of this implementation if possible.
     * @param s pattern string
     * @return an instance of this CharIndex if the pattern satisfies
     *         the constraints for this implementation; otherwise, null
     */
    static Straight generate(CharSequence s) {
      /* This implementation requires that all characters fall below MAX */
      for (int i = 0; i < s.length(); i++) {
        if ((s.charAt(i) & MASK) != 0) {
          return null;
        }
      }

      return new Straight();
    }

    /**
     * Simple private constructor, no state required.
     */
    private Straight() { }

    @Override
    public int lookup(char c) {
      return ((c & MASK) == 0) ? c : NULL_ELEMENT;
    }

    @Override
    public int[] map(CharSequence s, int[] mapped) {
      int len = s.length();
      mapped = ensureCapacity(mapped, len);
      for (int i = 0; i < len; i++) {
        char c = s.charAt(i);
        mapped[i] = ((c & MASK) == 0) ? c : NULL_ELEMENT;
      }
      return mapped;
    }

    @Override
    public int nullElement() {
      return NULL_ELEMENT;
    }

    @Override
    public int size() {
      return NULL_ELEMENT + 1;
    }
  }

  /**
   * Generates an index for a given alphabet, defined by a character sequence.
   * An appropriate implementation is chosen for the alphabet.  The
   * character sequence is allowed to contain multiple instances of
   * the same character.
   *
   * @param s the alphabet to be indexed
   * @return an index appropriate to this alphabet
   */
  public static CharIndex getInstance(CharSequence s) {
    CharIndex result;

    /* Try fastest mappings first, then more general ones */

    if ((result = Straight.generate(s)) != null) {
      return result;
    }

    if ((result = Masked.generate(s)) != null) {
      return result;
    }

    /* Full hash always works */
    return new FullHash(s);
  }

  /**
   * Returns a result buffer able to hold len entries, shared by
   * the map implementations.
   *
   * @param mapped caller-supplied buffer, possibly null
   * @param len number of entries required
   * @return mapped itself if it is non-null and long enough;
   *  otherwise a newly-allocated array of exactly len entries
   */
  private static int[] ensureCapacity(int[] mapped, int len) {
    return (mapped == null || mapped.length < len) ? new int[len] : mapped;
  }

  /**
   * Returns the index (mapping result) for a given character.
   * If the character is present in the alphabet, then a unique index
   * is returned; other characters may be mapped to unique
   * values, *or* may be mapped to a shared nullElement value.
   * @param c character for which an index is sought
   * @return an integer index for the character
   */
  public abstract int lookup(char c);

  /**
   * Performs lookups on an entire sequence of characters.
   * This allows an entire string to be mapped with fewer
   * method invocations, and possibly with some added internal
   * efficiencies.
   *
   * Only the first s.length() entries of the returned array are
   * written; any further entries of a larger supplied array are
   * left as they were.
   *
   * @param s character sequence to be indexed
   * @param mapped array for storing results; may be null, in which
   *  case a new array is allocated
   * @return an array of indices for the character sequence,
   *  either the mapped parameter or a newly-allocated
   *  array if the parameter was null or too short
   */
  public abstract int[] map(CharSequence s, int[] mapped);

  /**
   * Returns an index that is not used for any character in
   * the alphabet.  This index *may* be returned for characters
   * not in the alphabet.
   * @return an index not used for a valid character
   */
  public abstract int nullElement();

  /**
   * Returns the number of index values in use, that is, one more
   * than the largest result that can be returned by the lookup
   * function (including the nullElement result).  No negative
   * values are ever returned from lookup, so every lookup result
   * is a valid index into an array of this length.
   * @return one more than the maximum index returned from any lookup
   */
  public abstract int size();
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy