All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.gwt.dev.util.editdistance.MyersBitParallelEditDistance Maven / Gradle / Ivy

There is a newer version: 2.11.0
Show newest version
/*
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.gwt.dev.util.editdistance;

/**
 * Computes Levenshtein string-edit distance using the
 * algorithm of Eugene Myers (see "A fast bit-vector algorithm for
 * approximate string matching based on dynamic progamming",
 * Journal of the ACM, 46(3): 395-415, 1999), using the reformulations
 * of Heikki Hyyro (see "Explaining and Extending the Bit-parallel
 * Approximate String Matching Algorithm of Myers") along with
 * Gonzalo Navarro ("Faster Bit-Parallel Approximate String Matching",
 * Proc. 13th Combinatorial Pattern Matching (CPM'2002)).
 */
public abstract class MyersBitParallelEditDistance
    implements GeneralEditDistance, Cloneable {
  /*
   * The Myers algorithm is based on an adaptation of the traditional
   * dynamic programming algorithm, in which a matrix of distances
   * is computed column-wise (or row-wise).  For strings of length
   * M and N, an MxN matrix (D) is filled with distances for substrings:
   * D[i,j] holds the distance for string1[0..i]=>string2[0..j].
   * The matrix is initially populated with the trivial values
   * D[0,j]=j and D[i,0]=i; and then expanded with the rule:
   *    D[i,j] = min( D[i-1,j]+1,       // insertion
   *                  D[i,j-1]+1,       // deletion
   *                  (D[i-1,j-1]
   *                   + (string1[i]==string2[j])
   *                      ? 0           // match
   *                      : 1           // substitution ) )
   *
   * The Myers algorithm takes advantage of the observation that
   * the difference from one matrix position to its higher diagonal
   * is either 0 or 1, and consequently row-to-row and column-to-column
   * differences are 1, 0, or -1.  It organizes whole columns of
   * differences into bit arrays, computing new columns with
   * arithmetic on whole words.  The edit distance is computed
   * by tracking the column changes in the last row.
   *
   * The description below follows the naming from the Hyrro papers above.
   * The pattern string (length M) is arranged vertically; the
   * string to be matched (length N) is horizontal.  Columns are
   * represented by bit-arrays with (1< D[i-1,j]
   *                    |     \         |
   *                    ?        0      +1
   *                    |            \  |
   *                D[i,j-1]          D[i,j]
   *
   *        That is DZ[i,j] = DZ[i-1,j] & VP[i-1,j-1].
   *        Note that this is recursive in the DZ[*,j] bit array, but we
   *        want to compute the whole array at once.  We need an expression
   *        that will propagate a 1 bit down the DZ column when the associated
   *        VP bit is set.  Noting that the DZ[*,0] is the least significant
   *        bit, this means propagating to more significant bits.  The carry bit
   *        of an addition has this behavior.  If we set
   *            DZ = (PM | VN)          // first two cases
   *        then we can propagate bits up iff the corresponding VP bit
   *        is set with:
   *            DZ |= (((DZ & VP) + VP) ^ VP);
   *        The (DZ&VP) piece covers our required condition on VP.  Adding
   *        the VP bits causes any 1 bits from (DZ&VP) to result in a carry,
   *        but also introduces a 1 bit where VP=1 but DZ=0, which are then
   *        cleared with the ^VP piece.  The propagated bits are |ed into the
   *        base case to get a final result.
   *
   *    Once we have DZ, we can compute HP and HN.  Note that VP and VN have
   *    not yet been updated, and so refer to the preceding column; DZ refers
   *    to the current column.
   *      HN is true when DZ is true in this column but VP was true
   *        in the prior column (HN = DZ & VP)
   *      HP is true when: the prior VN is true (otherwise, the diagonal would
   *        be negative, and that cannot happen); or, DZ is false (meaning
   *        the diagonal increased by 1) and the prior-column VP is also false
   *        (HP = VN | ~(DZ|VP))
   *
   *    Once we have HP and HN, we can tally the effect on the last row (adding
   *    or subtracting one, respectively).
   *
   *    Finally, we can compute VP and VN for this column, in preparation
   *    for the next round.  These computations are analagous to the ones for
   *    HN and HP above, except that there, the incoming V values already
   *    reflected the prior column and DZ this one, but now the H and DZ values
   *    reflect the same column.  So, we first shift the H values to associate
   *    with the proper bits from DZ.
   */

  /**
   * A trivial implementation for the zero-length string.
   */
  static class Empty extends MyersBitParallelEditDistance {
    Empty(CharSequence s) {
      super(s);
    }

    @Override
    public GeneralEditDistance duplicate() {
      return this;      /* thread-safe */
    }

    @Override
    public int getDistance(CharSequence s, int k) {
      return s.length();
    }
  }

  /**
   * An implementation using multiple integers for each column.
   * This follows the pattern for the single-word implementation, with
   * these changes:
   *    sums/shifts must propagate from one word to the next;
   *    to make that easier (and cheaper), we use only (SIZE-1) bits per word;
   */
  static class Multi extends MyersBitParallelEditDistance {
    /* How many integers we use per column */
    int count;

    /**
     * Where the last-row bit lives -- only in the last array slot, though.
     */
    final int lastBitPosition;

    /**
     * Bitmaps for pattern string: [pattern_index][column_index].
     */
    final int[][] positions;

    int[] verticalNegativesReusable;
    /* Reusable arrays for vertical changes */
    int[] verticalPositivesReusable;

    /**
     * A mask with those bits set.
     */
    final int wordMask = (-1 >>> 1);

    /**
     * How many bits we use per word.
     */
    final int wordSize = Integer.SIZE - 1;

    /**
     * Constructs a multi-word engine.
     */
    Multi(CharSequence s) {
      super(s);

      /* Compute number of words to use (rounding up) */
      count = (m + wordSize - 1) / wordSize;

      /* Precompute bitmaps for pattern string */
      positions = PatternBitmap.map(s, idx, new int [idx.size()][], wordSize);

      /* Where in last word does last row appear */
      lastBitPosition = (1 << ((m - 1) % wordSize));

      /* Initialize scratchpad items */
      perThreadInit();
    }

    @Override
    public int getDistance(CharSequence s, int k) {
      indices = idx.map(s, indices);

      /* Initialize verticalPositive to all bits on, verticalNegative all off */
      int[] verticalPositives = verticalPositivesReusable;
      java.util.Arrays.fill(verticalPositives, wordMask);
      int[] verticalNegatives = verticalNegativesReusable;
      java.util.Arrays.fill(verticalNegatives, 0);

      int distance = m;
      int len = s.length();

      /* We can only miss the distance-- below this many times: */
      int maxMisses = k + len - m;
      if (maxMisses < 0) {
        maxMisses = Integer.MAX_VALUE;
      }

      outer:
      for (int j = 0; j < len; j++) {
        int[] position = positions[indices[j]];

        /* Carry bits from one word to the next */
        int sum = 0;
        int horizontalPositiveShift = 1;
        int horizontalNegativeShift = 0;

        /* Iterate through words for this column */
        for (int i = 0; i < count; i++) {
          int verticalNegative = verticalNegatives[i];
          int patternMatch = (position[i] | verticalNegative);
          int verticalPositive = verticalPositives[i];
          sum = (verticalPositive & patternMatch)
                + (verticalPositive) + (sum >>> wordSize);
          int diagonalZero = ((sum & wordMask) ^ verticalPositive)
                             | patternMatch;
          int horizontalPositive = (verticalNegative
                                   | ~(diagonalZero | verticalPositive));
          int horizontalNegative = diagonalZero & verticalPositive;

          if (i == (count - 1)) {            /* only last bit in last word */
            if ((horizontalNegative & lastBitPosition) != 0) {
              distance--;
            } else if ((horizontalPositive & lastBitPosition) != 0) {
              distance++;
              if ((maxMisses -= 2) < 0) {
                break outer;
              }
            } else if (--maxMisses < 0) {
              break outer;
            }
          }

          horizontalPositive = ((horizontalPositive << 1)
                               | horizontalPositiveShift);
          horizontalPositiveShift = (horizontalPositive >>> wordSize);

          horizontalNegative = ((horizontalNegative << 1)
                               | horizontalNegativeShift);
          horizontalNegativeShift = (horizontalNegative >>> wordSize);

          verticalPositives[i] = (horizontalNegative
                                  | ~(diagonalZero | horizontalPositive))
                                 & wordMask;
          verticalNegatives[i] = (diagonalZero & horizontalPositive) & wordMask;
        }
      }
      return distance;
    }

    @Override
    protected void perThreadInit() {
      super.perThreadInit();

      /* Allocate verticalPositive/verticalNegative arrays */
      verticalPositivesReusable = new int[count];
      verticalNegativesReusable = new int[count];
    }
  }

  /*
   * The following code is duplicated with both "int" and "long"
   * as the word primitive type.  To improve maintainability, all instances
   * of the word type are marked with "WORD" in comments.  This
   * allows the "long" version to be regenerated from the "int" version
   * through a mechanical replacement.  To make this work, the
   * class name also needs to include the primitive type (without
   * altered capitalization) and the WORD marker -- this is a quite
   * intentional violation of the usual class naming rules.
   */
  /**
   * An implementation using "int" as the word size.
   */
  // Replace "int/*WORD*/" with "long/*WORD*/" to generate the long version.
  static class TYPEint/*WORD*/ extends MyersBitParallelEditDistance {
    final int/*WORD*/ lastBitPosition;
    final int/*WORD*/[] map;

    @SuppressWarnings("cast")
    TYPEint/*WORD*/(CharSequence s) {
      super(s);
      /* Precompute bitmaps for this pattern */
      map = PatternBitmap.map(s, idx, new int/*WORD*/[idx.size()]);
      /* Compute the bit that represents a change in the last row */
      lastBitPosition = (((int/*WORD*/) 1) << (m - 1));
    }

    @Override
    public int getDistance(CharSequence s, int k) {
      int len = s.length();

      /* Quick check based on length */
      if (((len - m) > k) || ((m - len) > k)) {
        return k + 1;
      }

      /* Map characters to their integer positions in the bitmap array */
      indices = idx.map(s, indices);

      /* Initially, vertical change is all positive (none negative) */
      int/*WORD*/ verticalPositive = -1;
      int/*WORD*/ verticalNegative = 0;
      int distance = m;

      /* We can only miss the "distance--" below this many times: */
      int maxMisses = k + len - m;
      if (maxMisses < 0) {
        maxMisses = Integer.MAX_VALUE;
      }

      for (int j = 0; j < len; j++) {
        /* Where is diagonal zero: matches, or prior VN; plus recursion */
        int/*WORD*/ diagonalZero = map[indices[j]] | verticalNegative;
        diagonalZero |= (((diagonalZero & verticalPositive) + verticalPositive)
                        ^ verticalPositive);

        /* Compute horizontal changes */
        int/*WORD*/ horizontalPositive = verticalNegative
                                         | ~(diagonalZero | verticalPositive);
        int/*WORD*/ horizontalNegative = diagonalZero & verticalPositive;

        /* Update final distance based on horizontal changes */
        if ((horizontalNegative & lastBitPosition) != 0) {
          distance--;
        } else if ((horizontalPositive & lastBitPosition) != 0) {
          distance++;
          if ((maxMisses -= 2) < 0) {
            break;
          }
        } else if (--maxMisses < 0) {
          break;
        }

        /* Shift Hs to next row, compute new Vs analagously to Hs above */
        horizontalPositive = (horizontalPositive << 1) | 1;
        verticalPositive = (horizontalNegative << 1)
                           | ~(diagonalZero | horizontalPositive);
        verticalNegative = diagonalZero & horizontalPositive;
      }
      return distance;
    }
  }

  /**
   * An implementation using "long" as the word size.
   * GENERATED MECHANICALLY FROM THE "int" VERSION ABOVE.
   * DO NOT EDIT THIS ONE -- EDIT ABOVE AND REAPPLY QUERY/REPLACE.
   */
  static class TYPElong/*WORD*/ extends MyersBitParallelEditDistance {
    final long/*WORD*/ lastBitPosition;
    final long/*WORD*/[] map;

    TYPElong/*WORD*/(CharSequence s) {
      super(s);
      /* Precompute bitmaps for this pattern */
      map = PatternBitmap.map(s, idx, new long/*WORD*/[idx.size()]);
      /* Compute the bit that represents a change in the last row */
      lastBitPosition = (((long/*WORD*/) 1) << (m - 1));
    }

    @Override
    public int getDistance(CharSequence s, int k) {
      int len = s.length();

      /* Quick check based on length */
      if (((len - m) > k) || ((m - len) > k)) {
        return k + 1;
      }

      /* Map characters to their integer positions in the bitmap array */
      indices = idx.map(s, indices);

      /* Initially, vertical change is all positive (none negative) */
      long/*WORD*/ verticalPositive = -1;
      long/*WORD*/ verticalNegative = 0;
      int distance = m;

      /* We can only miss the "distance--" below this many times: */
      int maxMisses = k + len - m;
      if (maxMisses < 0) {
        maxMisses = Integer.MAX_VALUE;
      }

      for (int j = 0; j < len; j++) {
        /* Where is diagonal zero: matches, or prior VN; plus recursion */
        long/*WORD*/ diagonalZero = map[indices[j]] | verticalNegative;
        diagonalZero |= (((diagonalZero & verticalPositive) + verticalPositive)
                        ^ verticalPositive);

        /* Compute horizontal changes */
        long/*WORD*/ horizontalPositive = verticalNegative
                                         | ~(diagonalZero | verticalPositive);
        long/*WORD*/ horizontalNegative = diagonalZero & verticalPositive;

        /* Update final distance based on horizontal changes */
        if ((horizontalNegative & lastBitPosition) != 0) {
          distance--;
        } else if ((horizontalPositive & lastBitPosition) != 0) {
          distance++;
          if ((maxMisses -= 2) < 0) {
            break;
          }
        } else if (--maxMisses < 0) {
          break;
        }

        /* Shift Hs to next row, compute new Vs analagously to Hs above */
        horizontalPositive = (horizontalPositive << 1) | 1;
        verticalPositive = (horizontalNegative << 1)
                           | ~(diagonalZero | horizontalPositive);
        verticalNegative = diagonalZero & horizontalPositive;
      }
      return distance;
    }
  }

  /**
   * Chooses an appropriate implementation for a given pattern string.
   * @param s pattern string
   * @return distance calculator appropriate for the pattern
   */
  public static MyersBitParallelEditDistance getInstance(CharSequence s) {
    int m = s.length();
    return (m <= Integer.SIZE) ?
             ((m == 0) ? new Empty(s) : new TYPEint(s)) :
           (s.length() <= Long.SIZE) ?
             new TYPElong(s) :
             new Multi(s);
  }

  /**
   * Tests a computation manually.
   */
  public static void main(String[] args) {
    MyersBitParallelEditDistance b = getInstance(args[0]);
    int k = args.length > 2 ? Integer.parseInt(args[2]) : 0;
    System.out.println("Result: " + b.getDistance(args[1], k));
  }

  /**
   * Index mapping for pattern string.
   */
  final CharIndex idx;

  /**
   * Reusable array of indices for target strings.
   */
  int[] indices = new int[0];

  /**
   * Length of pattern.
   */
  final int m;

  /**
   * Constructs a distance calculator for a given string.
   */
  protected MyersBitParallelEditDistance(CharSequence s) {
    m = s.length();
    idx = CharIndex.getInstance(s);
  }

  @Override
  public GeneralEditDistance duplicate() {
    try {
      return (MyersBitParallelEditDistance) clone();
    } catch (CloneNotSupportedException x) { /*IMPOSSIBLE */
      throw new IllegalStateException("Cloneable object would not clone");
    }
  }

  /**
   * Computes distance from the pattern to a given string, bounded by
   * a limiting distance @see(GeneralEditDistance.getDistance(CharSequence,int)).
   */
  @Override
  public abstract int getDistance(CharSequence s, int k);

  @Override
  protected Object clone() throws CloneNotSupportedException {
    Object obj = super.clone();

    /* Re-initialize any non-thread-safe parts */
    ((MyersBitParallelEditDistance) obj).perThreadInit();

    return obj;
  }

  /**
   * Initializes items that cannot be shared among threads.
   */
  protected void perThreadInit() {
    indices = new int[0];
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy