// morfologik.stemming.TrimPrefixAndSuffixEncoder Maven / Gradle / Ivy
//
// Go to download
package morfologik.stemming;

import java.nio.ByteBuffer;

/**
 * Encodes {@code dst} relative to {@code src} by trimming whatever
 * non-equal suffix and prefix {@code src} and {@code dst} have. The
 * output code is (bytes):
 *
 * <pre>
 * {P}{K}{suffix}
 * </pre>
 *
 * where ({@code P} - 'A') bytes should be trimmed from the start of
 * {@code src}, ({@code K} - 'A') bytes should be trimmed from the
 * end of {@code src} and then the {@code suffix} should be appended
 * to the resulting byte sequence.
 *
 * <p>
 * Examples:
 * </p>
 *
 * <pre>
 * src: abc
 * dst: abcd
 * encoded: AAd
 *
 * src: abc
 * dst: xyz
 * encoded: ADxyz
 * </pre>
 */
public class TrimPrefixAndSuffixEncoder implements ISequenceEncoder {
  /**
   * Largest value a single code byte can carry. It also acts as a marker:
   * when {@code decode} finds it in either code byte, no byte of the source
   * is kept and the output consists of the appended suffix only.
   */
  private static final int REMOVE_EVERYTHING = 255;

  /** Number of code bytes ({P} and {K}) written in front of the suffix. */
  private static final int HEADER_LENGTH = 2;

  /**
   * Computes the code that rewrites {@code source} into {@code target}:
   * two code bytes (leading and trailing trim counts, each offset by 'A')
   * followed by the bytes of {@code target} that must be appended.
   *
   * <p>
   * The backing array of {@code target} is read directly, starting at the
   * length of the retained run; an {@code assert} therefore requires an
   * array-backed buffer with position 0 and array offset 0.
   * </p>
   *
   * @param reuse
   *          Buffer passed to {@code BufferUtils.clearAndEnsureCapacity};
   *          the buffer returned by that call receives the output.
   * @param source
   *          The sequence the code will later be applied to.
   * @param target
   *          The sequence the code must reproduce.
   * @return The output buffer, flipped after writing.
   */
  public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) {
    // Scan every start offset in source and remember the longest run reported
    // by BufferUtils.sharedPrefixLength (presumably the number of equal bytes
    // between source-from-offset and the beginning of target -- confirm
    // against BufferUtils). Ties keep the earliest offset.
    int keptLength = 0;
    int keptStart = 0;
    for (int start = 0; start < source.remaining(); start++) {
      final int matched = BufferUtils.sharedPrefixLength(source, start, target, 0);
      if (matched <= keptLength) {
        continue;
      }

      // Accept the candidate only if both trim counts fit below the marker value.
      final int leftover = source.remaining() - (start + matched);
      if (start < REMOVE_EVERYTHING && leftover < REMOVE_EVERYTHING) {
        keptLength = matched;
        keptStart = start;
      }
    }

    // Bytes to drop in front of and behind the retained run.
    int dropLeading = keptStart;
    int dropTrailing = source.remaining() - (keptStart + keptLength);
    if (dropLeading >= REMOVE_EVERYTHING || dropTrailing >= REMOVE_EVERYTHING) {
      // Not representable in one byte each: keep nothing and emit the marker
      // in both slots, so the whole target travels as the suffix.
      keptLength = 0;
      dropLeading = REMOVE_EVERYTHING;
      dropTrailing = REMOVE_EVERYTHING;
    }

    final int appendedLength = target.remaining() - keptLength;
    reuse = BufferUtils.clearAndEnsureCapacity(reuse, HEADER_LENGTH + appendedLength);

    assert isZeroBasedArrayBuffer(target);

    reuse.put(toCodeByte(dropLeading))
         .put(toCodeByte(dropTrailing))
         .put(target.array(), keptLength, appendedLength);
    reuse.flip();

    return reuse;
  }

  /**
   * @return Always 2. This equals the number of code bytes {@code encode}
   *         writes before the suffix; NOTE(review): presumably that is what
   *         the interface method reports -- confirm against ISequenceEncoder.
   */
  @Override
  public int prefixBytes() {
    return 2;
  }

  /**
   * Applies a code produced by {@code encode} to {@code source}: drops the
   * coded number of leading and trailing bytes and appends everything that
   * follows the two code bytes in {@code encoded}.
   *
   * <p>
   * If either code byte decodes to {@code REMOVE_EVERYTHING}, all of
   * {@code source} is dropped. Both {@code source} and {@code encoded} are
   * read through their backing arrays; {@code assert}s require array-backed
   * buffers with position 0 and array offset 0.
   * </p>
   *
   * @param reuse
   *          Buffer passed to {@code BufferUtils.clearAndEnsureCapacity};
   *          the buffer returned by that call receives the output.
   * @param source
   *          The sequence to transform.
   * @param encoded
   *          Two code bytes followed by the bytes to append.
   * @return The output buffer, flipped after writing.
   */
  public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
    assert encoded.remaining() >= HEADER_LENGTH;

    final int headerAt = encoded.position();
    final int codedLeading = fromCodeByte(encoded.get(headerAt));
    final int codedTrailing = fromCodeByte(encoded.get(headerAt + 1));

    // The marker in either slot means "keep nothing of source".
    final boolean dropAll = codedLeading == REMOVE_EVERYTHING
        || codedTrailing == REMOVE_EVERYTHING;
    final int dropLeading = dropAll ? source.remaining() : codedLeading;
    final int dropTrailing = dropAll ? 0 : codedTrailing;

    assert isZeroBasedArrayBuffer(source);
    assert isZeroBasedArrayBuffer(encoded);

    final int keptLength = source.remaining() - (dropTrailing + dropLeading);
    final int appendedLength = encoded.remaining() - HEADER_LENGTH;
    reuse = BufferUtils.clearAndEnsureCapacity(reuse, keptLength + appendedLength);

    reuse.put(source.array(), dropLeading, keptLength)
         .put(encoded.array(), HEADER_LENGTH, appendedLength);
    reuse.flip();

    return reuse;
  }

  /** @return The simple (unqualified) name of this object's runtime class. */
  @Override
  public String toString() {
    final Class<?> type = getClass();
    return type.getSimpleName();
  }

  /** Shifts a trim count by 'A' and keeps only its low eight bits. */
  private static byte toCodeByte(int count) {
    return (byte) ('A' + count);
  }

  /** Inverse of {@link #toCodeByte(int)}; the result is in the range 0..255. */
  private static int fromCodeByte(byte code) {
    return (code - 'A') & 0xFF;
  }

  /**
   * Precondition used in assertions: the buffer exposes a backing array and
   * both its position and its array offset are zero, so array indices and
   * buffer indices coincide.
   */
  private static boolean isZeroBasedArrayBuffer(ByteBuffer buffer) {
    return buffer.hasArray()
        && buffer.position() == 0
        && buffer.arrayOffset() == 0;
  }
}