All Downloads are FREE. Search and download functionalities are using the official Maven repository.

morfologik.stemming.TrimInfixAndSuffixEncoder Maven / Gradle / Ivy

There is a newer version: 2.1.9
Show newest version
package morfologik.stemming;

import java.nio.ByteBuffer;

/**
 * Encodes dst relative to src by trimming whatever
 * non-equal suffix and infix src and dst have. The
 * output code is (bytes):
 * 
 * 
 * {X}{L}{K}{suffix}
 * 
* * where src's infix at position (X - 'A') and of * length (L - 'A') should be removed, then (K - * 'A') bytes should be trimmed from the end and then the suffix * should be appended to the resulting byte sequence. * *

* Examples: *

* *
 * src: ayz
 * dst: abc
 * encoded: AACbc
 * 
 * src: aillent
 * dst: aller
 * encoded: BBCr
 * 
*/ public class TrimInfixAndSuffixEncoder implements ISequenceEncoder { /** * Maximum encodable single-byte code. */ private static final int REMOVE_EVERYTHING = 255; private ByteBuffer scratch = ByteBuffer.allocate(0); public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) { assert source.hasArray() && source.position() == 0 && source.arrayOffset() == 0; assert target.hasArray() && target.position() == 0 && target.arrayOffset() == 0; // Search for the infix that can we can encode and remove from src // to get a maximum-length prefix of dst. This could be done more efficiently // by running a smarter longest-common-subsequence algorithm and some pruning (?). // // For now, naive loop should do. // There can be only two positions for the infix to delete: // 1) we remove leading bytes, even if they are partially matching (but a longer match // exists somewhere later on). // 2) we leave max. matching prefix and remove non-matching bytes that follow. int maxInfixIndex = 0; int maxSubsequenceLength = BufferUtils.sharedPrefixLength(source, target); int maxInfixLength = 0; for (int i : new int[] { 0, maxSubsequenceLength }) { for (int j = 1; j <= source.remaining() - i; j++) { // Compute temporary src with the infix removed. // Concatenate in scratch space for simplicity. final int len2 = source.remaining() - (i + j); scratch = BufferUtils.ensureCapacity(scratch, i + len2); scratch.clear(); scratch.put(source.array(), 0, i); scratch.put(source.array(), i + j, len2); scratch.flip(); int sharedPrefix = BufferUtils.sharedPrefixLength(scratch, target); // Only update maxSubsequenceLength if we will be able to encode it. if (sharedPrefix > 0 && sharedPrefix > maxSubsequenceLength && i < REMOVE_EVERYTHING && j < REMOVE_EVERYTHING) { maxSubsequenceLength = sharedPrefix; maxInfixIndex = i; maxInfixLength = j; } } } int truncateSuffixBytes = source.remaining() - (maxInfixLength + maxSubsequenceLength); // Special case: if we're removing the suffix in the infix code, move it // to the suffix code instead. if (truncateSuffixBytes == 0 && maxInfixIndex + maxInfixLength == source.remaining()) { truncateSuffixBytes = maxInfixLength; maxInfixIndex = maxInfixLength = 0; } if (maxInfixIndex >= REMOVE_EVERYTHING || maxInfixLength >= REMOVE_EVERYTHING || truncateSuffixBytes >= REMOVE_EVERYTHING) { maxInfixIndex = maxSubsequenceLength = 0; maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING; } final int len1 = target.remaining() - maxSubsequenceLength; reuse = BufferUtils.ensureCapacity(reuse, 3 + len1); reuse.clear(); reuse.put((byte) ((maxInfixIndex + 'A') & 0xFF)); reuse.put((byte) ((maxInfixLength + 'A') & 0xFF)); reuse.put((byte) ((truncateSuffixBytes + 'A') & 0xFF)); reuse.put(target.array(), maxSubsequenceLength, len1); reuse.flip(); return reuse; } public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) { assert encoded.remaining() >= 3; final int p = encoded.position(); int infixIndex = (encoded.get(p) - 'A') & 0xFF; int infixLength = (encoded.get(p + 1) - 'A') & 0xFF; int truncateSuffixBytes = (encoded.get(p + 2) - 'A') & 0xFF; if (infixLength == REMOVE_EVERYTHING || truncateSuffixBytes == REMOVE_EVERYTHING) { infixIndex = 0; infixLength = source.remaining(); truncateSuffixBytes = 0; } final int len1 = source.remaining() - (infixIndex + infixLength + truncateSuffixBytes); final int len2 = encoded.remaining() - 3; reuse = BufferUtils.ensureCapacity(reuse, infixIndex + len1 + len2); reuse.clear(); assert encoded.hasArray() && encoded.position() == 0 && encoded.arrayOffset() == 0; assert source.hasArray() && source.position() == 0 && source.arrayOffset() == 0; reuse.put(source.array(), 0, infixIndex); reuse.put(source.array(), infixIndex + infixLength, len1); reuse.put(encoded.array(), 3, len2); reuse.flip(); return reuse; } @Override public String toString() { return getClass().getSimpleName(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy