All Downloads are FREE. Search and download functionalities are using the official Maven repository.

morfologik.stemming.TrimPrefixAndSuffixEncoder Maven / Gradle / Ivy

package morfologik.stemming;

import java.nio.ByteBuffer;

/**
 * Encodes dst relative to src by trimming whatever
 * non-equal suffix and prefix src and dst have. The
 * output code is (bytes):
 * 
 * 
 * {P}{K}{suffix}
 * 
* * where (P - 'A') bytes should be trimmed from the start of * src, (K - 'A') bytes should be trimmed from the * end of src and then the suffix should be appended * to the resulting byte sequence. * *

* Examples: *

* *
 * src: abc
 * dst: abcd
 * encoded: AAd
 * 
 * src: abc
 * dst: xyz
 * encoded: ADxyz
 * 
*/ public class TrimPrefixAndSuffixEncoder implements ISequenceEncoder { /** * Maximum encodable single-byte code. */ private static final int REMOVE_EVERYTHING = 255; public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) { // Search for the maximum matching subsequence that can be encoded. int maxSubsequenceLength = 0; int maxSubsequenceIndex = 0; for (int i = 0; i < source.remaining(); i++) { // prefix at i => shared subsequence (infix) int sharedPrefix = BufferUtils.sharedPrefixLength(source, i, target, 0); // Only update maxSubsequenceLength if we will be able to encode it. if (sharedPrefix > maxSubsequenceLength && i < REMOVE_EVERYTHING && (source.remaining() - (i + sharedPrefix)) < REMOVE_EVERYTHING) { maxSubsequenceLength = sharedPrefix; maxSubsequenceIndex = i; } } // Determine how much to remove (and where) from src to get a prefix of dst. int truncatePrefixBytes = maxSubsequenceIndex; int truncateSuffixBytes = (source.remaining() - (maxSubsequenceIndex + maxSubsequenceLength)); if (truncatePrefixBytes >= REMOVE_EVERYTHING || truncateSuffixBytes >= REMOVE_EVERYTHING) { maxSubsequenceIndex = maxSubsequenceLength = 0; truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING; } final int len1 = target.remaining() - maxSubsequenceLength; reuse = BufferUtils.clearAndEnsureCapacity(reuse, 2 + len1); assert target.hasArray() && target.position() == 0 && target.arrayOffset() == 0; reuse.put((byte) ((truncatePrefixBytes + 'A') & 0xFF)); reuse.put((byte) ((truncateSuffixBytes + 'A') & 0xFF)); reuse.put(target.array(), maxSubsequenceLength, len1); reuse.flip(); return reuse; } @Override public int prefixBytes() { return 2; } public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) { assert encoded.remaining() >= 2; final int p = encoded.position(); int truncatePrefixBytes = (encoded.get(p) - 'A') & 0xFF; int truncateSuffixBytes = (encoded.get(p + 1) - 'A') & 0xFF; if (truncatePrefixBytes == REMOVE_EVERYTHING || truncateSuffixBytes == REMOVE_EVERYTHING) { truncatePrefixBytes = source.remaining(); truncateSuffixBytes = 0; } assert source.hasArray() && source.position() == 0 && source.arrayOffset() == 0; assert encoded.hasArray() && encoded.position() == 0 && encoded.arrayOffset() == 0; final int len1 = source.remaining() - (truncateSuffixBytes + truncatePrefixBytes); final int len2 = encoded.remaining() - 2; reuse = BufferUtils.clearAndEnsureCapacity(reuse, len1 + len2); reuse.put(source.array(), truncatePrefixBytes, len1); reuse.put(encoded.array(), 2, len2); reuse.flip(); return reuse; } @Override public String toString() { return getClass().getSimpleName(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy