morfologik.stemming.TrimSuffixEncoder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of morfologik-stemming Show documentation
Show all versions of morfologik-stemming Show documentation
Morfologik provides high quality lemmatisation for the Polish language,
along with tools for building and using byte-based finite state automata.
package morfologik.stemming;
import java.nio.ByteBuffer;
/**
* Encodes dst
relative to src
by trimming whatever
* non-equal suffix src
has. The output code is (bytes):
*
*
* {K}{suffix}
*
*
* where (K
- 'A') bytes should be trimmed from the end of
* src
and then the suffix
should be appended to the
* resulting byte sequence.
*
*
* Examples:
*
*
*
* src: foo
* dst: foobar
* encoded: Abar
*
* src: foo
* dst: bar
* encoded: Dbar
*
*/
public class TrimSuffixEncoder implements ISequenceEncoder {
/**
* Maximum encodable single-byte code.
*/
private static final int REMOVE_EVERYTHING = 255;
public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) {
int sharedPrefix = BufferUtils.sharedPrefixLength(source, target);
int truncateBytes = source.remaining() - sharedPrefix;
if (truncateBytes >= REMOVE_EVERYTHING) {
truncateBytes = REMOVE_EVERYTHING;
sharedPrefix = 0;
}
reuse = BufferUtils.clearAndEnsureCapacity(reuse, 1 + target.remaining() - sharedPrefix);
assert target.hasArray() &&
target.position() == 0 &&
target.arrayOffset() == 0;
final byte suffixTrimCode = (byte) (truncateBytes + 'A');
reuse.put(suffixTrimCode)
.put(target.array(), sharedPrefix, target.remaining() - sharedPrefix)
.flip();
return reuse;
}
@Override
public int prefixBytes() {
return 1;
}
public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) {
assert encoded.remaining() >= 1;
int suffixTrimCode = encoded.get(encoded.position());
int truncateBytes = (suffixTrimCode - 'A') & 0xFF;
if (truncateBytes == REMOVE_EVERYTHING) {
truncateBytes = source.remaining();
}
final int len1 = source.remaining() - truncateBytes;
final int len2 = encoded.remaining() - 1;
reuse = BufferUtils.clearAndEnsureCapacity(reuse, len1 + len2);
assert source.hasArray() &&
source.position() == 0 &&
source.arrayOffset() == 0;
assert encoded.hasArray() &&
encoded.position() == 0 &&
encoded.arrayOffset() == 0;
reuse.put(source.array(), 0, len1)
.put(encoded.array(), 1, len2)
.flip();
return reuse;
}
@Override
public String toString() {
return getClass().getSimpleName();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy