smile.llm.tokenizer.SentencePiece Maven / Gradle / Ivy
The newest version!
/*
* Copyright (c) 2010-2024 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see .
*/
package smile.llm.tokenizer;
import java.io.IOException;
import java.nio.file.Paths;
import ai.djl.sentencepiece.*;
/**
* SentencePiece is an unsupervised text tokenizer by Google.
* SentencePiece implements BPE and unigram language model.
*
* @author Haifeng Li
*/
public class SentencePiece implements Tokenizer {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SentencePiece.class);
/** SentencePiece tokenizer. */
private final SpProcessor tokenizer;
/** Unknown token (), default id 0. */
private final int unk;
/** BOS (beginning of sequence) token (), default id 1. */
private final int bos;
/** EOS (end of sequence) token (), default id 2. */
private final int eos;
/**
* Constructor.
* @param path The SentencePiece model file path.
* @throws IOException if fail to load the model.
*/
public SentencePiece(String path) throws IOException {
logger.info("Loading SentencePiece model from {}", path);
SpTokenizer model = new SpTokenizer(Paths.get(path));
SpVocabulary voc = SpVocabulary.from(model);
tokenizer = model.getProcessor();
unk = (int) voc.getIndex("");
bos = (int) voc.getIndex("");
eos = (int) voc.getIndex("");
logger.info("UNK ID: {} | BOS ID: {} | EOS ID: {}", unk, bos, eos);
}
@Override
public int[] encode(String text) {
return encode(text, false, false);
}
@Override
public int[] encode(String text, boolean bos, boolean eos) {
int[] t = tokenizer.encode(text);
int length = t.length;
if (bos) ++length;
if (eos) ++length;
int[] tokens = length > t.length ? new int[length] : t;
if (bos) {
tokens[0] = this.bos;
System.arraycopy(t, 0, tokens, 1, t.length);
} else {
System.arraycopy(t, 0, tokens, 0, t.length);
}
if (eos) {
tokens[length - 1] = this.eos;
}
return tokens;
}
@Override
public String decode(int[] tokens) {
return tokenizer.decode(tokens);
}
@Override
public String[] tokenize(String text) {
return tokenizer.tokenize(text);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy