
net.sf.okapi.steps.tokenization.engine.LookUpLexer Maven / Gradle / Ivy
package net.sf.okapi.steps.tokenization.engine;
import java.util.LinkedHashMap;
import java.util.List;
import net.sf.okapi.common.ListUtil;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.steps.tokenization.common.AbstractLexer;
import net.sf.okapi.steps.tokenization.common.InputTokenAnnotation;
import net.sf.okapi.steps.tokenization.common.Lexem;
import net.sf.okapi.steps.tokenization.common.Lexems;
import net.sf.okapi.steps.tokenization.common.LexerRule;
import net.sf.okapi.steps.tokenization.common.LexerRules;
import net.sf.okapi.steps.tokenization.common.Token;
import net.sf.okapi.steps.tokenization.tokens.Tokens;
public class LookUpLexer extends AbstractLexer {
private LexerRules rules;
private LinkedHashMap> dictionaries;
@Override
protected boolean lexer_hasNext() {
return false;
}
@Override
protected void lexer_init() {
rules = getRules();
dictionaries = new LinkedHashMap>();
for (LexerRule rule : rules) {
if (!checkRule(rule)) continue;
String dictionaryLocation = rule.getPattern();
List dictionary = ListUtil.loadList(this.getClass(), dictionaryLocation);
dictionaries.put(rule, dictionary);
}
}
@Override
protected Lexem lexer_next() {
return null;
}
@Override
protected void lexer_open(String text, LocaleId language, Tokens tokens) {
}
public Lexems process(String text, LocaleId language, Tokens tokens) {
Lexems lexems = new Lexems();
for (LexerRule rule : rules) {
if (!checkRule(rule, language)) continue;
List inTokenIDs = rule.getInTokenIDs();
List dictionary = dictionaries.get(rule);
if (dictionary == null) continue;
for (Token token : tokens)
if (inTokenIDs.contains(token.getTokenId())) {
if (dictionary.contains(token.getValue())) {
Lexem lexem = new Lexem(rule.getLexemId(), token.getValue(), token.getRange());
lexem.setAnnotation(new InputTokenAnnotation(token));
lexems.add(lexem);
if (!rule.getKeepInput())
token.delete(); // Remove replaced token
}
}
}
return lexems;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy