
net.sf.okapi.steps.tokenization.RbbiTokenizer Maven / Gradle / Ivy
/*===========================================================================
Copyright (C) 2009 by the Okapi Framework contributors
-----------------------------------------------------------------------------
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
===========================================================================*/
package net.sf.okapi.steps.tokenization;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.StringUtil;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.exceptions.OkapiBadFilterInputException;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.TreeMap;
/**
 * {@link ITokenizer} implementation that splits text with an ICU
 * {@link RuleBasedBreakIterator} built from the {@code /rbbi.txt} rule file
 * found on the classpath.
 * <p>
 * Usage: call {@link #init(String, LocaleId)} with the text to tokenize, then
 * call {@link #next()} while {@link #hasNext()} is {@code true}. Note that
 * {@link #next()} may still return {@code null} once the end of the text is
 * reached, so callers must check the returned value.
 * <p>
 * An instance keeps mutable iteration state (current text and offsets) and
 * uses no synchronization.
 */
public class RbbiTokenizer implements ITokenizer {

    /** Classpath location of the break rules that are compiled on first use. */
    private static final String RULES_RESOURCE = "/rbbi.txt";

    // Cache for iterators reuse, keyed by the language passed to init()
    private final TreeMap<LocaleId, RuleBasedBreakIterator> iterators = new TreeMap<>();
    private RuleBasedBreakIterator iterator = null;
    private LocaleId language;
    private int start;
    // DONE means "nothing to iterate": true until init() has been given non-empty text
    private int end = BreakIterator.DONE;
    private String text;

    public RbbiTokenizer() {
    }

    /**
     * Tells whether {@link #next()} is worth calling.
     *
     * @return {@code false} if the tokenizer was not initialized, was
     *         initialized with empty text, or the underlying iterator has
     *         reported {@link BreakIterator#DONE}; {@code true} otherwise
     */
    @Override
    public boolean hasNext() {
        return end != BreakIterator.DONE;
    }

    /**
     * Advances to the next boundary and returns the token between the previous
     * boundary and the new one.
     *
     * @return the next token, or {@code null} if there is no further boundary
     *         or the span between boundaries is empty
     */
    @Override
    public net.sf.okapi.steps.tokenization.Token next() {
        // Nothing to iterate: not initialized, empty text, or already exhausted
        if (iterator == null || end == BreakIterator.DONE) {
            return null;
        }

        end = iterator.next();
        if (end == BreakIterator.DONE) {
            return null;
        }
        if (start >= end) {
            return null;
        }

        // get token id from RBBI
        int tokenId = iterator.getRuleStatus();
        String value = text.substring(start, end);
        String name = Tokens.getTokenName(tokenId);
        String description = Tokens.getTokenDescription(tokenId);

        net.sf.okapi.steps.tokenization.Token token = new net.sf.okapi.steps.tokenization.Token(tokenId, value, name,
                description, start, end);

        // Prepare for the next iteration
        start = end;
        return token;
    }

    /**
     * Prepares the tokenizer for a new text. Any state left over from a
     * previous text is discarded.
     *
     * @param text     the text to tokenize; if empty (as defined by
     *                 {@code Util.isEmpty}) the tokenizer yields no tokens
     * @param language the language of the text, used as the key of the
     *                 iterator cache
     * @throws OkapiBadFilterInputException if the break rules cannot be loaded
     *                                      or compiled
     */
    @Override
    public void init(String text, LocaleId language) {
        this.language = language;
        this.text = text;

        // Start from the "exhausted" state so that an early return below never
        // leaves offsets from a previous text behind
        start = 0;
        end = BreakIterator.DONE;

        if (Util.isEmpty(this.text)) {
            return;
        }

        if (iterators.containsKey(language)) {
            iterator = iterators.get(language);
        } else {
            iterator = loadIterator();
            iterators.put(language, iterator);
        }

        if (iterator == null) {
            return;
        }

        iterator.setText(this.text);

        // Sets the current iteration position to the beginning of the text
        start = iterator.first();
        end = start;
    }

    /**
     * Reads the rule file from the classpath, compiles it in memory and
     * creates a break iterator from the compiled form.
     *
     * @return a new break iterator built from the rules
     * @throws OkapiBadFilterInputException if the rules cannot be read or
     *                                      compiled
     */
    private RuleBasedBreakIterator loadIterator() {
        try {
            // Ideally we should compile the rule file outside and store it, but
            // this way you don't have to worry about ICU version compatibility
            // and running ICU4C genbrk tool
            String rules = StringUtil.readString(RbbiTokenizer.class.getResource(RULES_RESOURCE));
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            RuleBasedBreakIterator.compileRules(rules, out);
            ByteArrayInputStream is = new ByteArrayInputStream(out.toByteArray());
            return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
        } catch (IOException | NullPointerException e) {
            // NullPointerException is kept in the catch as in the original code;
            // presumably it covers a missing rule resource - TODO confirm
            throw new OkapiBadFilterInputException("Cannot load compiled break rules.", e);
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy