org.apache.lucene.benchmark.byTask.tasks.ReadTokensTask Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-benchmark Show documentation
Show all versions of lucene-benchmark Show documentation
Apache Lucene (module: benchmark)
package org.apache.lucene.benchmark.byTask.tasks;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.index.IndexableField;
/**
* Simple task to test performance of tokenizers. It just
* creates a token stream for each field of the document and
* read all tokens out of that stream.
*/
public class ReadTokensTask extends PerfTask {
public ReadTokensTask(PerfRunData runData) {
super(runData);
}
private int totalTokenCount = 0;
// volatile data passed between setup(), doLogic(), tearDown().
private Document doc = null;
@Override
public void setup() throws Exception {
super.setup();
DocMaker docMaker = getRunData().getDocMaker();
doc = docMaker.makeDocument();
}
@Override
protected String getLogMessage(int recsCount) {
return "read " + recsCount + " docs; " + totalTokenCount + " tokens";
}
@Override
public void tearDown() throws Exception {
doc = null;
super.tearDown();
}
@Override
public int doLogic() throws Exception {
List fields = doc.getFields();
Analyzer analyzer = getRunData().getAnalyzer();
int tokenCount = 0;
for(final IndexableField field : fields) {
if (!field.fieldType().tokenized() ||
field instanceof IntField ||
field instanceof LongField ||
field instanceof FloatField ||
field instanceof DoubleField) {
continue;
}
final TokenStream stream = field.tokenStream(analyzer, null);
// reset the TokenStream to the first token
stream.reset();
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
while(stream.incrementToken()) {
termAtt.fillBytesRef();
tokenCount++;
}
stream.end();
stream.close();
}
totalTokenCount += tokenCount;
return tokenCount;
}
/* Simple StringReader that can be reset to a new string;
* we use this when tokenizing the string value from a
* Field. */
ReusableStringReader stringReader = new ReusableStringReader();
private final static class ReusableStringReader extends Reader {
int upto;
int left;
String s;
void init(String s) {
this.s = s;
left = s.length();
this.upto = 0;
}
@Override
public int read(char[] c) {
return read(c, 0, c.length);
}
@Override
public int read(char[] c, int off, int len) {
if (left > len) {
s.getChars(upto, upto+len, c, off);
upto += len;
left -= len;
return len;
} else if (0 == left) {
return -1;
} else {
s.getChars(upto, upto+left, c, off);
int r = left;
left = 0;
upto = s.length();
return r;
}
}
@Override
public void close() {}
}
}