org.apache.lucene.codecs.simpletext.SimpleTextDocValuesReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-codecs Show documentation
Show all versions of lucene-codecs Show documentation
Apache Lucene (module: codecs)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.function.IntFunction;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.END;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.FIELD;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.LENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXLENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MINVALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.NUMVALUES;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.ORDPATTERN;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.PATTERN;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.TYPE;
class SimpleTextDocValuesReader extends DocValuesProducer {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(SimpleTextDocValuesReader.class)
+ RamUsageEstimator.shallowSizeOfInstance(BytesRef.class);
static class OneField {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OneField.class);
long dataStartFilePointer;
String pattern;
String ordPattern;
int maxLength;
boolean fixedLength;
long minValue;
long numValues;
}
final int maxDoc;
final IndexInput data;
final BytesRefBuilder scratch = new BytesRefBuilder();
final Map fields = new HashMap<>();
public SimpleTextDocValuesReader(SegmentReadState state, String ext) throws IOException {
// System.out.println("dir=" + state.directory + " seg=" + state.segmentInfo.name + " file=" + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext));
data = state.directory.openInput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext), state.context);
maxDoc = state.segmentInfo.maxDoc();
while(true) {
readLine();
//System.out.println("READ field=" + scratch.utf8ToString());
if (scratch.get().equals(END)) {
break;
}
assert startsWith(FIELD) : scratch.get().utf8ToString();
String fieldName = stripPrefix(FIELD);
//System.out.println(" field=" + fieldName);
OneField field = new OneField();
fields.put(fieldName, field);
readLine();
assert startsWith(TYPE) : scratch.get().utf8ToString();
DocValuesType dvType = DocValuesType.valueOf(stripPrefix(TYPE));
assert dvType != DocValuesType.NONE;
if (dvType == DocValuesType.NUMERIC) {
readLine();
assert startsWith(MINVALUE): "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext;
field.minValue = Long.parseLong(stripPrefix(MINVALUE));
readLine();
assert startsWith(PATTERN);
field.pattern = stripPrefix(PATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(data.getFilePointer() + (1+field.pattern.length()+2) * maxDoc);
} else if (dvType == DocValuesType.BINARY) {
readLine();
assert startsWith(MAXLENGTH);
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
readLine();
assert startsWith(PATTERN);
field.pattern = stripPrefix(PATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength+2) * maxDoc);
} else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
readLine();
assert startsWith(NUMVALUES);
field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
readLine();
assert startsWith(MAXLENGTH);
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
readLine();
assert startsWith(PATTERN);
field.pattern = stripPrefix(PATTERN);
readLine();
assert startsWith(ORDPATTERN);
field.ordPattern = stripPrefix(ORDPATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * field.numValues + (1+field.ordPattern.length())*maxDoc);
} else {
throw new AssertionError();
}
}
// We should only be called from above if at least one
// field has DVs:
assert !fields.isEmpty();
}
@Override
public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException {
IntFunction values = getNumericNonIterator(fieldInfo);
if (values == null) {
return null;
} else {
DocValuesIterator docsWithField = getNumericDocsWithField(fieldInfo);
return new NumericDocValues() {
@Override
public int nextDoc() throws IOException {
return docsWithField.nextDoc();
}
@Override
public int docID() {
return docsWithField.docID();
}
@Override
public long cost() {
return docsWithField.cost();
}
@Override
public int advance(int target) throws IOException {
return docsWithField.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return docsWithField.advanceExact(target);
}
@Override
public long longValue() throws IOException {
return values.apply(docsWithField.docID());
}
};
}
}
IntFunction getNumericNonIterator(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
assert field != null;
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null: "field=" + fieldInfo.name + " fields=" + fields;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
decoder.setParseBigDecimal(true);
return new IntFunction() {
@Override
public Long apply(int docID) {
try {
//System.out.println(Thread.currentThread().getName() + ": get docID=" + docID + " in=" + in);
if (docID < 0 || docID >= maxDoc) {
throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
}
in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*docID);
SimpleTextUtil.readLine(in, scratch);
//System.out.println("parsing delta: " + scratch.utf8ToString());
BigDecimal bd;
try {
bd = (BigDecimal) decoder.parse(scratch.get().utf8ToString());
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse BigDecimal value", in, pe);
}
SimpleTextUtil.readLine(in, scratch); // read the line telling us if it's real or not
return BigInteger.valueOf(field.minValue).add(bd.toBigIntegerExact()).longValue();
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
};
}
private static abstract class DocValuesIterator extends DocIdSetIterator {
abstract boolean advanceExact(int target) throws IOException;
}
private DocValuesIterator getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
return new DocValuesIterator() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*i);
SimpleTextUtil.readLine(in, scratch); // data
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
if (scratch.byteAt(0) == (byte) 'T') {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
boolean advanceExact(int target) throws IOException {
this.doc = target;
in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*target);
SimpleTextUtil.readLine(in, scratch); // data
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
return scratch.byteAt(0) == (byte) 'T';
}
};
}
@Override
public synchronized BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
DocValuesIterator docsWithField = getBinaryDocsWithField(fieldInfo);
IntFunction values = new IntFunction() {
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public BytesRef apply(int docID) {
try {
if (docID < 0 || docID >= maxDoc) {
throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
}
in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*docID);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH);
int len;
try {
len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
};
return new BinaryDocValues() {
@Override
public int nextDoc() throws IOException {
return docsWithField.nextDoc();
}
@Override
public int docID() {
return docsWithField.docID();
}
@Override
public long cost() {
return docsWithField.cost();
}
@Override
public int advance(int target) throws IOException {
return docsWithField.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return docsWithField.advanceExact(target);
}
@Override
public BytesRef binaryValue() throws IOException {
return values.apply(docsWithField.docID());
}
};
}
private DocValuesIterator getBinaryDocsWithField(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
return new DocValuesIterator() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*i);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH);
int len;
try {
len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
// skip past bytes
byte bytes[] = new byte[len];
in.readBytes(bytes, 0, len);
SimpleTextUtil.readLine(in, scratch); // newline
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
if (scratch.byteAt(0) == (byte) 'T') {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
boolean advanceExact(int target) throws IOException {
this.doc = target;
in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*target);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH);
int len;
try {
len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
// skip past bytes
byte bytes[] = new byte[len];
in.readBytes(bytes, 0, len);
SimpleTextUtil.readLine(in, scratch); // newline
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
return scratch.byteAt(0) == (byte) 'T';
}
};
}
@Override
public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
final DecimalFormat ordDecoder = new DecimalFormat(field.ordPattern, new DecimalFormatSymbols(Locale.ROOT));
return new SortedDocValues() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
int ord;
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + i * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
try {
ord = (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue()-1;
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse ord", in, pe);
}
if (ord >= 0) {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
public boolean advanceExact(int target) throws IOException {
this.doc = target;
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + target * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
try {
ord = (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue()-1;
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse ord", in, pe);
}
return ord >= 0;
}
@Override
public int ordValue() {
return ord;
}
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public BytesRef lookupOrd(int ord) throws IOException {
if (ord < 0 || ord >= field.numValues) {
throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
}
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH): "got " + scratch.get().utf8ToString() + " in=" + in;
int len;
try {
len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
}
@Override
public int getValueCount() {
return (int)field.numValues;
}
};
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final BinaryDocValues binary = getBinary(field);
return new SortedNumericDocValues() {
@Override
public int nextDoc() throws IOException {
int doc = binary.nextDoc();
setCurrentDoc();
return doc;
}
@Override
public int docID() {
return binary.docID();
}
@Override
public long cost() {
return binary.cost();
}
@Override
public int advance(int target) throws IOException {
int doc = binary.advance(target);
setCurrentDoc();
return doc;
}
@Override
public boolean advanceExact(int target) throws IOException {
if (binary.advanceExact(target)) {
setCurrentDoc();
return true;
}
return false;
}
long values[];
int index;
private void setCurrentDoc() throws IOException {
if (docID() == NO_MORE_DOCS) {
return;
}
String csv = binary.binaryValue().utf8ToString();
if (csv.length() == 0) {
values = new long[0];
} else {
String s[] = csv.split(",");
values = new long[s.length];
for (int i = 0; i < values.length; i++) {
values[i] = Long.parseLong(s[i]);
}
}
index = 0;
}
@Override
public long nextValue() throws IOException {
return values[index++];
}
@Override
public int docValueCount() {
return values.length;
}
};
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
return new SortedSetDocValues() {
String[] currentOrds = new String[0];
int currentIndex = 0;
final BytesRefBuilder term = new BytesRefBuilder();
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + i * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
String ordList = scratch.get().utf8ToString().trim();
if (ordList.isEmpty() == false) {
currentOrds = ordList.split(",");
currentIndex = 0;
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
public boolean advanceExact(int target) throws IOException {
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + target * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
String ordList = scratch.get().utf8ToString().trim();
doc = target;
if (ordList.isEmpty() == false) {
currentOrds = ordList.split(",");
currentIndex = 0;
return true;
}
return false;
}
@Override
public long nextOrd() throws IOException {
if (currentIndex == currentOrds.length) {
return NO_MORE_ORDS;
} else {
return Long.parseLong(currentOrds[currentIndex++]);
}
}
@Override
public BytesRef lookupOrd(long ord) throws IOException {
if (ord < 0 || ord >= field.numValues) {
throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
}
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH): "got " + scratch.get().utf8ToString() + " in=" + in;
int len;
try {
len = decoder.parse(new String(scratch.bytes(), LENGTH.length, scratch.length() - LENGTH.length, StandardCharsets.UTF_8)).intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
}
@Override
public long getValueCount() {
return field.numValues;
}
};
}
@Override
public void close() throws IOException {
data.close();
}
/** Used only in ctor: */
private void readLine() throws IOException {
SimpleTextUtil.readLine(data, scratch);
//System.out.println("line: " + scratch.utf8ToString());
}
/** Used only in ctor: */
private boolean startsWith(BytesRef prefix) {
return StringHelper.startsWith(scratch.get(), prefix);
}
/** Used only in ctor: */
private String stripPrefix(BytesRef prefix) {
return new String(scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(scratch.bytes())
+ fields.size() * (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 2L + OneField.BASE_RAM_BYTES_USED);
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ")";
}
@Override
public void checkIntegrity() throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
IndexInput clone = data.clone();
clone.seek(0);
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included in SimpleTextUtil.CHECKSUM):
long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);
if (input.getFilePointer() >= footerStartPos) {
// Make sure we landed at precisely the right location:
if (input.getFilePointer() != footerStartPos) {
throw new CorruptIndexException("SimpleText failure: footer does not start at expected position current=" + input.getFilePointer() + " vs expected=" + footerStartPos, input);
}
SimpleTextUtil.checkFooter(input);
break;
}
}
}
}