// Source: org.apache.lucene.codecs.sep.SepPostingsReader (Apache Lucene, module: codecs)
package org.apache.lucene.codecs.sep;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** Concrete class that reads the current doc/freq/skip
* postings format.
*
* @lucene.experimental
*/
// TODO: -- should we switch "hasProx" higher up? and
// create two separate docs readers, one that also reads
// prox and one that doesn't?
public class SepPostingsReader extends PostingsReaderBase {
// One input per posting component ("sep" = separate files).  freqIn, posIn
// and payloadIn are null when no field in the segment stores that component
// (see the constructor).
final IntIndexInput freqIn;
final IntIndexInput docIn;
final IntIndexInput posIn;
final IndexInput payloadIn;
final IndexInput skipIn;
// Skip-list parameters; populated from the terms dict header in init().
int skipInterval;
int maxSkipLevels;
int skipMinimum;
/** Opens the per-component segment files.  freq and pos/payload files are
 *  only opened if some field in the segment stores those components.  On
 *  failure, already-opened inputs are closed before the exception
 *  propagates; close() (via IOUtils.close) tolerates null fields, so a
 *  partially-initialized instance is safe to clean up. */
public SepPostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext context, IntStreamFactory intFactory, String segmentSuffix) throws IOException {
boolean success = false;
try {
final String docFileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, SepPostingsWriter.DOC_EXTENSION);
docIn = intFactory.openInput(dir, docFileName, context);
skipIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, SepPostingsWriter.SKIP_EXTENSION), context);
if (fieldInfos.hasFreq()) {
freqIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, SepPostingsWriter.FREQ_EXTENSION), context);
} else {
freqIn = null;
}
if (fieldInfos.hasProx()) {
posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, SepPostingsWriter.POS_EXTENSION), context);
payloadIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, SepPostingsWriter.PAYLOAD_EXTENSION), context);
} else {
posIn = null;
payloadIn = null;
}
success = true;
} finally {
if (!success) {
// NOTE(review): close() throws IOException; if cleanup itself fails this
// masks the original exception from the failed open above -- confirm
// whether a closeWhileHandlingException-style helper should be used here.
close();
}
}
}
/** Verifies the codec header written by the matching SepPostingsWriter and
 *  reads the skip-list parameters it recorded. */
@Override
public void init(IndexInput termsIn) throws IOException {
// Make sure we are talking to the matching past writer
CodecUtil.checkHeader(termsIn, SepPostingsWriter.CODEC,
SepPostingsWriter.VERSION_START, SepPostingsWriter.VERSION_START);
skipInterval = termsIn.readInt();
maxSkipLevels = termsIn.readInt();
skipMinimum = termsIn.readInt();
}
/** Closes all open inputs; null components (absent freq/pos/payload files)
 *  are skipped by IOUtils.close. */
@Override
public void close() throws IOException {
IOUtils.close(freqIn, docIn, skipIn, posIn, payloadIn);
}
/** Per-term state: seek points into the doc/freq/pos files, plus file
 *  pointers for the payload and skip data.  freqIndex/posIndex are null for
 *  terms whose field omits freqs/positions. */
private static final class SepTermState extends BlockTermState {
// We store only the seek point to the docs file because
// the rest of the info (freqIndex, posIndex, etc.) is
// stored in the docs file:
IntIndexInput.Index docIndex;
IntIndexInput.Index posIndex;
IntIndexInput.Index freqIndex;
long payloadFP;
long skipFP;
@Override
public SepTermState clone() {
SepTermState other = new SepTermState();
other.copyFrom(this);
return other;
}
/** Deep-copies the other state's indices, reusing this instance's index
 *  objects when already allocated to avoid churn. */
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
SepTermState other = (SepTermState) _other;
if (docIndex == null) {
docIndex = other.docIndex.clone();
} else {
docIndex.copyFrom(other.docIndex);
}
if (other.freqIndex != null) {
if (freqIndex == null) {
freqIndex = other.freqIndex.clone();
} else {
freqIndex.copyFrom(other.freqIndex);
}
} else {
freqIndex = null;
}
if (other.posIndex != null) {
if (posIndex == null) {
posIndex = other.posIndex.clone();
} else {
posIndex.copyFrom(other.posIndex);
}
} else {
posIndex = null;
}
payloadFP = other.payloadFP;
skipFP = other.skipFP;
}
@Override
public String toString() {
return super.toString() + " docIndex=" + docIndex + " freqIndex=" + freqIndex + " posIndex=" + posIndex + " payloadFP=" + payloadFP + " skipFP=" + skipFP;
}
}
/** Allocates a fresh term state with index objects for whichever component
 *  files this reader has open. */
@Override
public BlockTermState newTermState() throws IOException {
final SepTermState state = new SepTermState();
state.docIndex = docIn.index();
if (freqIn != null) {
state.freqIndex = freqIn.index();
}
if (posIn != null) {
state.posIndex = posIn.index();
}
return state;
}
/** Decodes one term's metadata from the terms dict.  When {@code absolute}
 *  is false, values are deltas against the previously decoded term's state.
 *  Which components are read depends on the field's IndexOptions; the skip
 *  pointer is only present when docFreq >= skipMinimum. */
@Override
public void decodeTerm(long[] empty, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
throws IOException {
final SepTermState termState = (SepTermState) _termState;
termState.docIndex.read(in, absolute);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
termState.freqIndex.read(in, absolute);
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
//System.out.println("  freqIndex=" + termState.freqIndex);
termState.posIndex.read(in, absolute);
//System.out.println("  posIndex=" + termState.posIndex);
if (fieldInfo.hasPayloads()) {
if (absolute) {
termState.payloadFP = in.readVLong();
} else {
termState.payloadFP += in.readVLong();
}
//System.out.println("  payloadFP=" + termState.payloadFP);
}
}
}
if (termState.docFreq >= skipMinimum) {
//System.out.println("  readSkip @ " + in.getPosition());
if (absolute) {
termState.skipFP = in.readVLong();
} else {
termState.skipFP += in.readVLong();
}
//System.out.println("  skipFP=" + termState.skipFP);
} else if (absolute) {
// No skip data for this term; reset so later delta decodes start from 0.
termState.skipFP = 0;
}
}
/** Returns a docs enum over this term, reusing {@code reuse} only when it
 *  is a SepDocsEnum created by this same reader (same docIn). */
@Override
public DocsEnum docs(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
final SepTermState termState = (SepTermState) _termState;
SepDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof SepDocsEnum)) {
docsEnum = new SepDocsEnum();
} else {
docsEnum = (SepDocsEnum) reuse;
if (docsEnum.startDocIn != docIn) {
// If you are using ParellelReader, and pass in a
// reused DocsAndPositionsEnum, it could have come
// from another reader also using sep codec
docsEnum = new SepDocsEnum();
}
}
return docsEnum.init(fieldInfo, termState, liveDocs);
}
/** Returns a docs-and-positions enum; the field must index positions.
 *  Same reuse rules as {@link #docs}. */
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits liveDocs,
DocsAndPositionsEnum reuse, int flags)
throws IOException {
assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
final SepTermState termState = (SepTermState) _termState;
SepDocsAndPositionsEnum postingsEnum;
if (reuse == null || !(reuse instanceof SepDocsAndPositionsEnum)) {
postingsEnum = new SepDocsAndPositionsEnum();
} else {
postingsEnum = (SepDocsAndPositionsEnum) reuse;
if (postingsEnum.startDocIn != docIn) {
// If you are using ParellelReader, and pass in a
// reused DocsAndPositionsEnum, it could have come
// from another reader also using sep codec
postingsEnum = new SepDocsAndPositionsEnum();
}
}
return postingsEnum.init(fieldInfo, termState, liveDocs);
}
/** Doc/freq enum reading delta-coded doc ids (and freqs unless the field is
 *  DOCS_ONLY) from the sep files, with optional skip-list acceleration. */
class SepDocsEnum extends DocsEnum {
int docFreq;
int doc = -1;
// Running doc id: sum of the deltas read so far.
int accum;
// Number of docs consumed for the current term.
int count;
int freq;
long freqStart;
// TODO: -- should we do omitTF with 2 different enum classes?
private boolean omitTF;
private IndexOptions indexOptions;
private boolean storePayloads;
private Bits liveDocs;
private final IntIndexInput.Reader docReader;
private final IntIndexInput.Reader freqReader;
private long skipFP;
private final IntIndexInput.Index docIndex;
private final IntIndexInput.Index freqIndex;
private final IntIndexInput.Index posIndex;
// Identifies which reader created this enum, so reuse across readers is detected.
private final IntIndexInput startDocIn;
// TODO: -- should we do hasProx with 2 different enum classes?
boolean skipped;
SepSkipListReader skipper;
SepDocsEnum() throws IOException {
startDocIn = docIn;
docReader = docIn.reader();
docIndex = docIn.index();
if (freqIn != null) {
freqReader = freqIn.reader();
freqIndex = freqIn.index();
} else {
freqReader = null;
freqIndex = null;
}
if (posIn != null) {
posIndex = posIn.index();  // only init this so skipper can read it
} else {
posIndex = null;
}
}
/** Positions this enum at the start of the given term's postings and
 *  resets all iteration state.  Returns this. */
SepDocsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs) throws IOException {
this.liveDocs = liveDocs;
this.indexOptions = fieldInfo.getIndexOptions();
omitTF = indexOptions == IndexOptions.DOCS_ONLY;
storePayloads = fieldInfo.hasPayloads();
// TODO: can't we only do this if consumer
// skipped consuming the previous docs?
docIndex.copyFrom(termState.docIndex);
docIndex.seek(docReader);
if (!omitTF) {
freqIndex.copyFrom(termState.freqIndex);
freqIndex.seek(freqReader);
}
docFreq = termState.docFreq;
// NOTE: unused if docFreq < skipMinimum:
skipFP = termState.skipFP;
count = 0;
doc = -1;
accum = 0;
// Fields without freqs behave as if every freq is 1.
freq = 1;
skipped = false;
return this;
}
/** Advances to the next live doc, decoding doc deltas (and freqs when
 *  present) and filtering against liveDocs. */
@Override
public int nextDoc() throws IOException {
while(true) {
if (count == docFreq) {
return doc = NO_MORE_DOCS;
}
count++;
// Decode next doc
//System.out.println("decode docDelta:");
accum += docReader.next();
if (!omitTF) {
//System.out.println("decode freq:");
freq = freqReader.next();
}
if (liveDocs == null || liveDocs.get(accum)) {
break;
}
}
return (doc = accum);
}
@Override
public int freq() throws IOException {
return freq;
}
@Override
public int docID() {
return doc;
}
/** Advances to the first doc >= target, using the skip list when the
 *  target is far enough ahead and the term has skip data, then linear
 *  scanning the rest of the way. */
@Override
public int advance(int target) throws IOException {
if ((target - skipInterval) >= doc && docFreq >= skipMinimum) {
// There are enough docs in the posting to have
// skip data, and its not too close
if (skipper == null) {
// This DocsEnum has never done any skipping
skipper = new SepSkipListReader(skipIn.clone(),
freqIn,
docIn,
posIn,
maxSkipLevels, skipInterval);
}
if (!skipped) {
// We haven't yet skipped for this posting
skipper.init(skipFP,
docIndex,
freqIndex,
posIndex,
0,
docFreq,
storePayloads);
skipper.setIndexOptions(indexOptions);
skipped = true;
}
final int newCount = skipper.skipTo(target);
if (newCount > count) {
// Skipper did move
if (!omitTF) {
skipper.getFreqIndex().seek(freqReader);
}
skipper.getDocIndex().seek(docReader);
count = newCount;
doc = accum = skipper.getDoc();
}
}
// Now, linear scan for the rest:
do {
if (nextDoc() == NO_MORE_DOCS) {
return NO_MORE_DOCS;
}
} while (target > doc);
return doc;
}
@Override
public long cost() {
return docFreq;
}
}
/** Doc/freq/position enum.  Position and payload reads are deferred: after
 *  init or a skip, posSeekPending is set and the pos/payload streams are
 *  only seeked on the first nextPosition() call, since many consumers skip
 *  docs without ever asking for positions. */
class SepDocsAndPositionsEnum extends DocsAndPositionsEnum {
int docFreq;
int doc = -1;
// Running doc id: sum of the deltas read so far.
int accum;
// Number of docs consumed for the current term.
int count;
int freq;
long freqStart;
private boolean storePayloads;
private Bits liveDocs;
private final IntIndexInput.Reader docReader;
private final IntIndexInput.Reader freqReader;
private final IntIndexInput.Reader posReader;
private final IndexInput payloadIn;
private long skipFP;
private final IntIndexInput.Index docIndex;
private final IntIndexInput.Index freqIndex;
private final IntIndexInput.Index posIndex;
// Identifies which reader created this enum, so reuse across readers is detected.
private final IntIndexInput startDocIn;
private long payloadFP;
// Positions buffered in the pos stream but not yet consumed; grows by freq
// on each nextDoc and is drained lazily in nextPosition.
private int pendingPosCount;
private int position;
private int payloadLength;
// Payload bytes skipped over but not yet read; getPayload seeks past them.
private long pendingPayloadBytes;
private boolean skipped;
private SepSkipListReader skipper;
private boolean payloadPending;
private boolean posSeekPending;
SepDocsAndPositionsEnum() throws IOException {
startDocIn = docIn;
docReader = docIn.reader();
docIndex = docIn.index();
freqReader = freqIn.reader();
freqIndex = freqIn.index();
posReader = posIn.reader();
posIndex = posIn.index();
// Private clone so concurrent enums don't fight over the file pointer.
payloadIn = SepPostingsReader.this.payloadIn.clone();
}
/** Positions this enum at the start of the given term's postings.  The
 *  pos stream seek is deferred (posSeekPending) until positions are
 *  actually requested.  Returns this. */
SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs) throws IOException {
this.liveDocs = liveDocs;
storePayloads = fieldInfo.hasPayloads();
//System.out.println("Sep D&P init");
// TODO: can't we only do this if consumer
// skipped consuming the previous docs?
docIndex.copyFrom(termState.docIndex);
docIndex.seek(docReader);
//System.out.println("  docIndex=" + docIndex);
freqIndex.copyFrom(termState.freqIndex);
freqIndex.seek(freqReader);
//System.out.println("  freqIndex=" + freqIndex);
posIndex.copyFrom(termState.posIndex);
//System.out.println("  posIndex=" + posIndex);
posSeekPending = true;
payloadPending = false;
payloadFP = termState.payloadFP;
skipFP = termState.skipFP;
//System.out.println("  skipFP=" + skipFP);
docFreq = termState.docFreq;
count = 0;
doc = -1;
accum = 0;
pendingPosCount = 0;
pendingPayloadBytes = 0;
skipped = false;
return this;
}
/** Advances to the next live doc.  Freq is always read here (positions
 *  imply freqs); its positions are added to pendingPosCount so
 *  nextPosition can later skip unconsumed ones. */
@Override
public int nextDoc() throws IOException {
while(true) {
if (count == docFreq) {
return doc = NO_MORE_DOCS;
}
count++;
// TODO: maybe we should do the 1-bit trick for encoding
// freq=1 case?
// Decode next doc
//System.out.println("  sep d&p read doc");
accum += docReader.next();
//System.out.println("  sep d&p read freq");
freq = freqReader.next();
pendingPosCount += freq;
if (liveDocs == null || liveDocs.get(accum)) {
break;
}
}
position = 0;
return (doc = accum);
}
@Override
public int freq() throws IOException {
return freq;
}
@Override
public int docID() {
return doc;
}
/** Advances to the first doc >= target via the skip list when worthwhile,
 *  then linear scan.  After a skip, pos/payload state is reset and the
 *  pos seek is deferred until nextPosition(). */
@Override
public int advance(int target) throws IOException {
//System.out.println("SepD&P advance target=" + target + " vs current=" + doc + " this=" + this);
if ((target - skipInterval) >= doc && docFreq >= skipMinimum) {
// There are enough docs in the posting to have
// skip data, and its not too close
if (skipper == null) {
//System.out.println("  create skipper");
// This DocsEnum has never done any skipping
skipper = new SepSkipListReader(skipIn.clone(),
freqIn,
docIn,
posIn,
maxSkipLevels, skipInterval);
}
if (!skipped) {
//System.out.println("  init skip data skipFP=" + skipFP);
// We haven't yet skipped for this posting
skipper.init(skipFP,
docIndex,
freqIndex,
posIndex,
payloadFP,
docFreq,
storePayloads);
skipper.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
skipped = true;
}
final int newCount = skipper.skipTo(target);
//System.out.println("  skip newCount=" + newCount + " vs " + count);
if (newCount > count) {
// Skipper did move
skipper.getFreqIndex().seek(freqReader);
skipper.getDocIndex().seek(docReader);
//System.out.println("    doc seek'd to " + skipper.getDocIndex());
// NOTE: don't seek pos here; do it lazily
// instead. Eg a PhraseQuery may skip to many
// docs before finally asking for positions...
posIndex.copyFrom(skipper.getPosIndex());
posSeekPending = true;
count = newCount;
doc = accum = skipper.getDoc();
//System.out.println("    moved to doc=" + doc);
//payloadIn.seek(skipper.getPayloadPointer());
payloadFP = skipper.getPayloadPointer();
pendingPosCount = 0;
pendingPayloadBytes = 0;
payloadPending = false;
payloadLength = skipper.getPayloadLength();
//System.out.println("    move payloadLen=" + payloadLength);
}
}
// Now, linear scan for the rest:
do {
if (nextDoc() == NO_MORE_DOCS) {
//System.out.println("  advance nextDoc=END");
return NO_MORE_DOCS;
}
//System.out.println("  advance nextDoc=" + doc);
} while (target > doc);
//System.out.println("  return doc=" + doc);
return doc;
}
/** Returns the next position for the current doc.  First performs any
 *  deferred pos/payload seek, then drains positions belonging to docs
 *  that were iterated past without consuming their positions.  When
 *  payloads are stored, the low bit of each pos delta flags a payload
 *  length change (1-bit trick: actual delta is code >>> 1). */
@Override
public int nextPosition() throws IOException {
if (posSeekPending) {
posIndex.seek(posReader);
payloadIn.seek(payloadFP);
posSeekPending = false;
}
// scan over any docs that were iterated without their
// positions
while (pendingPosCount > freq) {
final int code = posReader.next();
if (storePayloads && (code & 1) != 0) {
// Payload length has changed
payloadLength = posReader.next();
assert payloadLength >= 0;
}
pendingPosCount--;
position = 0;
pendingPayloadBytes += payloadLength;
}
final int code = posReader.next();
if (storePayloads) {
if ((code & 1) != 0) {
// Payload length has changed
payloadLength = posReader.next();
assert payloadLength >= 0;
}
position += code >>> 1;
pendingPayloadBytes += payloadLength;
payloadPending = payloadLength > 0;
} else {
position += code;
}
pendingPosCount--;
assert pendingPosCount >= 0;
return position;
}
// This codec does not index offsets.
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
private BytesRef payload;
/** Returns the payload for the current position, or null if none is
 *  pending.  Skips over payload bytes of positions that were scanned
 *  past, then reads payloadLength bytes into a reused BytesRef.  A
 *  second call for the same position returns the cached payload. */
@Override
public BytesRef getPayload() throws IOException {
if (!payloadPending) {
return null;
}
if (pendingPayloadBytes == 0) {
// Already read for this position; return cached bytes.
return payload;
}
assert pendingPayloadBytes >= payloadLength;
if (pendingPayloadBytes > payloadLength) {
// Seek past payloads of positions that were skipped over.
payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength));
}
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[payloadLength];
} else if (payload.bytes.length < payloadLength) {
payload.grow(payloadLength);
}
payloadIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
pendingPayloadBytes = 0;
return payload;
}
@Override
public long cost() {
return docFreq;
}
}
// NOTE(review): returns 0 rather than an estimate of the enums/indices held
// by this reader -- confirm whether accounting is expected for this codec.
@Override
public long ramBytesUsed() {
return 0;
}
@Override
public void checkIntegrity() throws IOException {
// TODO: remove sep layout, its fallen behind on features...
}
}