org.apache.lucene.index.FreqProxFields Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/** Implements limited (iterators only, no stats) {@link
* Fields} interface over the in-RAM buffered
* fields/terms/postings, to flush postings through the
* PostingsFormat. */
class FreqProxFields extends Fields {
final Map fields = new LinkedHashMap<>();
public FreqProxFields(List fieldList) {
// NOTE: fields are already sorted by field name
for(FreqProxTermsWriterPerField field : fieldList) {
fields.put(field.fieldInfo.name, field);
}
}
public Iterator iterator() {
return fields.keySet().iterator();
}
@Override
public Terms terms(String field) throws IOException {
FreqProxTermsWriterPerField perField = fields.get(field);
return perField == null ? null : new FreqProxTerms(perField);
}
@Override
public int size() {
//return fields.size();
throw new UnsupportedOperationException();
}
private static class FreqProxTerms extends Terms {
final FreqProxTermsWriterPerField terms;
public FreqProxTerms(FreqProxTermsWriterPerField terms) {
this.terms = terms;
}
@Override
public TermsEnum iterator() {
FreqProxTermsEnum termsEnum = new FreqProxTermsEnum(terms);
termsEnum.reset();
return termsEnum;
}
@Override
public long size() {
//return terms.termsHashPerField.bytesHash.size();
throw new UnsupportedOperationException();
}
@Override
public long getSumTotalTermFreq() {
//return terms.sumTotalTermFreq;
throw new UnsupportedOperationException();
}
@Override
public long getSumDocFreq() {
//return terms.sumDocFreq;
throw new UnsupportedOperationException();
}
@Override
public int getDocCount() {
//return terms.docCount;
throw new UnsupportedOperationException();
}
@Override
public boolean hasFreqs() {
return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
@Override
public boolean hasOffsets() {
// NOTE: the in-memory buffer may have indexed offsets
// because that's what FieldInfo said when we started,
// but during indexing this may have been downgraded:
return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
// NOTE: the in-memory buffer may have indexed positions
// because that's what FieldInfo said when we started,
// but during indexing this may have been downgraded:
return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return terms.sawPayloads;
}
}
private static class FreqProxTermsEnum extends TermsEnum {
final FreqProxTermsWriterPerField terms;
final int[] sortedTermIDs;
final FreqProxPostingsArray postingsArray;
final BytesRef scratch = new BytesRef();
final int numTerms;
int ord;
public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) {
this.terms = terms;
this.numTerms = terms.bytesHash.size();
sortedTermIDs = terms.sortedTermIDs;
assert sortedTermIDs != null;
postingsArray = (FreqProxPostingsArray) terms.postingsArray;
}
public void reset() {
ord = -1;
}
public SeekStatus seekCeil(BytesRef text) {
// TODO: we could instead keep the BytesRefHash
// intact so this is a hash lookup
// binary search:
int lo = 0;
int hi = numTerms - 1;
while (hi >= lo) {
int mid = (lo + hi) >>> 1;
int textStart = postingsArray.textStarts[sortedTermIDs[mid]];
terms.bytePool.setBytesRef(scratch, textStart);
int cmp = scratch.compareTo(text);
if (cmp < 0) {
lo = mid + 1;
} else if (cmp > 0) {
hi = mid - 1;
} else {
// found:
ord = mid;
assert term().compareTo(text) == 0;
return SeekStatus.FOUND;
}
}
// not found:
ord = lo;
if (ord >= numTerms) {
return SeekStatus.END;
} else {
int textStart = postingsArray.textStarts[sortedTermIDs[ord]];
terms.bytePool.setBytesRef(scratch, textStart);
assert term().compareTo(text) > 0;
return SeekStatus.NOT_FOUND;
}
}
public void seekExact(long ord) {
this.ord = (int) ord;
int textStart = postingsArray.textStarts[sortedTermIDs[this.ord]];
terms.bytePool.setBytesRef(scratch, textStart);
}
@Override
public BytesRef next() {
ord++;
if (ord >= numTerms) {
return null;
} else {
int textStart = postingsArray.textStarts[sortedTermIDs[ord]];
terms.bytePool.setBytesRef(scratch, textStart);
return scratch;
}
}
@Override
public BytesRef term() {
return scratch;
}
@Override
public long ord() {
return ord;
}
@Override
public int docFreq() {
// We do not store this per-term, and we cannot
// implement this at merge time w/o an added pass
// through the postings:
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() {
// We do not store this per-term, and we cannot
// implement this at merge time w/o an added pass
// through the postings:
throw new UnsupportedOperationException();
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) {
if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) {
FreqProxPostingsEnum posEnum;
if (!terms.hasProx) {
// Caller wants positions but we didn't index them;
// don't lie:
throw new IllegalArgumentException("did not index positions");
}
if (!terms.hasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS)) {
// Caller wants offsets but we didn't index them;
// don't lie:
throw new IllegalArgumentException("did not index offsets");
}
if (reuse instanceof FreqProxPostingsEnum) {
posEnum = (FreqProxPostingsEnum) reuse;
if (posEnum.postingsArray != postingsArray) {
posEnum = new FreqProxPostingsEnum(terms, postingsArray);
}
} else {
posEnum = new FreqProxPostingsEnum(terms, postingsArray);
}
posEnum.reset(sortedTermIDs[ord]);
return posEnum;
}
FreqProxDocsEnum docsEnum;
if (!terms.hasFreq && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS)) {
// Caller wants freqs but we didn't index them;
// don't lie:
throw new IllegalArgumentException("did not index freq");
}
if (reuse instanceof FreqProxDocsEnum) {
docsEnum = (FreqProxDocsEnum) reuse;
if (docsEnum.postingsArray != postingsArray) {
docsEnum = new FreqProxDocsEnum(terms, postingsArray);
}
} else {
docsEnum = new FreqProxDocsEnum(terms, postingsArray);
}
docsEnum.reset(sortedTermIDs[ord]);
return docsEnum;
}
/**
* Expert: Returns the TermsEnums internal state to position the TermsEnum
* without re-seeking the term dictionary.
*
* NOTE: A seek by {@link TermState} might not capture the
* {@link AttributeSource}'s state. Callers must maintain the
* {@link AttributeSource} states separately
*
* @see TermState
* @see #seekExact(BytesRef, TermState)
*/
public TermState termState() throws IOException {
return new TermState() {
@Override
public void copyFrom(TermState other) {
throw new UnsupportedOperationException();
}
};
}
}
private static class FreqProxDocsEnum extends PostingsEnum {
final FreqProxTermsWriterPerField terms;
final FreqProxPostingsArray postingsArray;
final ByteSliceReader reader = new ByteSliceReader();
final boolean readTermFreq;
int docID = -1;
int freq;
boolean ended;
int termID;
public FreqProxDocsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) {
this.terms = terms;
this.postingsArray = postingsArray;
this.readTermFreq = terms.hasFreq;
}
public void reset(int termID) {
this.termID = termID;
terms.initReader(reader, termID, 0);
ended = false;
docID = -1;
}
@Override
public int docID() {
return docID;
}
@Override
public int freq() {
// Don't lie here ... don't want codecs writings lots
// of wasted 1s into the index:
if (!readTermFreq) {
throw new IllegalStateException("freq was not indexed");
} else {
return freq;
}
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int nextDoc() throws IOException {
if (docID == -1) {
docID = 0;
}
if (reader.eof()) {
if (ended) {
return NO_MORE_DOCS;
} else {
ended = true;
docID = postingsArray.lastDocIDs[termID];
if (readTermFreq) {
freq = postingsArray.termFreqs[termID];
}
}
} else {
int code = reader.readVInt();
if (!readTermFreq) {
docID += code;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
freq = 1;
} else {
freq = reader.readVInt();
}
}
assert docID != postingsArray.lastDocIDs[termID];
}
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
throw new UnsupportedOperationException();
}
}
private static class FreqProxPostingsEnum extends PostingsEnum {
final FreqProxTermsWriterPerField terms;
final FreqProxPostingsArray postingsArray;
final ByteSliceReader reader = new ByteSliceReader();
final ByteSliceReader posReader = new ByteSliceReader();
final boolean readOffsets;
int docID = -1;
int freq;
int pos;
int startOffset;
int endOffset;
int posLeft;
int termID;
boolean ended;
boolean hasPayload;
BytesRefBuilder payload = new BytesRefBuilder();
public FreqProxPostingsEnum(FreqProxTermsWriterPerField terms, FreqProxPostingsArray postingsArray) {
this.terms = terms;
this.postingsArray = postingsArray;
this.readOffsets = terms.hasOffsets;
assert terms.hasProx;
assert terms.hasFreq;
}
public void reset(int termID) {
this.termID = termID;
terms.initReader(reader, termID, 0);
terms.initReader(posReader, termID, 1);
ended = false;
docID = -1;
posLeft = 0;
}
@Override
public int docID() {
return docID;
}
@Override
public int freq() {
return freq;
}
@Override
public int nextDoc() throws IOException {
if (docID == -1) {
docID = 0;
}
while (posLeft != 0) {
nextPosition();
}
if (reader.eof()) {
if (ended) {
return NO_MORE_DOCS;
} else {
ended = true;
docID = postingsArray.lastDocIDs[termID];
freq = postingsArray.termFreqs[termID];
}
} else {
int code = reader.readVInt();
docID += code >>> 1;
if ((code & 1) != 0) {
freq = 1;
} else {
freq = reader.readVInt();
}
assert docID != postingsArray.lastDocIDs[termID];
}
posLeft = freq;
pos = 0;
startOffset = 0;
return docID;
}
@Override
public int advance(int target) {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
throw new UnsupportedOperationException();
}
@Override
public int nextPosition() throws IOException {
assert posLeft > 0;
posLeft--;
int code = posReader.readVInt();
pos += code >>> 1;
if ((code & 1) != 0) {
hasPayload = true;
// has a payload
payload.setLength(posReader.readVInt());
payload.grow(payload.length());
posReader.readBytes(payload.bytes(), 0, payload.length());
} else {
hasPayload = false;
}
if (readOffsets) {
startOffset += posReader.readVInt();
endOffset = startOffset + posReader.readVInt();
}
return pos;
}
@Override
public int startOffset() {
if (!readOffsets) {
throw new IllegalStateException("offsets were not indexed");
}
return startOffset;
}
@Override
public int endOffset() {
if (!readOffsets) {
throw new IllegalStateException("offsets were not indexed");
}
return endOffset;
}
@Override
public BytesRef getPayload() {
if (hasPayload) {
return payload.get();
} else {
return null;
}
}
}
}