Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.lucene.codecs.blocktreeords.OrdsSegmentTermsEnum Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktreeords;
//import java.io.*;
//import java.nio.charset.StandardCharsets;
import java.io.IOException;
import java.io.PrintStream;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
/** Iterates through terms in this field. */
public final class OrdsSegmentTermsEnum extends TermsEnum {
// Lazy init:
IndexInput in;
// static boolean DEBUG = true;
private OrdsSegmentTermsEnumFrame[] stack;
private final OrdsSegmentTermsEnumFrame staticFrame;
OrdsSegmentTermsEnumFrame currentFrame;
boolean termExists;
final OrdsFieldReader fr;
private int targetBeforeCurrentLength;
private final ByteArrayDataInput scratchReader = new ByteArrayDataInput();
// What prefix of the current term was present in the index:
private int validIndexPrefix;
// assert only:
private boolean eof;
final BytesRefBuilder term = new BytesRefBuilder();
private final FST.BytesReader fstReader;
@SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc[] arcs =
new FST.Arc[1];
boolean positioned;
OrdsSegmentTermsEnum(OrdsFieldReader fr) throws IOException {
this.fr = fr;
//if (DEBUG) System.out.println("BTTR.init seg=" + segment);
stack = new OrdsSegmentTermsEnumFrame[0];
// Used to hold seek by TermState, or cached seek
staticFrame = new OrdsSegmentTermsEnumFrame(this, -1);
if (fr.index == null) {
fstReader = null;
} else {
fstReader = fr.index.getBytesReader();
}
// Init w/ root block; don't use index since it may
// not (and need not) have been loaded
for(int arcIdx=0;arcIdx();
}
currentFrame = staticFrame;
final FST.Arc arc;
if (fr.index != null) {
arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output in the index!
assert arc.isFinal();
} else {
arc = null;
}
//currentFrame = pushFrame(arc, rootCode, 0);
//currentFrame.loadBlock();
validIndexPrefix = 0;
// if (DEBUG) {
// System.out.println("init frame state " + currentFrame.ord);
// printSeekState();
// }
//System.out.println();
// computeBlockStats().print(System.out);
}
// Not private to avoid synthetic access$NNN methods
void initIndexInput() {
if (this.in == null) {
this.in = fr.parent.in.clone();
}
}
private OrdsSegmentTermsEnumFrame getFrame(int ord) throws IOException {
if (ord >= stack.length) {
final OrdsSegmentTermsEnumFrame[] next = new OrdsSegmentTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(stack, 0, next, 0, stack.length);
for(int stackOrd=stack.length;stackOrd getArc(int ord) {
if (ord >= arcs.length) {
@SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc[] next =
new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(arcs, 0, next, 0, arcs.length);
for(int arcOrd=arcs.length;arcOrd();
}
arcs = next;
}
return arcs[ord];
}
// Pushes a frame we seek'd to
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc arc, Output frameData, int length) throws IOException {
scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length);
final long code = scratchReader.readVLong();
final long fpSeek = code >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
// System.out.println(" fpSeek=" + fpSeek);
final OrdsSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
f.hasTerms = (code & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
f.hasTermsOrig = f.hasTerms;
f.isFloor = (code & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
// Must setFloorData before pushFrame in case pushFrame tries to rewind:
if (f.isFloor) {
f.termOrdOrig = frameData.startOrd;
f.setFloorData(scratchReader, frameData.bytes);
}
pushFrame(arc, fpSeek, length, frameData.startOrd);
return f;
}
// Pushes next'd frame or seek'd frame; we later
// lazy-load the frame only when needed
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length, long termOrd) throws IOException {
final OrdsSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
f.arc = arc;
// System.out.println("pushFrame termOrd= " + termOrd + " fpOrig=" + f.fpOrig + " fp=" + fp + " nextEnt=" + f.nextEnt);
if (f.fpOrig == fp && f.nextEnt != -1) {
//if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix);
if (f.prefix > targetBeforeCurrentLength) {
// System.out.println(" do rewind!");
f.rewind();
} else {
// if (DEBUG) {
// System.out.println(" skip rewind!");
// }
}
assert length == f.prefix;
assert termOrd == f.termOrdOrig;
} else {
f.nextEnt = -1;
f.prefix = length;
f.state.termBlockOrd = 0;
f.termOrdOrig = termOrd;
// System.out.println("set termOrdOrig=" + termOrd);
f.termOrd = termOrd;
f.fpOrig = f.fp = fp;
f.lastSubFP = -1;
// if (DEBUG) {
// final int sav = term.length;
// term.length = length;
// System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term));
// term.length = sav;
// }
}
return f;
}
// asserts only
private boolean clearEOF() {
eof = false;
return true;
}
// asserts only
private boolean setEOF() {
eof = true;
return true;
}
// for debugging
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
@Override
public boolean seekExact(final BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
term.grow(1+target.length);
assert clearEOF();
/*
if (DEBUG) {
System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + fr.fieldInfo.name + ":" + brToString(target) + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
printSeekState(System.out);
}
*/
FST.Arc arc;
int targetUpto;
Output output;
targetBeforeCurrentLength = currentFrame.ord;
if (positioned && currentFrame != staticFrame) {
// We are already seek'd; find the common
// prefix of new seek term vs current term and
// re-use the corresponding seek state. For
// example, if app first seeks to foobar, then
// seeks to foobaz, we can re-use the seek state
// for the first 5 bytes.
// if (DEBUG) {
// System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
// }
arc = arcs[0];
assert arc.isFinal();
output = arc.output;
targetUpto = 0;
OrdsSegmentTermsEnumFrame lastFrame = stack[0];
assert validIndexPrefix <= term.length();
final int targetLimit = Math.min(target.length, validIndexPrefix);
int cmp = 0;
// TODO: reverse vLong byte order for better FST
// prefix output sharing
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
// if (DEBUG) {
// System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
// }
if (cmp != 0) {
break;
}
arc = arcs[1+targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1+lastFrame.ord];
}
targetUpto++;
}
if (cmp == 0) {
final int targetUptoMid = targetUpto;
// Second compare the rest of the term, but
// don't save arc/output/frame; we only do this
// to find out if the target term is before,
// equal or after the current term
final int targetLimit2 = Math.min(target.length, term.length());
while (targetUpto < targetLimit2) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
// if (DEBUG) {
// System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
// }
if (cmp != 0) {
break;
}
targetUpto++;
}
if (cmp == 0) {
cmp = term.length() - target.length;
}
targetUpto = targetUptoMid;
}
if (cmp < 0) {
// Common case: target term is after current
// term, ie, app is seeking multiple terms
// in sorted order
// if (DEBUG) {
// System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); frame.ord=" + lastFrame.ord);
// }
currentFrame = lastFrame;
} else if (cmp > 0) {
// Uncommon case: target term
// is before current term; this means we can
// keep the currentFrame but we must rewind it
// (so we scan from the start)
targetBeforeCurrentLength = lastFrame.ord;
// if (DEBUG) {
// System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
// }
currentFrame = lastFrame;
currentFrame.rewind();
} else {
// Target is exactly the same as current term
assert term.length() == target.length;
if (termExists) {
// if (DEBUG) {
// System.out.println(" target is same as current; return true");
// }
return true;
} else {
// if (DEBUG) {
// System.out.println(" target is same as current but term doesn't exist");
// }
}
//validIndexPrefix = currentFrame.depth;
//term.length = target.length;
//return termExists;
}
} else {
targetBeforeCurrentLength = -1;
arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output (block) in the index!
assert arc.isFinal();
assert arc.output != null;
// if (DEBUG) {
// System.out.println(" no seek state; push root frame");
// }
output = arc.output;
currentFrame = staticFrame;
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
positioned = true;
// if (DEBUG) {
// System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
// }
// We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index:
while (targetUpto < target.length) {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
if (nextArc == null) {
// Index is exhausted
// if (DEBUG) {
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
// }
validIndexPrefix = currentFrame.prefix;
//validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target);
if (!currentFrame.hasTerms) {
termExists = false;
term.setByteAt(targetUpto, (byte) targetLabel);
term.setLength(1+targetUpto);
// if (DEBUG) {
// System.out.println(" FAST NOT_FOUND term=" + brToString(term));
// }
return false;
}
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, true);
if (result == SeekStatus.FOUND) {
// if (DEBUG) {
// System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
// }
return true;
} else {
// if (DEBUG) {
// System.out.println(" got " + result + "; return NOT_FOUND term=" + brToString(term));
// }
return false;
}
} else {
// Follow this arc
arc = nextArc;
term.setByteAt(targetUpto, (byte) targetLabel);
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
// if (DEBUG) {
// System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
// }
targetUpto++;
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
}
//validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix;
currentFrame.scanToFloorFrame(target);
// Target term is entirely contained in the index:
if (!currentFrame.hasTerms) {
termExists = false;
term.setLength(targetUpto);
// if (DEBUG) {
// System.out.println(" FAST NOT_FOUND term=" + brToString(term));
// }
return false;
}
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, true);
if (result == SeekStatus.FOUND) {
// if (DEBUG) {
// System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term);
// }
return true;
} else {
// if (DEBUG) {
// System.out.println(" got result " + result + "; return NOT_FOUND term=" + term.utf8ToString());
// }
return false;
}
}
@Override
public SeekStatus seekCeil(final BytesRef target) throws IOException {
if (fr.index == null) {
throw new IllegalStateException("terms index was not loaded");
}
term.grow(1+target.length);
assert clearEOF();
//if (DEBUG) {
//System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix= " + validIndexPrefix);
//printSeekState();
//}
FST.Arc arc;
int targetUpto;
Output output;
targetBeforeCurrentLength = currentFrame.ord;
if (positioned && currentFrame != staticFrame) {
// We are already seek'd; find the common
// prefix of new seek term vs current term and
// re-use the corresponding seek state. For
// example, if app first seeks to foobar, then
// seeks to foobaz, we can re-use the seek state
// for the first 5 bytes.
//if (DEBUG) {
//System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix);
//}
arc = arcs[0];
assert arc.isFinal();
output = arc.output;
targetUpto = 0;
OrdsSegmentTermsEnumFrame lastFrame = stack[0];
assert validIndexPrefix <= term.length();
final int targetLimit = Math.min(target.length, validIndexPrefix);
int cmp = 0;
// TODO: we should write our vLong backwards (MSB
// first) to get better sharing from the FST
// First compare up to valid seek frames:
while (targetUpto < targetLimit) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
//if (DEBUG) {
//System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " arc.output=" + arc.output + " output=" + output);
//}
if (cmp != 0) {
break;
}
arc = arcs[1+targetUpto];
assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF): "arc.label=" + (char) arc.label + " targetLabel=" + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
// TODO: we could save the outputs in local
// byte[][] instead of making new objs ever
// seek; but, often the FST doesn't have any
// shared bytes (but this could change if we
// reverse vLong byte order)
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
if (arc.isFinal()) {
lastFrame = stack[1+lastFrame.ord];
}
targetUpto++;
}
if (cmp == 0) {
final int targetUptoMid = targetUpto;
// Second compare the rest of the term, but
// don't save arc/output/frame:
final int targetLimit2 = Math.min(target.length, term.length());
while (targetUpto < targetLimit2) {
cmp = (term.byteAt(targetUpto)&0xFF) - (target.bytes[target.offset + targetUpto]&0xFF);
//if (DEBUG) {
//System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
//}
if (cmp != 0) {
break;
}
targetUpto++;
}
if (cmp == 0) {
cmp = term.length() - target.length;
}
targetUpto = targetUptoMid;
}
if (cmp < 0) {
// Common case: target term is after current
// term, ie, app is seeking multiple terms
// in sorted order
//if (DEBUG) {
//System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); clear frame.scanned ord=" + lastFrame.ord);
//}
currentFrame = lastFrame;
} else if (cmp > 0) {
// Uncommon case: target term
// is before current term; this means we can
// keep the currentFrame but we must rewind it
// (so we scan from the start)
targetBeforeCurrentLength = 0;
//if (DEBUG) {
//System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); rewind frame ord=" + lastFrame.ord);
//}
currentFrame = lastFrame;
currentFrame.rewind();
} else {
// Target is exactly the same as current term
assert term.length() == target.length;
if (termExists) {
//if (DEBUG) {
//System.out.println(" target is same as current; return FOUND");
//}
return SeekStatus.FOUND;
} else {
//if (DEBUG) {
//System.out.println(" target is same as current but term doesn't exist");
//}
}
}
} else {
targetBeforeCurrentLength = -1;
arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output (block) in the index!
assert arc.isFinal();
assert arc.output != null;
//if (DEBUG) {
//System.out.println(" no seek state; push root frame");
//}
output = arc.output;
currentFrame = staticFrame;
//term.length = 0;
targetUpto = 0;
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
}
positioned = true;
//if (DEBUG) {
//System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + targetBeforeCurrentLength);
//}
// We are done sharing the common prefix with the incoming target and where we are currently seek'd; now continue walking the index:
while (targetUpto < target.length) {
final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;
final FST.Arc nextArc = fr.index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader);
if (nextArc == null) {
// Index is exhausted
// if (DEBUG) {
// System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + toHex(targetLabel));
// }
validIndexPrefix = currentFrame.prefix;
//validIndexPrefix = targetUpto;
currentFrame.scanToFloorFrame(target);
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, false);
if (result == SeekStatus.END) {
term.copyBytes(target);
termExists = false;
if (next() != null) {
//if (DEBUG) {
//System.out.println(" return NOT_FOUND term=" + brToString(term) + " " + term);
//}
return SeekStatus.NOT_FOUND;
} else {
//if (DEBUG) {
//System.out.println(" return END");
//}
return SeekStatus.END;
}
} else {
//if (DEBUG) {
//System.out.println(" return " + result + " term=" + brToString(term) + " " + term);
//}
return result;
}
} else {
// Follow this arc
term.setByteAt(targetUpto, (byte) targetLabel);
arc = nextArc;
// Aggregate output as we go:
assert arc.output != null;
if (arc.output != OrdsBlockTreeTermsWriter.NO_OUTPUT) {
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
}
//if (DEBUG) {
//System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
//}
targetUpto++;
if (arc.isFinal()) {
//if (DEBUG) System.out.println(" arc is final!");
currentFrame = pushFrame(arc, OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), targetUpto);
//if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + currentFrame.hasTerms);
}
}
}
//validIndexPrefix = targetUpto;
validIndexPrefix = currentFrame.prefix;
currentFrame.scanToFloorFrame(target);
currentFrame.loadBlock();
final SeekStatus result = currentFrame.scanToTerm(target, false);
if (result == SeekStatus.END) {
term.copyBytes(target);
termExists = false;
if (next() != null) {
//if (DEBUG) {
//System.out.println(" return NOT_FOUND term=" + term.utf8ToString() + " " + term);
//}
return SeekStatus.NOT_FOUND;
} else {
//if (DEBUG) {
//System.out.println(" return END");
//}
return SeekStatus.END;
}
} else {
return result;
}
}
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
if (currentFrame == staticFrame) {
out.println(" no prior seek");
} else {
out.println(" prior seek state:");
int ord = 0;
boolean isSeekFrame = true;
while(true) {
OrdsSegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
if (f.nextEnt == -1) {
out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp< arc;
if (fr.index != null) {
arc = fr.index.getFirstArc(arcs[0]);
// Empty string prefix must have an output in the index!
assert arc.isFinal();
} else {
arc = null;
}
currentFrame = pushFrame(arc, fr.rootCode, 0);
currentFrame.loadBlock();
positioned = true;
}
targetBeforeCurrentLength = currentFrame.ord;
assert !eof;
//if (DEBUG) {
//System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + " termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
//printSeekState();
//}
if (currentFrame == staticFrame || positioned == false) {
// If seek was previously called and the term was
// cached, or seek(TermState) was called, usually
// caller is just going to pull a D/&PEnum or get
// docFreq, etc. But, if they then call next(),
// this method catches up all internal state so next()
// works properly:
// if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + term);
final boolean result = seekExact(term.get());
assert result;
}
// Pop finished blocks
while (currentFrame.nextEnt == currentFrame.entCount) {
if (!currentFrame.isLastInFloor) {
currentFrame.loadNextFloorBlock();
} else {
//if (DEBUG) System.out.println(" pop frame");
if (currentFrame.ord == 0) {
//if (DEBUG) System.out.println(" return null");
assert setEOF();
term.setLength(0);
validIndexPrefix = 0;
currentFrame.rewind();
termExists = false;
positioned = false;
return null;
}
final long lastFP = currentFrame.fpOrig;
currentFrame = stack[currentFrame.ord-1];
if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
// We popped into a frame that's not loaded
// yet or not scan'd to the right entry
currentFrame.scanToFloorFrame(term.get());
currentFrame.loadBlock();
currentFrame.scanToSubBlock(lastFP);
}
// Note that the seek state (last seek) has been
// invalidated beyond this depth
validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
//if (DEBUG) {
//System.out.println(" reset validIndexPrefix=" + validIndexPrefix);
//}
}
}
while(true) {
long prevTermOrd = currentFrame.termOrd;
if (currentFrame.next()) {
// Push to new block:
//if (DEBUG) System.out.println(" push frame");
currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length(), prevTermOrd);
// This is a "next" frame -- even if it's
// floor'd we must pretend it isn't so we don't
// try to scan to the right floor frame:
currentFrame.isFloor = false;
//currentFrame.hasTerms = true;
currentFrame.loadBlock();
} else {
//if (DEBUG) System.out.println(" return term=" + term.utf8ToString() + " " + term + " currentFrame.ord=" + currentFrame.ord);
positioned = true;
return term.get();
}
}
}
@Override
public BytesRef term() {
assert !eof;
return term.get();
}
@Override
public long ord() {
assert !eof;
assert currentFrame.termOrd > 0;
return currentFrame.termOrd-1;
}
@Override
public int docFreq() throws IOException {
assert !eof;
//if (DEBUG) System.out.println("BTR.docFreq");
currentFrame.decodeMetaData();
//if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq);
return currentFrame.state.docFreq;
}
@Override
public long totalTermFreq() throws IOException {
assert !eof;
currentFrame.decodeMetaData();
return currentFrame.state.totalTermFreq;
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
assert !eof;
//if (DEBUG) {
//System.out.println("BTTR.docs seg=" + segment);
//}
currentFrame.decodeMetaData();
//if (DEBUG) {
//System.out.println(" state=" + currentFrame.state);
//}
return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.state, reuse, flags);
}
@Override
public void seekExact(BytesRef target, TermState otherState) {
// if (DEBUG) {
// System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState);
// }
assert clearEOF();
if (target.compareTo(term.get()) != 0 || !termExists) {
assert otherState != null && otherState instanceof BlockTermState;
BlockTermState blockState = (BlockTermState) otherState;
currentFrame = staticFrame;
currentFrame.state.copyFrom(otherState);
term.copyBytes(target);
currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
currentFrame.termOrd = blockState.ord+1;
assert currentFrame.metaDataUpto > 0;
validIndexPrefix = 0;
} else {
// if (DEBUG) {
// System.out.println(" skip seek: already on target state=" + currentFrame.state);
// }
}
positioned = true;
}
@Override
public TermState termState() throws IOException {
assert !eof;
currentFrame.decodeMetaData();
BlockTermState ts = (BlockTermState) currentFrame.state.clone();
assert currentFrame.termOrd > 0;
ts.ord = currentFrame.termOrd-1;
//if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts);
return ts;
}
@Override
public void seekExact(long targetOrd) throws IOException {
// System.out.println("seekExact targetOrd=" + targetOrd);
if (targetOrd < 0 || targetOrd >= fr.numTerms) {
throw new IllegalArgumentException("targetOrd out of bounds (got: " + targetOrd + ", numTerms=" + fr.numTerms + ")");
}
assert clearEOF();
// First do reverse lookup in the index to find the block that holds this term:
InputOutput io = getByOutput(targetOrd);
term.grow(io.input.length);
Util.toBytesRef(io.input, term);
if (io.input.length == 0) {
currentFrame = staticFrame;
} else {
currentFrame = getFrame(io.input.length-1);
}
FST.Arc arc = getArc(io.input.length);
// Don't force rewind based on term length; we rewind below based on ord:
targetBeforeCurrentLength = Integer.MAX_VALUE;
currentFrame = pushFrame(arc, io.output, io.input.length);
if (currentFrame.termOrd > targetOrd) {
//System.out.println(" do rewind: " + currentFrame.termOrd);
currentFrame.rewind();
}
currentFrame.scanToFloorFrame(targetOrd);
currentFrame.loadBlock();
// System.out.println(" after loadBlock termOrd=" + currentFrame.termOrd + " vs " + targetOrd);
while (currentFrame.termOrd <= targetOrd) {
currentFrame.next();
}
assert currentFrame.termOrd == targetOrd+1: "currentFrame.termOrd=" + currentFrame.termOrd + " vs ord=" + targetOrd;
assert termExists;
// Leave enum fully unpositioned, because we didn't set frames for each byte leading up to current term:
validIndexPrefix = 0;
positioned = false;
}
@Override
public String toString() {
return "OrdsSegmentTermsEnum(seg=" + fr.parent + ")";
}
/** Holds a single input (IntsRef) + output pair. */
private static class InputOutput {
public IntsRef input;
public Output output;
@Override
public String toString() {
return "InputOutput(input=" + input + " output=" + output + ")";
}
}
private final FST.Arc arc = new FST.Arc<>();
// TODO: this is similar to Util.getByOutput ... can we refactor/share?
/** Specialized getByOutput that can understand the ranges (startOrd to endOrd) we use here, not just startOrd. */
private InputOutput getByOutput(long targetOrd) throws IOException {
final IntsRefBuilder result = new IntsRefBuilder();
fr.index.getFirstArc(arc);
Output output = arc.output;
int upto = 0;
int bestUpto = 0;
Output bestOutput = null;
/*
Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
Util.toDot(fr.index, w, true, true);
w.close();
*/
// System.out.println("reverseLookup seg=" + fr.parent.segment + " output=" + targetOrd);
while (true) {
// System.out.println(" loop: output=" + output.startOrd + "-" + (Long.MAX_VALUE-output.endOrd) + " upto=" + upto + " arc=" + arc + " final?=" + arc.isFinal());
if (arc.isFinal()) {
final Output finalOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput);
// System.out.println(" isFinal: " + finalOutput.startOrd + "-" + (Long.MAX_VALUE-finalOutput.endOrd));
if (targetOrd >= finalOutput.startOrd && targetOrd <= Long.MAX_VALUE-finalOutput.endOrd) {
// Only one range should match across all arc leaving this node
//assert bestOutput == null;
bestOutput = finalOutput;
bestUpto = upto;
}
}
if (FST.targetHasArcs(arc)) {
// System.out.println(" targetHasArcs");
result.grow(1+upto);
fr.index.readFirstRealTargetArc(arc.target, arc, fstReader);
if (arc.bytesPerArc != 0) {
// System.out.println(" array arcs");
int low = 0;
int high = arc.numArcs-1;
int mid = 0;
//System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
boolean found = false;
while (low <= high) {
mid = (low + high) >>> 1;
fstReader.setPosition(arc.posArcsStart);
fstReader.skipBytes(arc.bytesPerArc*mid);
final byte flags = fstReader.readByte();
fr.index.readLabel(fstReader);
final Output minArcOutput;
if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0) {
minArcOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, OrdsBlockTreeTermsWriter.FST_OUTPUTS.read(fstReader));
} else {
minArcOutput = output;
}
// System.out.println(" cycle mid=" + mid + " targetOrd=" + targetOrd + " output=" + minArcOutput.startOrd + "-" + (Long.MAX_VALUE-minArcOutput.endOrd));
if (targetOrd > Long.MAX_VALUE-minArcOutput.endOrd) {
low = mid + 1;
} else if (targetOrd < minArcOutput.startOrd) {
high = mid - 1;
} else {
// System.out.println(" found!!");
found = true;
break;
}
}
if (found) {
// Keep recursing
arc.arcIdx = mid-1;
} else {
result.setLength(bestUpto);
InputOutput io = new InputOutput();
io.input = result.get();
io.output = bestOutput;
// System.out.println(" ret0=" + io);
return io;
}
fr.index.readNextRealArc(arc, fstReader);
// Recurse on this arc:
result.setIntAt(upto++, arc.label);
output = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
} else {
// System.out.println(" non-array arc");
while (true) {
// System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
// This is the min output we'd hit if we follow
// this arc:
final Output minArcOutput = OrdsBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
long endOrd = Long.MAX_VALUE - minArcOutput.endOrd;
// System.out.println(" endOrd=" + endOrd + " targetOrd=" + targetOrd);
if (targetOrd >= minArcOutput.startOrd && targetOrd <= endOrd) {
// Recurse on this arc:
output = minArcOutput;
result.setIntAt(upto++, arc.label);
break;
} else if (targetOrd < endOrd || arc.isLast()) {
result.setLength(bestUpto);
InputOutput io = new InputOutput();
io.input = result.get();
assert bestOutput != null;
io.output = bestOutput;
// System.out.println(" ret2=" + io);
return io;
} else {
// System.out.println(" next arc");
// Read next arc in this node:
fr.index.readNextRealArc(arc, fstReader);
}
}
}
} else {
result.setLength(bestUpto);
InputOutput io = new InputOutput();
io.input = result.get();
io.output = bestOutput;
// System.out.println(" ret3=" + io);
return io;
}
}
}
}