/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blocktree;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
// TODO: instead of inlining auto-prefix terms with normal terms,
// we could write them into their own virtual/private field. This
// would make search time a bit more complex, since we'd need to
// merge sort between two TermsEnums, but it would also make the stats
// API (used by CheckIndex -verbose) easier to implement since we could
// just walk this virtual field and gather its stats.
/** Used in the first pass when writing a segment to locate
* "appropriate" auto-prefix terms to pre-compile into the index.
* This visits every term in the index to find prefixes that
* match {@code >= min} and {@code <= max} terms. */
class AutoPrefixTermsWriter {
//static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
//static boolean DEBUG = false;
//static boolean DEBUG2 = BlockTreeTermsWriter.DEBUG2;
//static boolean DEBUG2 = true;
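// Illustrative example (not from the original source): with
// minItemsInPrefix=2 and maxItemsInPrefix=4 and the terms
// {"saw", "seed", "seek", "set", "sun"}, the prefix "se" is shared by
// three terms (within [2..4]) and is a candidate auto-prefix term, while
// "s" (five terms) exceeds the maximum on its own and would instead be
// covered by floored ranges of the form described by PrefixTerm below.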
/** Describes a range of term-space to match, either a simple prefix
* (foo*) or a floor-block range of a prefix (e.g. foo[a-m]*,
* foo[n-z]*) when there are too many terms starting with foo*. */
public static final class PrefixTerm implements Comparable<PrefixTerm> {
/** Common prefix */
public final byte[] prefix;
/** If this is -2, this is a normal prefix (foo*); else it's the minimum lead byte of the suffix (e.g. 'd' in foo[d-m]*). */
public final int floorLeadStart;
/** The lead byte (inclusive) of the suffix for the term range we match (e.g. 'm' in foo[d-m]*); this is ignored when
* floorLeadStart is -2. */
public final int floorLeadEnd;
public final BytesRef term;
/** Sole constructor. */
public PrefixTerm(byte[] prefix, int floorLeadStart, int floorLeadEnd) {
this.prefix = prefix;
this.floorLeadStart = floorLeadStart;
this.floorLeadEnd = floorLeadEnd;
this.term = toBytesRef(prefix, floorLeadStart);
assert floorLeadEnd >= floorLeadStart;
assert floorLeadEnd >= 0;
assert floorLeadStart == -2 || floorLeadStart >= 0;
// We should never create empty-string prefix term:
assert prefix.length > 0 || floorLeadStart != -2 || floorLeadEnd != 0xff;
}
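// Illustrative constructions (not from the original source), using the
// bytes of "foo" as the prefix: a plain foo* prefix term would use
// floorLeadStart=-2 (with floorLeadEnd conventionally 0xff, covering every
// suffix lead byte, per the empty-prefix assert above), while a floored
// range such as foo[d-m]* passes the lead suffix bytes directly:
//
//   PrefixTerm all = new PrefixTerm(new byte[] {'f', 'o', 'o'}, -2, 0xff);
//   PrefixTerm floored = new PrefixTerm(new byte[] {'f', 'o', 'o'}, 'd', 'm');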
@Override
public String toString() {
String s = brToString(new BytesRef(prefix));
if (floorLeadStart == -2) {
s += "[-" + Integer.toHexString(floorLeadEnd) + "]";
} else {
s += "[" + Integer.toHexString(floorLeadStart) + "-" + Integer.toHexString(floorLeadEnd) + "]";
}
return s;
}
@Override
public int compareTo(PrefixTerm other) {
int cmp = term.compareTo(other.term);
if (cmp == 0) {
if (prefix.length != other.prefix.length) {
return prefix.length - other.prefix.length;
}
// On tie, sort the bigger floorLeadEnd earlier, since it
// spans more terms; during intersect we want to encounter it
// first so we can use it if the automaton accepts the larger range:
cmp = other.floorLeadEnd - floorLeadEnd;
}
return cmp;
}
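// Worked example (illustrative): foo[a-z]* and foo[a-m]* share the same
// leading term "fooa" and the same prefix length, so the tie-break on
// floorLeadEnd applies; since 'z' > 'm', foo[a-z]* sorts first, so
// intersect sees the wider range before the narrower one.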
/** Returns the leading term for this prefix term, e.g. "foo" (for
* the foo* prefix) or "foom" (for the foo[m-z]* case). */
private static BytesRef toBytesRef(byte[] prefix, int floorLeadStart) {
BytesRef br;
if (floorLeadStart != -2) {
assert floorLeadStart >= 0;
br = new BytesRef(prefix.length+1);
} else {
br = new BytesRef(prefix.length);
}
System.arraycopy(prefix, 0, br.bytes, 0, prefix.length);
br.length = prefix.length;
if (floorLeadStart != -2) {
assert floorLeadStart >= 0;
br.bytes[br.length++] = (byte) floorLeadStart;
}
return br;
}
public int compareTo(BytesRef term) {
return this.term.compareTo(term);
}
public TermsEnum getTermsEnum(TermsEnum in) {
final BytesRef prefixRef = new BytesRef(prefix);
return new FilteredTermsEnum(in) {
{
setInitialSeekTerm(term);
}
@Override
protected AcceptStatus accept(BytesRef term) {
if (StringHelper.startsWith(term, prefixRef) &&
(floorLeadEnd == -1 || term.length == prefixRef.length || (term.bytes[term.offset + prefixRef.length] & 0xff) <= floorLeadEnd)) {
return AcceptStatus.YES;
} else {
return AcceptStatus.END;
}
}
};
}
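// Illustrative use (not from the original source), assuming a Terms
// instance named "terms" for the field being scanned:
//
//   TermsEnum filtered = prefixTerm.getTermsEnum(terms.iterator());
//   BytesRef t;
//   while ((t = filtered.next()) != null) {
//     // every t starts with prefix and, for floored ranges, its lead
//     // suffix byte is <= floorLeadEnd
//   }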
}
// for debugging
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
// If BytesRef isn't actually UTF8, or it's eg a
// prefix of UTF8 that ends mid-unicode-char, we
// fallback to hex:
return b.toString();
}
}
final List<PrefixTerm> prefixes = new ArrayList<>();
private final int minItemsInPrefix;
private final int maxItemsInPrefix;
// Records the index into pending where the current prefix at that
// length "started"; for example, if the current term starts with 't',
// prefixStarts[0] is the index into pending for the first
// term/sub-block starting with 't'. We use this to figure out when
// to write a new block:
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
private int[] prefixStarts = new int[8];
private List