
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsWriter;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.RamUsageEstimator;
/** Default general purpose indexing chain, which handles
* indexing all types of fields. */
final class DefaultIndexingChain extends DocConsumer {
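// Call sequence from the owning DocumentsWriterPerThread (a rough sketch, not verbatim):
//   for each buffered document: processDocument(docID, doc)   // invert + store + buffer doc values/points
//   at segment flush time:      Sorter.DocMap sortMap = flush(segmentWriteState)
//   on a tragic exception:      abort()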
final Counter bytesUsed = Counter.newCounter();
final FieldInfos.Builder fieldInfos;
// Writes postings and term vectors:
final TermsHash termsHash;
// Shared pool for doc-value terms
final ByteBlockPool docValuesBytePool;
// Writes stored fields
final StoredFieldsConsumer storedFieldsConsumer;
final TermVectorsConsumer termVectorsWriter;
// NOTE: I tried using Hash Map<String,PerField>
// but it was ~2% slower on Wiki and Geonames with Java
// 1.7.0_25:
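// fieldHash is a simple chained hash table keyed by field name; hashMask is always fieldHash.length-1.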
private PerField[] fieldHash = new PerField[2];
private int hashMask = 1;
private int totalFieldCount;
private long nextFieldGen;
// Holds fields seen in each document
private PerField[] fields = new PerField[1];
private final InfoStream infoStream;
private final ByteBlockPool.Allocator byteBlockAllocator;
private final LiveIndexWriterConfig indexWriterConfig;
private final int indexCreatedVersionMajor;
private final Consumer<Throwable> abortingExceptionConsumer;
private boolean hasHitAbortingException;
DefaultIndexingChain(int indexCreatedVersionMajor, SegmentInfo segmentInfo, Directory directory, FieldInfos.Builder fieldInfos, LiveIndexWriterConfig indexWriterConfig,
Consumer<Throwable> abortingExceptionConsumer) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
byteBlockAllocator = new ByteBlockPool.DirectTrackingAllocator(bytesUsed);
IntBlockPool.Allocator intBlockAllocator = new IntBlockAllocator(bytesUsed);
this.indexWriterConfig = indexWriterConfig;
assert segmentInfo.getIndexSort() == indexWriterConfig.getIndexSort();
this.fieldInfos = fieldInfos;
this.infoStream = indexWriterConfig.getInfoStream();
this.abortingExceptionConsumer = abortingExceptionConsumer;
if (segmentInfo.getIndexSort() == null) {
storedFieldsConsumer = new StoredFieldsConsumer(indexWriterConfig.getCodec(), directory, segmentInfo);
termVectorsWriter = new TermVectorsConsumer(intBlockAllocator, byteBlockAllocator, directory, segmentInfo, indexWriterConfig.getCodec());
} else {
storedFieldsConsumer = new SortingStoredFieldsConsumer(indexWriterConfig.getCodec(), directory, segmentInfo);
termVectorsWriter = new SortingTermVectorsConsumer(intBlockAllocator, byteBlockAllocator, directory, segmentInfo, indexWriterConfig.getCodec());
}
termsHash = new FreqProxTermsWriter(intBlockAllocator, byteBlockAllocator, bytesUsed, termVectorsWriter);
docValuesBytePool = new ByteBlockPool(byteBlockAllocator);
}
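/** Records that an aborting ("tragic") exception was hit and forwards it to the registered consumer. */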
private void onAbortingException(Throwable th) {
assert th != null;
this.hasHitAbortingException = true;
abortingExceptionConsumer.accept(th);
}
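/** Exposes the in-memory doc-values writers as a LeafReader so that the index sorter can read
* the values of this (still unwritten) segment. */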
private LeafReader getDocValuesLeafReader() {
return new DocValuesLeafReader() {
@Override
public NumericDocValues getNumericDocValues(String field) {
PerField pf = getPerField(field);
if (pf == null) {
return null;
}
if (pf.fieldInfo.getDocValuesType() == DocValuesType.NUMERIC) {
return (NumericDocValues) pf.docValuesWriter.getDocValues();
}
return null;
}
@Override
public BinaryDocValues getBinaryDocValues(String field) {
PerField pf = getPerField(field);
if (pf == null) {
return null;
}
if (pf.fieldInfo.getDocValuesType() == DocValuesType.BINARY) {
return (BinaryDocValues) pf.docValuesWriter.getDocValues();
}
return null;
}
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
PerField pf = getPerField(field);
if (pf == null) {
return null;
}
if (pf.fieldInfo.getDocValuesType() == DocValuesType.SORTED) {
return (SortedDocValues) pf.docValuesWriter.getDocValues();
}
return null;
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
PerField pf = getPerField(field);
if (pf == null) {
return null;
}
if (pf.fieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC) {
return (SortedNumericDocValues) pf.docValuesWriter.getDocValues();
}
return null;
}
@Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
PerField pf = getPerField(field);
if (pf == null) {
return null;
}
if (pf.fieldInfo.getDocValuesType() == DocValuesType.SORTED_SET) {
return (SortedSetDocValues) pf.docValuesWriter.getDocValues();
}
return null;
}
@Override
public FieldInfos getFieldInfos() {
return fieldInfos.finish();
}
};
}
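/** Computes the Sorter.DocMap for this segment when an index sort is configured. Returns null if
* there is no index sort, or if the documents were already added in sorted order. */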
private Sorter.DocMap maybeSortSegment(SegmentWriteState state) throws IOException {
Sort indexSort = state.segmentInfo.getIndexSort();
if (indexSort == null) {
return null;
}
LeafReader docValuesReader = getDocValuesLeafReader();
List<IndexSorter.DocComparator> comparators = new ArrayList<>();
for (int i = 0; i < indexSort.getSort().length; i++) {
SortField sortField = indexSort.getSort()[i];
IndexSorter sorter = sortField.getIndexSorter();
if (sorter == null) {
throw new UnsupportedOperationException("Cannot sort index using sort field " + sortField);
}
comparators.add(sorter.getDocComparator(docValuesReader, state.segmentInfo.maxDoc()));
}
Sorter sorter = new Sorter(indexSort);
// returns null if the documents are already sorted
return sorter.sort(state.segmentInfo.maxDoc(), comparators.toArray(new IndexSorter.DocComparator[0]));
}
@Override
public Sorter.DocMap flush(SegmentWriteState state) throws IOException {
// NOTE: caller (DocumentsWriterPerThread) handles
// aborting on any exception from this method
Sorter.DocMap sortMap = maybeSortSegment(state);
int maxDoc = state.segmentInfo.maxDoc();
long t0 = System.nanoTime();
writeNorms(state, sortMap);
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write norms");
}
SegmentReadState readState = new SegmentReadState(state.directory, state.segmentInfo, state.fieldInfos, IOContext.READ, state.segmentSuffix);
t0 = System.nanoTime();
writeDocValues(state, sortMap);
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write docValues");
}
t0 = System.nanoTime();
writePoints(state, sortMap);
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points");
}
// it's possible all docs hit non-aborting exceptions...
t0 = System.nanoTime();
storedFieldsConsumer.finish(maxDoc);
storedFieldsConsumer.flush(state, sortMap);
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to finish stored fields");
}
t0 = System.nanoTime();
Map<String,TermsHashPerField> fieldsToFlush = new HashMap<>();
for (int i = 0; i < fieldHash.length; i++) {
PerField perField = fieldHash[i];
while (perField != null) {
if (perField.invertState != null) {
fieldsToFlush.put(perField.fieldInfo.name, perField.termsHashPerField);
}
perField = perField.next;
}
}
// ... (remainder of flush(): fieldsToFlush is handed to termsHash.flush(...), the FieldInfos are
// written, and sortMap is returned; the writeNorms/writeDocValues/writePoints and abort helpers
// follow and are omitted here) ...
private void rehash() {
int newHashSize = (fieldHash.length*2);
assert newHashSize > fieldHash.length;
PerField newHashArray[] = new PerField[newHashSize];
// Rehash
int newHashMask = newHashSize-1;
for (int j = 0; j < fieldHash.length; j++) {
PerField fp0 = fieldHash[j];
while (fp0 != null) {
final int hashPos2 = fp0.fieldInfo.name.hashCode() & newHashMask;
PerField nextFP0 = fp0.next;
fp0.next = newHashArray[hashPos2];
newHashArray[hashPos2] = fp0;
fp0 = nextFP0;
}
}
fieldHash = newHashArray;
hashMask = newHashMask;
}
// ... (startStoredFields(docID) and finishStoredFields() wrap storedFieldsConsumer.startDocument/
// finishDocument, routing any exception through onAbortingException; omitted here) ...
@Override
public void processDocument(int docID, Iterable<? extends IndexableField> document) throws IOException {
// How many indexed field names we've seen (collapses
// multiple field instances by the same name):
int fieldCount = 0;
long fieldGen = nextFieldGen++;
// NOTE: we need two passes here, in case there are
// multi-valued fields, because we must process all
// instances of a given field at once, since the
// analyzer is free to reuse TokenStream across fields
// (i.e., we cannot have more than one TokenStream
// running "at once"):
termsHash.startDocument();
startStoredFields(docID);
try {
for (IndexableField field : document) {
fieldCount = processField(docID, field, fieldGen, fieldCount);
}
} finally {
if (hasHitAbortingException == false) {
// Finish each indexed field name seen in the document:
for (int i = 0; i < fieldCount; i++) {
fields[i].finish(docID);
}
finishStoredFields();
}
}
try {
termsHash.finishDocument(docID);
} catch (Throwable th) {
// Must abort, on the possibility that on-disk term
// vectors are now corrupt:
onAbortingException(th);
throw th;
}
}
private int processField(int docID, IndexableField field, long fieldGen, int fieldCount) throws IOException {
String fieldName = field.name();
IndexableFieldType fieldType = field.fieldType();
PerField fp = null;
if (fieldType.indexOptions() == null) {
throw new NullPointerException("IndexOptions must not be null (field: \"" + fieldName + "\")");
}
// Invert indexed fields:
if (fieldType.indexOptions() != IndexOptions.NONE) {
fp = getOrAddField(fieldName, fieldType, true);
boolean first = fp.fieldGen != fieldGen;
fp.invert(docID, field, first);
if (first) {
fields[fieldCount++] = fp;
fp.fieldGen = fieldGen;
}
} else {
verifyUnIndexedFieldType(fieldName, fieldType);
}
// Add stored fields:
if (fieldType.stored()) {
if (fp == null) {
fp = getOrAddField(fieldName, fieldType, false);
}
if (fieldType.stored()) {
String value = field.stringValue();
if (value != null && value.length() > IndexWriter.MAX_STORED_STRING_LENGTH) {
throw new IllegalArgumentException("stored field \"" + field.name() + "\" is too large (" + value.length() + " characters) to store");
}
try {
storedFieldsConsumer.writeField(fp.fieldInfo, field);
} catch (Throwable th) {
onAbortingException(th);
throw th;
}
}
}
DocValuesType dvType = fieldType.docValuesType();
if (dvType == null) {
throw new NullPointerException("docValuesType must not be null (field: \"" + fieldName + "\")");
}
if (dvType != DocValuesType.NONE) {
if (fp == null) {
fp = getOrAddField(fieldName, fieldType, false);
}
indexDocValue(docID, fp, dvType, field);
}
if (fieldType.pointDimensionCount() != 0) {
if (fp == null) {
fp = getOrAddField(fieldName, fieldType, false);
}
indexPoint(docID, fp, field);
}
return fieldCount;
}
private static void verifyUnIndexedFieldType(String name, IndexableFieldType ft) {
if (ft.storeTermVectors()) {
throw new IllegalArgumentException("cannot store term vectors "
+ "for a field that is not indexed (field=\"" + name + "\")");
}
if (ft.storeTermVectorPositions()) {
throw new IllegalArgumentException("cannot store term vector positions "
+ "for a field that is not indexed (field=\"" + name + "\")");
}
if (ft.storeTermVectorOffsets()) {
throw new IllegalArgumentException("cannot store term vector offsets "
+ "for a field that is not indexed (field=\"" + name + "\")");
}
if (ft.storeTermVectorPayloads()) {
throw new IllegalArgumentException("cannot store term vector payloads "
+ "for a field that is not indexed (field=\"" + name + "\")");
}
}
/** Called from processDocument to index one field's point */
private void indexPoint(int docID, PerField fp, IndexableField field) throws IOException {
int pointDimensionCount = field.fieldType().pointDimensionCount();
int pointIndexDimensionCount = field.fieldType().pointIndexDimensionCount();
int dimensionNumBytes = field.fieldType().pointNumBytes();
// Record dimensions for this field; this setter will throw IllegalArgExc if
// the dimensions were already set to something different:
if (fp.fieldInfo.getPointDimensionCount() == 0) {
fieldInfos.globalFieldNumbers.setDimensions(fp.fieldInfo.number, fp.fieldInfo.name, pointDimensionCount, pointIndexDimensionCount, dimensionNumBytes);
}
fp.fieldInfo.setPointDimensions(pointDimensionCount, pointIndexDimensionCount, dimensionNumBytes);
if (fp.pointValuesWriter == null) {
fp.pointValuesWriter = new PointValuesWriter(bytesUsed, fp.fieldInfo);
}
fp.pointValuesWriter.addPackedValue(docID, field.binaryValue());
}
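/** Verifies that, if the given field participates in the configured index sort, its doc-values type
* matches what the corresponding SortField expects; otherwise an IllegalArgumentException is thrown. */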
private void validateIndexSortDVType(Sort indexSort, String fieldToValidate, DocValuesType dvType) throws IOException {
for (SortField sortField : indexSort.getSort()) {
IndexSorter sorter = sortField.getIndexSorter();
if (sorter == null) {
throw new IllegalStateException("Cannot sort index with sort order " + sortField);
}
sorter.getDocComparator(new DocValuesLeafReader() {
@Override
public NumericDocValues getNumericDocValues(String field) {
if (Objects.equals(field, fieldToValidate) && dvType != DocValuesType.NUMERIC) {
throw new IllegalArgumentException("SortField " + sortField + " expected field [" + field + "] to be NUMERIC but it is [" + dvType + "]");
}
return DocValues.emptyNumeric();
}
@Override
public BinaryDocValues getBinaryDocValues(String field) {
if (Objects.equals(field, fieldToValidate) && dvType != DocValuesType.BINARY) {
throw new IllegalArgumentException("SortField " + sortField + " expected field [" + field + "] to be BINARY but it is [" + dvType + "]");
}
return DocValues.emptyBinary();
}
@Override
public SortedDocValues getSortedDocValues(String field) {
if (Objects.equals(field, fieldToValidate) && dvType != DocValuesType.SORTED) {
throw new IllegalArgumentException("SortField " + sortField + " expected field [" + field + "] to be SORTED but it is [" + dvType + "]");
}
return DocValues.emptySorted();
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) {
if (Objects.equals(field, fieldToValidate) && dvType != DocValuesType.SORTED_NUMERIC) {
throw new IllegalArgumentException("SortField " + sortField + " expected field [" + field + "] to be SORTED_NUMERIC but it is [" + dvType + "]");
}
return DocValues.emptySortedNumeric();
}
@Override
public SortedSetDocValues getSortedSetDocValues(String field) {
if (Objects.equals(field, fieldToValidate) && dvType != DocValuesType.SORTED_SET) {
throw new IllegalArgumentException("SortField " + sortField + " expected field [" + field + "] to be SORTED_SET but it is [" + dvType + "]");
}
return DocValues.emptySortedSet();
}
@Override
public FieldInfos getFieldInfos() {
throw new UnsupportedOperationException();
}
}, 0);
}
}
/** Called from processDocument to index one field's doc value */
private void indexDocValue(int docID, PerField fp, DocValuesType dvType, IndexableField field) throws IOException {
if (fp.fieldInfo.getDocValuesType() == DocValuesType.NONE) {
// This is the first time we are seeing this field indexed with doc values, so we
// now record the DV type so that any future attempt to (illegally) change
// the DV type of this field, will throw an IllegalArgExc:
if (indexWriterConfig.getIndexSort() != null) {
final Sort indexSort = indexWriterConfig.getIndexSort();
validateIndexSortDVType(indexSort, fp.fieldInfo.name, dvType);
}
fieldInfos.globalFieldNumbers.setDocValuesType(fp.fieldInfo.number, fp.fieldInfo.name, dvType);
}
fp.fieldInfo.setDocValuesType(dvType);
switch(dvType) {
case NUMERIC:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new NumericDocValuesWriter(fp.fieldInfo, bytesUsed);
}
if (field.numericValue() == null) {
throw new IllegalArgumentException("field=\"" + fp.fieldInfo.name + "\": null value not allowed");
}
((NumericDocValuesWriter) fp.docValuesWriter).addValue(docID, field.numericValue().longValue());
break;
case BINARY:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new BinaryDocValuesWriter(fp.fieldInfo, bytesUsed);
}
((BinaryDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
break;
case SORTED:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new SortedDocValuesWriter(fp.fieldInfo, bytesUsed, docValuesBytePool);
}
((SortedDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
break;
case SORTED_NUMERIC:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new SortedNumericDocValuesWriter(fp.fieldInfo, bytesUsed);
}
((SortedNumericDocValuesWriter) fp.docValuesWriter).addValue(docID, field.numericValue().longValue());
break;
case SORTED_SET:
if (fp.docValuesWriter == null) {
fp.docValuesWriter = new SortedSetDocValuesWriter(fp.fieldInfo, bytesUsed, docValuesBytePool);
}
((SortedSetDocValuesWriter) fp.docValuesWriter).addValue(docID, field.binaryValue());
break;
default:
throw new AssertionError("unrecognized DocValues.Type: " + dvType);
}
}
/** Returns a previously created {@link PerField}, or null
* if this field name wasn't seen yet. */
private PerField getPerField(String name) {
final int hashPos = name.hashCode() & hashMask;
PerField fp = fieldHash[hashPos];
while (fp != null && !fp.fieldInfo.name.equals(name)) {
fp = fp.next;
}
return fp;
}
/** Returns a previously created {@link PerField},
* absorbing the type information from {@link FieldType},
* and creates a new {@link PerField} if this field name
* wasn't seen yet. */
private PerField getOrAddField(String name, IndexableFieldType fieldType, boolean invert) {
// Make sure we have a PerField allocated
final int hashPos = name.hashCode() & hashMask;
PerField fp = fieldHash[hashPos];
while (fp != null && !fp.fieldInfo.name.equals(name)) {
fp = fp.next;
}
if (fp == null) {
// First time we are seeing this field in this segment
FieldInfo fi = fieldInfos.getOrAdd(name);
initIndexOptions(fi, fieldType.indexOptions());
Map<String, String> attributes = fieldType.getAttributes();
if (attributes != null) {
attributes.forEach((k, v) -> fi.putAttribute(k, v));
}
fp = new PerField(indexCreatedVersionMajor, fi, invert,
indexWriterConfig.getSimilarity(), indexWriterConfig.getInfoStream(), indexWriterConfig.getAnalyzer());
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;
totalFieldCount++;
// At most 50% load factor:
if (totalFieldCount >= fieldHash.length/2) {
rehash();
}
if (totalFieldCount > fields.length) {
PerField[] newFields = new PerField[ArrayUtil.oversize(totalFieldCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(fields, 0, newFields, 0, fields.length);
fields = newFields;
}
} else if (invert && fp.invertState == null) {
initIndexOptions(fp.fieldInfo, fieldType.indexOptions());
fp.setInvertState();
}
return fp;
}
private void initIndexOptions(FieldInfo info, IndexOptions indexOptions) {
// Messy: must set this here because e.g. FreqProxTermsWriterPerField looks at the initial
// IndexOptions to decide what arrays it must create).
assert info.getIndexOptions() == IndexOptions.NONE;
// This is the first time we are seeing this field indexed, so we now
// record the index options so that any future attempt to (illegally)
// change the index options of this field, will throw an IllegalArgExc:
fieldInfos.globalFieldNumbers.setIndexOptions(info.number, info.name, indexOptions);
info.setIndexOptions(indexOptions);
}
@Override
public long ramBytesUsed() {
return bytesUsed.get() + storedFieldsConsumer.accountable.ramBytesUsed()
+ termVectorsWriter.accountable.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Arrays.asList(storedFieldsConsumer.accountable, termVectorsWriter.accountable);
}
/** NOTE: not static: accesses at least termsHash. */
private final class PerField implements Comparable<PerField> {
final int indexCreatedVersionMajor;
final FieldInfo fieldInfo;
final Similarity similarity;
FieldInvertState invertState;
TermsHashPerField termsHashPerField;
// Non-null if this field ever had doc values in this
// segment:
DocValuesWriter<?> docValuesWriter;
// Non-null if this field ever had points in this segment:
PointValuesWriter pointValuesWriter;
/** We use this to know when a PerField is seen for the
* first time in the current document. */
long fieldGen = -1;
// Used by the hash table
PerField next;
// Lazy init'd:
NormValuesWriter norms;
// reused
TokenStream tokenStream;
private final InfoStream infoStream;
private final Analyzer analyzer;
PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert, Similarity similarity, InfoStream infoStream, Analyzer analyzer) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.fieldInfo = fieldInfo;
this.similarity = similarity;
this.infoStream = infoStream;
this.analyzer = analyzer;
if (invert) {
setInvertState();
}
}
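/** Allocates the inverted-index state for this field (FieldInvertState, per-field terms hash, and,
* unless norms are omitted, the norms writer). Called when the field is first seen as indexed. */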
void setInvertState() {
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name, fieldInfo.getIndexOptions());
termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) {
assert norms == null;
// Even if no documents actually succeed in setting a norm, we still write norms for this segment:
norms = new NormValuesWriter(fieldInfo, bytesUsed);
}
}
@Override
public int compareTo(PerField other) {
return this.fieldInfo.name.compareTo(other.fieldInfo.name);
}
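/** Computes and buffers the norm value for this field in the current document (0 if the field
* produced no tokens), then finishes the per-field postings for this document. */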
public void finish(int docID) throws IOException {
if (fieldInfo.omitsNorms() == false) {
long normValue;
if (invertState.length == 0) {
// the field exists in this document, but it did not have
// any indexed tokens, so we assign a default value of zero
// to the norm
normValue = 0;
} else {
normValue = similarity.computeNorm(invertState);
if (normValue == 0) {
throw new IllegalStateException("Similarity " + similarity + " return 0 for non-empty field");
}
}
norms.addValue(docID, normValue);
}
termsHashPerField.finish();
}
/** Inverts one field for one document; first is true
* if this is the first time we are seeing this field
* name in this document. */
public void invert(int docID, IndexableField field, boolean first) throws IOException {
if (first) {
// First time we're seeing this field (indexed) in
// this document:
invertState.reset();
}
IndexableFieldType fieldType = field.fieldType();
IndexOptions indexOptions = fieldType.indexOptions();
fieldInfo.setIndexOptions(indexOptions);
if (fieldType.omitNorms()) {
fieldInfo.setOmitsNorms();
}
final boolean analyzed = fieldType.tokenized() && analyzer != null;
/*
* To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
* when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
* but rather a finally that takes note of the problem.
*/
boolean succeededInProcessingField = false;
try (TokenStream stream = tokenStream = field.tokenStream(analyzer, tokenStream)) {
// reset the TokenStream to the first token
stream.reset();
invertState.setAttributeSource(stream);
termsHashPerField.start(field, first);
while (stream.incrementToken()) {
// If we hit an exception in stream.next below
// (which is fairly common, e.g. if analyzer
// chokes on a given document), then it's
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
int posIncr = invertState.posIncrAttribute.getPositionIncrement();
invertState.position += posIncr;
if (invertState.position < invertState.lastPosition) {
if (posIncr == 0) {
throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
} else if (posIncr < 0) {
throw new IllegalArgumentException("position increment must be >= 0 (got " + posIncr + ") for field '" + field.name() + "'");
} else {
throw new IllegalArgumentException("position overflowed Integer.MAX_VALUE (got posIncr=" + posIncr + " lastPosition=" + invertState.lastPosition + " position=" + invertState.position + ") for field '" + field.name() + "'");
}
} else if (invertState.position > IndexWriter.MAX_POSITION) {
throw new IllegalArgumentException("position " + invertState.position + " is too large for field '" + field.name() + "': max allowed position is " + IndexWriter.MAX_POSITION);
}
invertState.lastPosition = invertState.position;
if (posIncr == 0) {
invertState.numOverlap++;
}
int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
}
invertState.lastStartOffset = startOffset;
try {
invertState.length = Math.addExact(invertState.length, invertState.termFreqAttribute.getTermFrequency());
} catch (ArithmeticException ae) {
throw new IllegalArgumentException("too many tokens for field \"" + field.name() + "\"");
}
//System.out.println(" term=" + invertState.termAttribute);
// If we hit an exception in here, we abort
// all buffered documents since the last
// flush, on the likelihood that the
// internal state of the terms hash is now
// corrupt and should not be flushed to a
// new segment:
try {
termsHashPerField.add(invertState.termAttribute.getBytesRef(), docID);
} catch (MaxBytesLengthExceededException e) {
byte[] prefix = new byte[30];
BytesRef bigTerm = invertState.termAttribute.getBytesRef();
System.arraycopy(bigTerm.bytes, bigTerm.offset, prefix, 0, 30);
String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name + "\" (whose UTF8 encoding is longer than the max length " + IndexWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + Arrays.toString(prefix) + "...', original message: " + e.getMessage();
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "ERROR: " + msg);
}
// Document will be deleted above:
throw new IllegalArgumentException(msg, e);
} catch (Throwable th) {
onAbortingException(th);
throw th;
}
}
// trigger streams to perform end-of-stream operations
stream.end();
// TODO: maybe add some safety? then again, it's already checked
// when we come back around to the field...
invertState.position += invertState.posIncrAttribute.getPositionIncrement();
invertState.offset += invertState.offsetAttribute.endOffset();
/* if there is an exception coming through, we won't set this to true here:*/
succeededInProcessingField = true;
} finally {
if (!succeededInProcessingField && infoStream.isEnabled("DW")) {
infoStream.message("DW", "An exception was thrown while processing field " + fieldInfo.name);
}
}
if (analyzed) {
invertState.position += analyzer.getPositionIncrementGap(fieldInfo.name);
invertState.offset += analyzer.getOffsetGap(fieldInfo.name);
}
}
}
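/** Returns an iterator over the documents that have a doc value for the given field in this segment,
* or null if the field is unknown or has no doc values. */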
@Override
DocIdSetIterator getHasDocValues(String field) {
PerField perField = getPerField(field);
if (perField != null) {
if (perField.docValuesWriter != null) {
if (perField.fieldInfo.getDocValuesType() == DocValuesType.NONE) {
return null;
}
return perField.docValuesWriter.getDocValues();
}
}
return null;
}
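/** IntBlockPool allocator whose allocations and recycles are charged against the indexing chain's
* bytesUsed counter. */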
private static class IntBlockAllocator extends IntBlockPool.Allocator {
private final Counter bytesUsed;
IntBlockAllocator(Counter bytesUsed) {
super(IntBlockPool.INT_BLOCK_SIZE);
this.bytesUsed = bytesUsed;
}
/* Allocate another int[] from the shared pool */
@Override
public int[] getIntBlock() {
int[] b = new int[IntBlockPool.INT_BLOCK_SIZE];
bytesUsed.addAndGet(IntBlockPool.INT_BLOCK_SIZE * Integer.BYTES);
return b;
}
@Override
public void recycleIntBlocks(int[][] blocks, int offset, int length) {
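// The blocks are not kept for reuse; we only subtract their size from the RAM accounting and let
// the garbage collector reclaim them.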
bytesUsed.addAndGet(-(length * (IntBlockPool.INT_BLOCK_SIZE * Integer.BYTES)));
}
}
}