All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.DocValuesConsumer Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;


import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.MultiDocValues.OrdinalMap;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState; // javadocs
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.packed.PackedInts;

/** 
 * Abstract API that consumes numeric, binary and
 * sorted docvalues.  Concrete implementations of this
 * actually do "something" with the docvalues (write it into
 * the index in a specific format).
 * <p>
 * The lifecycle is:
 * <ol>
 *   <li>DocValuesConsumer is created by
 *       {@link NormsFormat#normsConsumer(SegmentWriteState)}.
 *   <li>{@link #addNumericField}, {@link #addBinaryField},
 *       {@link #addSortedField}, {@link #addSortedSetField},
 *       or {@link #addSortedNumericField} are called for each Numeric,
 *       Binary, Sorted, SortedSet, or SortedNumeric docvalues field.
 *       The API is a "pull" rather than "push", and the implementation
 *       is free to iterate over the values multiple times
 *       ({@link Iterable#iterator()}).
 *   <li>After all fields are added, the consumer is {@link #close}d.
 * </ol>
 *
 * @lucene.experimental
 */
public abstract class DocValuesConsumer implements Closeable {

  /** Sole constructor. (For invocation by subclass
   *  constructors, typically implicit.) */
  protected DocValuesConsumer() {}

  /**
   * Writes numeric docvalues for a field.
   * @param field field information
   * @param values Iterable of numeric values (one for each document). {@code null} indicates
   *               a missing value.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException;

  /**
   * Writes binary docvalues for a field.
   * @param field field information
   * @param values Iterable of binary values (one for each document). {@code null} indicates
   *               a missing value.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException;

  /**
   * Writes pre-sorted binary docvalues for a field.
   * @param field field information
   * @param values Iterable of binary values in sorted order (deduplicated).
   * @param docToOrd Iterable of ordinals (one for each document). {@code -1} indicates
   *                 a missing value.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException;

  /**
   * Writes pre-sorted numeric docvalues for a field
   * @param field field information
   * @param docToValueCount Iterable of the number of values for each document. A zero
   *                        count indicates a missing value.
   * @param values Iterable of numeric values in sorted order (not deduplicated).
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedNumericField(FieldInfo field, Iterable<Number> docToValueCount, Iterable<Number> values) throws IOException;

  /**
   * Writes pre-sorted set docvalues for a field
   * @param field field information
   * @param values Iterable of binary values in sorted order (deduplicated).
   * @param docToOrdCount Iterable of the number of values for each document.
A zero ordinal * count indicates a missing value. * @param ords Iterable of ordinal occurrences (docToOrdCount*maxDoc total). * @throws IOException if an I/O error occurred. */ public abstract void addSortedSetField(FieldInfo field, Iterable values, Iterable docToOrdCount, Iterable ords) throws IOException; /** Merges in the fields from the readers in * mergeState. The default implementation * calls {@link #mergeNumericField}, {@link #mergeBinaryField}, * {@link #mergeSortedField}, {@link #mergeSortedSetField}, * or {@link #mergeSortedNumericField} for each field, * depending on its type. * Implementations can override this method * for more sophisticated merging (bulk-byte copying, etc). */ public void merge(MergeState mergeState) throws IOException { for(DocValuesProducer docValuesProducer : mergeState.docValuesProducers) { if (docValuesProducer != null) { docValuesProducer.checkIntegrity(); } } for (FieldInfo mergeFieldInfo : mergeState.mergeFieldInfos) { DocValuesType type = mergeFieldInfo.getDocValuesType(); if (type != DocValuesType.NONE) { if (type == DocValuesType.NUMERIC) { List toMerge = new ArrayList<>(); List docsWithField = new ArrayList<>(); for (int i=0;i toMerge = new ArrayList<>(); List docsWithField = new ArrayList<>(); for (int i=0;i toMerge = new ArrayList<>(); for (int i=0;i toMerge = new ArrayList<>(); for (int i=0;i toMerge = new ArrayList<>(); for (int i=0;i value new Iterable() { @Override public Iterator iterator() { return new Iterator() { int currentOrd; @Override public boolean hasNext() { return currentOrd < map.getValueCount(); } @Override public BytesRef next() { if (!hasNext()) { throw new NoSuchElementException(); } int segmentNumber = map.getFirstSegmentNumber(currentOrd); int segmentOrd = (int)map.getFirstSegmentOrd(currentOrd); final BytesRef term = dvs[segmentNumber].lookupOrd(segmentOrd); currentOrd++; return term; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }, // doc -> ord new 
Iterable() { @Override public Iterator iterator() { return new Iterator() { int readerUpto = -1; int docIDUpto; int nextValue; int currentMaxDoc; Bits currentLiveDocs; LongValues currentMap; boolean nextIsSet; @Override public boolean hasNext() { return nextIsSet || setNext(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public Number next() { if (!hasNext()) { throw new NoSuchElementException(); } assert nextIsSet; nextIsSet = false; // TODO make a mutable number return nextValue; } private boolean setNext() { while (true) { if (readerUpto == numReaders) { return false; } if (docIDUpto == currentMaxDoc) { readerUpto++; if (readerUpto < numReaders) { currentMap = map.getGlobalOrds(readerUpto); currentLiveDocs = mergeState.liveDocs[readerUpto]; currentMaxDoc = mergeState.maxDocs[readerUpto]; } docIDUpto = 0; continue; } if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { nextIsSet = true; int segOrd = dvs[readerUpto].getOrd(docIDUpto); nextValue = segOrd == -1 ? -1 : (int) currentMap.get(segOrd); docIDUpto++; return true; } docIDUpto++; } } }; } } ); } /** * Merges the sortedset docvalues from toMerge. *

* The default implementation calls {@link #addSortedSetField}, passing * an Iterable that merges ordinals and values and filters deleted documents . */ public void mergeSortedSetField(FieldInfo fieldInfo, final MergeState mergeState, List toMerge) throws IOException { final SortedSetDocValues dvs[] = toMerge.toArray(new SortedSetDocValues[toMerge.size()]); final int numReaders = mergeState.maxDocs.length; // step 1: iterate thru each sub and mark terms still in use TermsEnum liveTerms[] = new TermsEnum[dvs.length]; long[] weights = new long[liveTerms.length]; for (int sub = 0; sub < liveTerms.length; sub++) { SortedSetDocValues dv = dvs[sub]; Bits liveDocs = mergeState.liveDocs[sub]; int maxDoc = mergeState.maxDocs[sub]; if (liveDocs == null) { liveTerms[sub] = dv.termsEnum(); weights[sub] = dv.getValueCount(); } else { LongBitSet bitset = new LongBitSet(dv.getValueCount()); for (int i = 0; i < maxDoc; i++) { if (liveDocs.get(i)) { dv.setDocument(i); long ord; while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { bitset.set(ord); } } } liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); weights[sub] = bitset.cardinality(); } } // step 2: create ordinal map (this conceptually does the "merging") final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT); // step 3: add field addSortedSetField(fieldInfo, // ord -> value new Iterable() { @Override public Iterator iterator() { return new Iterator() { long currentOrd; @Override public boolean hasNext() { return currentOrd < map.getValueCount(); } @Override public BytesRef next() { if (!hasNext()) { throw new NoSuchElementException(); } int segmentNumber = map.getFirstSegmentNumber(currentOrd); long segmentOrd = map.getFirstSegmentOrd(currentOrd); final BytesRef term = dvs[segmentNumber].lookupOrd(segmentOrd); currentOrd++; return term; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }, // doc -> ord count new Iterable() { 
@Override public Iterator iterator() { return new Iterator() { int readerUpto = -1; int docIDUpto; int nextValue; int currentMaxDoc; Bits currentLiveDocs; boolean nextIsSet; @Override public boolean hasNext() { return nextIsSet || setNext(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public Number next() { if (!hasNext()) { throw new NoSuchElementException(); } assert nextIsSet; nextIsSet = false; // TODO make a mutable number return nextValue; } private boolean setNext() { while (true) { if (readerUpto == numReaders) { return false; } if (docIDUpto == currentMaxDoc) { readerUpto++; if (readerUpto < numReaders) { currentLiveDocs = mergeState.liveDocs[readerUpto]; currentMaxDoc = mergeState.maxDocs[readerUpto]; } docIDUpto = 0; continue; } if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { nextIsSet = true; SortedSetDocValues dv = dvs[readerUpto]; dv.setDocument(docIDUpto); nextValue = 0; while (dv.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) { nextValue++; } docIDUpto++; return true; } docIDUpto++; } } }; } }, // ords new Iterable() { @Override public Iterator iterator() { return new Iterator() { int readerUpto = -1; int docIDUpto; long nextValue; int currentMaxDoc; Bits currentLiveDocs; LongValues currentMap; boolean nextIsSet; long ords[] = new long[8]; int ordUpto; int ordLength; @Override public boolean hasNext() { return nextIsSet || setNext(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public Number next() { if (!hasNext()) { throw new NoSuchElementException(); } assert nextIsSet; nextIsSet = false; // TODO make a mutable number return nextValue; } private boolean setNext() { while (true) { if (readerUpto == numReaders) { return false; } if (ordUpto < ordLength) { nextValue = ords[ordUpto]; ordUpto++; nextIsSet = true; return true; } if (docIDUpto == currentMaxDoc) { readerUpto++; if (readerUpto < numReaders) { currentMap = 
map.getGlobalOrds(readerUpto); currentLiveDocs = mergeState.liveDocs[readerUpto]; currentMaxDoc = mergeState.maxDocs[readerUpto]; } docIDUpto = 0; continue; } if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) { assert docIDUpto < currentMaxDoc; SortedSetDocValues dv = dvs[readerUpto]; dv.setDocument(docIDUpto); ordUpto = ordLength = 0; long ord; while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { if (ordLength == ords.length) { ords = ArrayUtil.grow(ords, ordLength+1); } ords[ordLength] = currentMap.get(ord); ordLength++; } docIDUpto++; continue; } docIDUpto++; } } }; } } ); } // TODO: seek-by-ord to nextSetBit static class BitsFilteredTermsEnum extends FilteredTermsEnum { final LongBitSet liveTerms; BitsFilteredTermsEnum(TermsEnum in, LongBitSet liveTerms) { super(in, false); // <-- not passing false here wasted about 3 hours of my time!!!!!!!!!!!!! assert liveTerms != null; this.liveTerms = liveTerms; } @Override protected AcceptStatus accept(BytesRef term) throws IOException { if (liveTerms.get(ord())) { return AcceptStatus.YES; } else { return AcceptStatus.NO; } } } /** Helper: returns true if the given docToValue count contains only at most one value */ public static boolean isSingleValued(Iterable docToValueCount) { for (Number count : docToValueCount) { if (count.longValue() > 1) { return false; } } return true; } /** Helper: returns single-valued view, using {@code missingValue} when count is zero */ public static Iterable singletonView(final Iterable docToValueCount, final Iterable values, final Number missingValue) { assert isSingleValued(docToValueCount); return new Iterable() { @Override public Iterator iterator() { final Iterator countIterator = docToValueCount.iterator(); final Iterator valuesIterator = values.iterator(); return new Iterator() { @Override public boolean hasNext() { return countIterator.hasNext(); } @Override public Number next() { int count = countIterator.next().intValue(); if (count == 0) { return 
missingValue; } else { return valuesIterator.next(); } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy