// org.apache.lucene.codecs.DocValuesConsumer -- Maven / Gradle / Ivy source listing
// (web page header retained as a comment; it is not part of the program text)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.EmptyDocValuesProducer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState; // javadocs
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/** 
 * Abstract API that consumes numeric, binary and
 * sorted docvalues.  Concrete implementations of this
 * actually do "something" with the docvalues (write it into
 * the index in a specific format).
 * <p>
 * The lifecycle is:
 * <ol>
 *   <li>DocValuesConsumer is created by 
 *       {@link DocValuesFormat#fieldsConsumer(SegmentWriteState)}.
 *   <li>{@link #addNumericField}, {@link #addBinaryField},
 *       {@link #addSortedField}, {@link #addSortedSetField},
 *       or {@link #addSortedNumericField} are called for each Numeric,
 *       Binary, Sorted, SortedSet, or SortedNumeric docvalues field. 
 *       The API is a "pull" rather than "push", and the implementation 
 *       is free to iterate over the values multiple times 
 *       ({@link Iterable#iterator()}).
 *   <li>After all fields are added, the consumer is {@link #close}d.
 * </ol>
 *
 * @lucene.experimental
 */
public abstract class DocValuesConsumer implements Closeable {
  
  /**
   * Sole constructor. (For invocation by subclass
   * constructors, typically implicit.) It takes no arguments and has an
   * empty body, so subclasses have nothing to pass or initialize here.
   */
  protected DocValuesConsumer() {}

  /**
   * Writes numeric docvalues for a field.
   * <p>
   * The values are pulled from {@code valuesProducer}. The producer that
   * {@link #mergeNumericField} passes in only answers
   * {@code getNumeric} for the very same {@code FieldInfo} instance given
   * here as {@code field}, and throws {@link IllegalArgumentException}
   * for any other instance.
   *
   * @param field field information
   * @param valuesProducer Numeric values to write.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes binary docvalues for a field.
   * <p>
   * The values are pulled from {@code valuesProducer}. The producer that
   * {@link #mergeBinaryField} passes in only answers {@code getBinary}
   * for the very same {@code FieldInfo} instance given here as
   * {@code field}, and throws {@link IllegalArgumentException} otherwise.
   *
   * @param field field information
   * @param valuesProducer Binary values to write.
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes pre-sorted binary docvalues for a field.
   * <p>
   * The values are pulled from {@code valuesProducer}. The producer that
   * {@link #mergeSortedField} passes in only answers {@code getSorted}
   * for the very same {@code FieldInfo} instance given here as
   * {@code field}, and throws {@link IllegalArgumentException} otherwise.
   *
   * @param field field information
   * @param valuesProducer produces the values and ordinals to write
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;
  
  /**
   * Writes pre-sorted numeric docvalues for a field.
   * <p>
   * The values are pulled from {@code valuesProducer}. The producer that
   * {@link #mergeSortedNumericField} passes in only answers
   * {@code getSortedNumeric} for the very same {@code FieldInfo} instance
   * given here as {@code field}, and throws
   * {@link IllegalArgumentException} otherwise.
   *
   * @param field field information
   * @param valuesProducer produces the values to write
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;

  /**
   * Writes pre-sorted set docvalues for a field.
   * <p>
   * The values are pulled from {@code valuesProducer}; the class-level
   * documentation describes this API as "pull" rather than "push" and allows
   * an implementation to iterate over the values more than once.
   *
   * @param field field information
   * @param valuesProducer produces the values to write
   * @throws IOException if an I/O error occurred.
   */
  public abstract void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException;
  
  /**
   * Merges the doc values of every field listed in <code>mergeState</code>.
   * <p>
   * Every non-null source producer is first asked to
   * {@code checkIntegrity()}. Then each merged field whose doc values type
   * is not {@code NONE} is routed to {@link #mergeNumericField},
   * {@link #mergeBinaryField}, {@link #mergeSortedField},
   * {@link #mergeSortedSetField} or {@link #mergeSortedNumericField},
   * according to that type. Implementations can override this method
   * for more sophisticated merging (bulk-byte copying, etc).
   *
   * @param mergeState the state of the merge being performed
   * @throws IOException if an I/O error occurred.
   */
  public void merge(MergeState mergeState) throws IOException {
    // Integrity pass: runs over all producers before any field is merged.
    final DocValuesProducer[] producers = mergeState.docValuesProducers;
    for (int i = 0; i < producers.length; i++) {
      final DocValuesProducer producer = producers[i];
      if (producer != null) {
        producer.checkIntegrity();
      }
    }

    // Dispatch pass: one merge call per field that actually has doc values.
    for (FieldInfo info : mergeState.mergeFieldInfos) {
      final DocValuesType dvType = info.getDocValuesType();
      if (dvType == DocValuesType.NONE) {
        continue;
      }
      if (dvType == DocValuesType.NUMERIC) {
        mergeNumericField(info, mergeState);
      } else if (dvType == DocValuesType.BINARY) {
        mergeBinaryField(info, mergeState);
      } else if (dvType == DocValuesType.SORTED) {
        mergeSortedField(info, mergeState);
      } else if (dvType == DocValuesType.SORTED_SET) {
        mergeSortedSetField(info, mergeState);
      } else if (dvType == DocValuesType.SORTED_NUMERIC) {
        mergeSortedNumericField(info, mergeState);
      } else {
        // Unknown type: fail loudly rather than silently dropping the field.
        throw new AssertionError("type=" + dvType);
      }
    }
  }

  /**
   * Merge cursor for a single numeric doc values source: pairs the
   * source iterator with the doc map handed to {@link DocIDMerger.Sub}.
   */
  private static class NumericDocValuesSub extends DocIDMerger.Sub {

    /** The source iterator; read directly by the merged view. */
    final NumericDocValues values;

    public NumericDocValuesSub(MergeState.DocMap docMap, NumericDocValues values) {
      super(docMap);
      // The iterator must not have been advanced yet.
      assert values.docID() == -1;
      this.values = values;
    }

    /** Advances the underlying iterator and returns its new doc ID. */
    @Override
    public int nextDoc() throws IOException {
      final int next = values.nextDoc();
      return next;
    }
  }
  
  /**
   * Merges the numeric docvalues from MergeState.
   * 
   * The default implementation calls {@link #addNumericField}, passing
   * a DocValuesProducer that merges and filters deleted documents on the fly.
   */
  public void mergeNumericField(final FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
    addNumericField(mergeFieldInfo,
                    new EmptyDocValuesProducer() {
                      @Override
                      public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException {
                        if (fieldInfo != mergeFieldInfo) {
                          throw new IllegalArgumentException("wrong fieldInfo");
                        }

                        List subs = new ArrayList<>();
                        assert mergeState.docMaps.length == mergeState.docValuesProducers.length;
                        long cost = 0;
                        for (int i=0;i docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

                        final long finalCost = cost;
                        
                        return new NumericDocValues() {
                          private int docID = -1;
                          private NumericDocValuesSub current;

                          @Override
                          public int docID() {
                            return docID;
                          }

                          @Override
                          public int nextDoc() throws IOException {
                            current = docIDMerger.next();
                            if (current == null) {
                              docID = NO_MORE_DOCS;
                            } else {
                              docID = current.mappedDocID;
                            }
                            return docID;
                          }

                          @Override
                          public int advance(int target) throws IOException {
                            throw new UnsupportedOperationException();
                          }

                          @Override
                          public boolean advanceExact(int target) throws IOException {
                            throw new UnsupportedOperationException();
                          }

                          @Override
                          public long cost() {
                            return finalCost;
                          }

                          @Override
                          public long longValue() throws IOException {
                            return current.values.longValue();
                          }
                        };
                      }
                    });
  }
  
  /**
   * Merge cursor for a single binary doc values source: pairs the
   * source iterator with the doc map handed to {@link DocIDMerger.Sub}.
   */
  private static class BinaryDocValuesSub extends DocIDMerger.Sub {

    /** The source iterator; read directly by the merged view. */
    final BinaryDocValues values;

    public BinaryDocValuesSub(MergeState.DocMap docMap, BinaryDocValues values) {
      super(docMap);
      // The iterator must not have been advanced yet.
      assert values.docID() == -1;
      this.values = values;
    }

    /** Advances the underlying iterator and returns its new doc ID. */
    @Override
    public int nextDoc() throws IOException {
      final int next = values.nextDoc();
      return next;
    }
  }

  /**
   * Merges the binary docvalues from MergeState.
   * 

   * The default implementation calls {@link #addBinaryField}, passing
   * a DocValuesProducer that merges and filters deleted documents on the fly.
   */
  public void mergeBinaryField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
    addBinaryField(mergeFieldInfo,
                   new EmptyDocValuesProducer() {
                     @Override
                     public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException {
                       if (fieldInfo != mergeFieldInfo) {
                         throw new IllegalArgumentException("wrong fieldInfo");
                       }
                   
                       List subs = new ArrayList<>();

                       long cost = 0;
                       for (int i=0;i docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
                       final long finalCost = cost;
                       
                       return new BinaryDocValues() {
                         private BinaryDocValuesSub current;
                         private int docID = -1;

                         @Override
                         public int docID() {
                           return docID;
                         }

                         @Override
                         public int nextDoc() throws IOException {
                           current = docIDMerger.next();
                           if (current == null) {
                             docID = NO_MORE_DOCS;
                           } else {
                             docID = current.mappedDocID;
                           }
                           return docID;
                         }

                         @Override
                         public int advance(int target) throws IOException {
                           throw new UnsupportedOperationException();
                         }

                         @Override
                         public boolean advanceExact(int target) throws IOException {
                           throw new UnsupportedOperationException();
                         }

                         @Override
                         public long cost() {
                           return finalCost;
                         }

                         @Override
                         public BytesRef binaryValue() throws IOException {
                           return current.values.binaryValue();
                         }
                       };
                     }
                   });
  }

  /**
   * Merge cursor for a single sorted numeric doc values source: pairs the
   * source iterator with the doc map handed to {@link DocIDMerger.Sub}.
   */
  private static class SortedNumericDocValuesSub extends DocIDMerger.Sub {

    /** The source iterator; read directly by the merged view. */
    final SortedNumericDocValues values;

    public SortedNumericDocValuesSub(MergeState.DocMap docMap, SortedNumericDocValues values) {
      super(docMap);
      // The iterator must not have been advanced yet.
      assert values.docID() == -1;
      this.values = values;
    }

    /** Advances the underlying iterator and returns its new doc ID. */
    @Override
    public int nextDoc() throws IOException {
      final int next = values.nextDoc();
      return next;
    }
  }

  /**
   * Merges the sorted docvalues from toMerge.
   * 

   * The default implementation calls {@link #addSortedNumericField}, passing
   * iterables that filter deleted documents.
   */
  public void mergeSortedNumericField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
    
    addSortedNumericField(mergeFieldInfo,
                          new EmptyDocValuesProducer() {
                            @Override
                            public SortedNumericDocValues getSortedNumeric(FieldInfo fieldInfo) throws IOException {
                              if (fieldInfo != mergeFieldInfo) {
                                throw new IllegalArgumentException("wrong FieldInfo");
                              }
                              
                              // We must make new iterators + DocIDMerger for each iterator:
                              List subs = new ArrayList<>();
                              long cost = 0;
                              for (int i=0;i docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

                              return new SortedNumericDocValues() {

                                private int docID = -1;
                                private SortedNumericDocValuesSub currentSub;

                                @Override
                                public int docID() {
                                  return docID;
                                }
                                
                                @Override
                                public int nextDoc() throws IOException {
                                  currentSub = docIDMerger.next();
                                  if (currentSub == null) {
                                    docID = NO_MORE_DOCS;
                                  } else {
                                    docID = currentSub.mappedDocID;
                                  }

                                  return docID;
                                }

                                @Override
                                public int advance(int target) throws IOException {
                                  throw new UnsupportedOperationException();
                                }

                                @Override
                                public boolean advanceExact(int target) throws IOException {
                                  throw new UnsupportedOperationException();
                                }

                                @Override
                                public int docValueCount() {
                                  return currentSub.values.docValueCount();
                                }

                                @Override
                                public long cost() {
                                  return finalCost;
                                }

                                @Override
                                public long nextValue() throws IOException {
                                  return currentSub.values.nextValue();
                                }
                              };
                            }
                          });
  }

  /**
   * A merged {@link TermsEnum}. This helps avoid relying on the default terms enum,
   * which calls {@link SortedDocValues#lookupOrd(int)} or
   * {@link SortedSetDocValues#lookupOrd(long)} on every call to {@link TermsEnum#next()}.
   * <p>
   * Only {@link #next()}, {@link #term()} and {@link #ord()} are supported;
   * every other operation throws {@link UnsupportedOperationException}.
   */
  private static class MergedTermsEnum extends TermsEnum {

    private final TermsEnum[] subs;
    private final OrdinalMap ordinalMap;
    private final long valueCount;
    /** Ordinal of the current term; -1 before the first call to {@link #next()}. */
    private long ord = -1;
    /** Term most recently returned by a sub enum, or null if none yet. */
    private BytesRef term;

    MergedTermsEnum(OrdinalMap ordinalMap, TermsEnum[] subs) {
      this.subs = subs;
      this.ordinalMap = ordinalMap;
      this.valueCount = ordinalMap.getValueCount();
    }

    @Override
    public BytesRef term() throws IOException {
      return term;
    }

    @Override
    public long ord() throws IOException {
      return ord;
    }

    @Override
    public BytesRef next() throws IOException {
      ord++;
      if (ord >= valueCount) {
        return null;
      }
      final int segment = ordinalMap.getFirstSegmentNumber(ord);
      final TermsEnum segmentTerms = subs[segment];
      final long targetOrd = ordinalMap.getFirstSegmentOrd(ord);
      // Step the segment's enum at least once, then on until it reaches the wanted ordinal.
      term = segmentTerms.next();
      while (segmentTerms.ord() < targetOrd) {
        term = segmentTerms.next();
      }
      assert segmentTerms.ord() == targetOrd;
      return term;
    }

    // ---- unsupported operations ----

    @Override
    public AttributeSource attributes() {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public void seekExact(BytesRef term, TermState state) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docFreq() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public long totalTermFreq() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public TermState termState() throws IOException {
      throw new UnsupportedOperationException();
    }

  }

  /**
   * Merge cursor for a single sorted doc values source: pairs the source
   * iterator and an ordinal lookup with the doc map handed to
   * {@link DocIDMerger.Sub}.
   */
  private static class SortedDocValuesSub extends DocIDMerger.Sub {

    /** The source iterator; read directly by the merged view. */
    final SortedDocValues values;
    /** Lookup applied by the merged view to this source's ordinals. */
    final LongValues map;

    public SortedDocValuesSub(MergeState.DocMap docMap, SortedDocValues values, LongValues map) {
      super(docMap);
      // The iterator must not have been advanced yet.
      assert values.docID() == -1;
      this.map = map;
      this.values = values;
    }

    /** Advances the underlying iterator and returns its new doc ID. */
    @Override
    public int nextDoc() throws IOException {
      final int next = values.nextDoc();
      return next;
    }
  }

  /**
   * Merges the sorted docvalues from toMerge.
   * 

   * The default implementation calls {@link #addSortedField}, passing
   * an Iterable that merges ordinals and values and filters deleted documents .
   */
  public void mergeSortedField(FieldInfo fieldInfo, final MergeState mergeState) throws IOException {
    List toMerge = new ArrayList<>();
    for (int i=0;i= 0) {
              bitset.set(ord);
            }
          }
        }
        liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
        weights[sub] = bitset.cardinality();
      }
    }
    
    // step 2: create ordinal map (this conceptually does the "merging")
    final OrdinalMap map = OrdinalMap.build(null, liveTerms, weights, PackedInts.COMPACT);
    
    // step 3: add field
    addSortedField(fieldInfo,
                   new EmptyDocValuesProducer() {
                     @Override
                     public SortedDocValues getSorted(FieldInfo fieldInfoIn) throws IOException {
                       if (fieldInfoIn != fieldInfo) {
                         throw new IllegalArgumentException("wrong FieldInfo");
                       }

                       // We must make new iterators + DocIDMerger for each iterator:

                       List subs = new ArrayList<>();
                       long cost = 0;
                       for (int i=0;i docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
                       
                       return new SortedDocValues() {
                         private int docID = -1;
                         private int ord;

                         @Override
                         public int docID() {
                           return docID;
                         }

                         @Override
                         public int nextDoc() throws IOException {
                           SortedDocValuesSub sub = docIDMerger.next();
                           if (sub == null) {
                             return docID = NO_MORE_DOCS;
                           }
                           int subOrd = sub.values.ordValue();
                           assert subOrd != -1;
                           ord = (int) sub.map.get(subOrd);
                           docID = sub.mappedDocID;
                           return docID;
                         }

                         @Override
                         public int ordValue() {
                           return ord;
                         }
                         
                         @Override
                         public int advance(int target) {
                           throw new UnsupportedOperationException();
                         }

                         @Override
                         public boolean advanceExact(int target) throws IOException {
                           throw new UnsupportedOperationException();
                         }

                         @Override
                         public long cost() {
                           return finalCost;
                         }

                         @Override
                         public int getValueCount() {
                           return (int) map.getValueCount();
                         }
                         
                         @Override
                         public BytesRef lookupOrd(int ord) throws IOException {
                           int segmentNumber = map.getFirstSegmentNumber(ord);
                           int segmentOrd = (int) map.getFirstSegmentOrd(ord);
                           return dvs[segmentNumber].lookupOrd(segmentOrd);
                         }

                         @Override
                         public TermsEnum termsEnum() throws IOException {
                           TermsEnum[] subs = new TermsEnum[toMerge.size()];
                           for (int sub = 0; sub < subs.length; ++sub) {
                             subs[sub] = toMerge.get(sub).termsEnum();
                           }
                           return new MergedTermsEnum(map, subs);
                         }
                       };
                     }
                   });
  }
  
  /**
   * Merge cursor for a single sorted set doc values source: pairs the
   * source iterator and an ordinal lookup with the doc map handed to
   * {@link DocIDMerger.Sub}.
   */
  private static class SortedSetDocValuesSub extends DocIDMerger.Sub {

    /** The source iterator; read directly by the merged view. */
    final SortedSetDocValues values;
    /** Lookup applied by the merged view to this source's ordinals. */
    final LongValues map;

    public SortedSetDocValuesSub(MergeState.DocMap docMap, SortedSetDocValues values, LongValues map) {
      super(docMap);
      // The iterator must not have been advanced yet.
      assert values.docID() == -1;
      this.map = map;
      this.values = values;
    }

    /** Advances the underlying iterator and returns its new doc ID. */
    @Override
    public int nextDoc() throws IOException {
      final int next = values.nextDoc();
      return next;
    }

    /** Debug rendering showing the mapped doc ID and the wrapped iterator. */
    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("SortedSetDocValuesSub(mappedDocID=").append(mappedDocID);
      sb.append(" values=").append(values).append(")");
      return sb.toString();
    }
  }

  /**
   * Merges the sortedset docvalues from toMerge.
   * 
   * The default implementation calls {@link #addSortedSetField}, passing
   * an Iterable that merges ordinals and values and filters deleted documents .
   */
  public void mergeSortedSetField(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {

    List toMerge = new ArrayList<>();
    for (int i=0;i subs = new ArrayList<>();

                          long cost = 0;
                          
                          for (int i=0;i docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
                          
                          final long finalCost = cost;

                          return new SortedSetDocValues() {
                            private int docID = -1;
                            private SortedSetDocValuesSub currentSub;

                            @Override
                            public int docID() {
                              return docID;
                            }

                            @Override
                            public int nextDoc() throws IOException {
                              currentSub = docIDMerger.next();
                              if (currentSub == null) {
                                docID = NO_MORE_DOCS;
                              } else {
                                docID = currentSub.mappedDocID;
                              }

                              return docID;
                            }

                            @Override
                            public int advance(int target) throws IOException {
                              throw new UnsupportedOperationException();
                            }

                            @Override
                            public boolean advanceExact(int target) throws IOException {
                              throw new UnsupportedOperationException();
                            }

                            @Override
                            public long nextOrd() throws IOException {
                              long subOrd = currentSub.values.nextOrd();
                              if (subOrd == NO_MORE_ORDS) {
                                return NO_MORE_ORDS;
                              }
                              return currentSub.map.get(subOrd);
                            }

                            @Override
                            public long cost() {
                              return finalCost;
                            }

                            @Override
                            public BytesRef lookupOrd(long ord) throws IOException {
                              int segmentNumber = map.getFirstSegmentNumber(ord);
                              long segmentOrd = map.getFirstSegmentOrd(ord);
                              return toMerge.get(segmentNumber).lookupOrd(segmentOrd);
                            }

                            @Override
                            public long getValueCount() {
                              return map.getValueCount();
                            }

                            @Override
                            public TermsEnum termsEnum() throws IOException {
                              TermsEnum[] subs = new TermsEnum[toMerge.size()];
                              for (int sub = 0; sub < subs.length; ++sub) {
                                subs[sub] = toMerge.get(sub).termsEnum();
                              }
                              return new MergedTermsEnum(map, subs);
                            }
                          };
                        }
                      });
  }
  
  // TODO: seek-by-ord to nextSetBit
  /**
   * Terms enum that only lets through the terms whose ordinal is set in
   * a given {@link LongBitSet}.
   */
  static class BitsFilteredTermsEnum extends FilteredTermsEnum {
    /** Ordinals of the terms to keep. */
    final LongBitSet liveTerms;

    BitsFilteredTermsEnum(TermsEnum in, LongBitSet liveTerms) {
      // The second argument must stay false: the original author notes that leaving
      // it out cost hours of debugging. (Presumably the start-with-seek flag of
      // FilteredTermsEnum -- confirm against that class.)
      super(in, false);
      assert liveTerms != null;
      this.liveTerms = liveTerms;
    }

    /** Accepts the current term iff its ordinal is marked in {@link #liveTerms}. */
    @Override
    protected AcceptStatus accept(BytesRef term) throws IOException {
      final boolean live = liveTerms.get(ord());
      return live ? AcceptStatus.YES : AcceptStatus.NO;
    }
  }
  
  /**
   * Helper: returns true if the given docToValue count contains only at most one value.
   *
   * @param docToValueCount per-document value counts
   * @return {@code false} as soon as a count greater than one is seen,
   *         {@code true} otherwise (including for an empty iterable)
   */
  public static boolean isSingleValued(Iterable<Number> docToValueCount) {
    for (Number count : docToValueCount) {
      if (count.longValue() > 1) {
        return false;
      }
    }
    return true;
  }
  
  /**
   * Helper: returns single-valued view, using {@code missingValue} when count is zero.
   * <p>
   * Each call to {@code iterator()} on the result starts fresh iterators
   * over both inputs. The returned iterator yields one element per entry
   * of {@code docToValueCount}: {@code missingValue} when that count is
   * zero, otherwise the next element of {@code values}.
   *
   * @param docToValueCount per-document value counts; asserted to be single-valued
   * @param values the values, in order, for the documents whose count is non-zero
   * @param missingValue the element to return for documents with a zero count
   * @return a lazy view with exactly one element per document
   */
  public static Iterable<Number> singletonView(final Iterable<Number> docToValueCount, final Iterable<Number> values, final Number missingValue) {
    assert isSingleValued(docToValueCount);
    return new Iterable<Number>() {

      @Override
      public Iterator<Number> iterator() {
        final Iterator<Number> countIterator = docToValueCount.iterator();
        final Iterator<Number> valuesIterator = values.iterator();
        return new Iterator<Number>() {

          @Override
          public boolean hasNext() {
            // The counts drive the iteration: one output element per count.
            return countIterator.hasNext();
          }

          @Override
          public Number next() {
            int count = countIterator.next().intValue();
            if (count == 0) {
              return missingValue;
            } else {
              return valuesIterator.next();
            }
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };
      }
    };
  }
}