All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.memory.DirectDocValuesConsumer Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.memory;


import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.BYTES;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.NUMBER;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED_NUMERIC;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED_NUMERIC_SINGLETON;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED_SET;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.SORTED_SET_SINGLETON;
import static org.apache.lucene.codecs.memory.DirectDocValuesProducer.VERSION_CURRENT;

/**
 * Writer for {@link DirectDocValuesFormat}
 */

class DirectDocValuesConsumer extends DocValuesConsumer {
  IndexOutput data, meta;
  final int maxDoc;

  DirectDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.maxDoc();
    boolean success = false;
    try {
      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
      data = state.directory.createOutput(dataName, state.context);
      CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
      meta = state.directory.createOutput(metaName, state.context);
      CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(NUMBER);
    addNumericFieldValues(field, LegacyDocValuesIterables.numericIterable(field, valuesProducer, maxDoc));
  }

  private void addNumericFieldValues(FieldInfo field, Iterable values) throws IOException {
    meta.writeLong(data.getFilePointer());
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    boolean missing = false;

    long count = 0;
    for (Number nv : values) {
      if (nv != null) {
        long v = nv.longValue();
        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);
      } else {
        missing = true;
      }
      count++;
      if (count >= DirectDocValuesFormat.MAX_SORTED_SET_ORDS) {
        throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + DirectDocValuesFormat.MAX_SORTED_SET_ORDS + " values/total ords");
      }
    }
    meta.writeInt((int) count);
    
    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }

    byte byteWidth;
    if (minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
      byteWidth = 1;
    } else if (minValue >= Short.MIN_VALUE && maxValue <= Short.MAX_VALUE) {
      byteWidth = 2;
    } else if (minValue >= Integer.MIN_VALUE && maxValue <= Integer.MAX_VALUE) {
      byteWidth = 4;
    } else {
      byteWidth = 8;
    }
    meta.writeByte(byteWidth);

    for (Number nv : values) {
      long v;
      if (nv != null) {
        v = nv.longValue();
      } else {
        v = 0;
      }

      switch(byteWidth) {
      case 1:
        data.writeByte((byte) v);
        break;
      case 2:
        data.writeShort((short) v);
        break;
      case 4:
        data.writeInt((int) v);
        break;
      case 8:
        data.writeLong(v);
        break;
      }
    }
  }
  
  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      if (meta != null) {
        meta.writeVInt(-1); // write EOF marker
        CodecUtil.writeFooter(meta); // write checksum
      }
      if (data != null) {
        CodecUtil.writeFooter(data);
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(data, meta);
      } else {
        IOUtils.closeWhileHandlingException(data, meta);
      }
      data = meta = null;
    }
  }

  @Override
  public void addBinaryField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(BYTES);
    addBinaryFieldValues(field, LegacyDocValuesIterables.binaryIterable(field, valuesProducer, maxDoc));
  }

  private void addBinaryFieldValues(FieldInfo field, final Iterable values) throws IOException {
    // write the byte[] data
    final long startFP = data.getFilePointer();
    boolean missing = false;
    long totalBytes = 0;
    int count = 0;
    for(BytesRef v : values) {
      if (v != null) {
        data.writeBytes(v.bytes, v.offset, v.length);
        totalBytes += v.length;
        if (totalBytes > DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH) {
          throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, cannot have more than DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH (" + DirectDocValuesFormat.MAX_TOTAL_BYTES_LENGTH + ") bytes");
        }
      } else {
        missing = true;
      }
      count++;
    }

    meta.writeLong(startFP);
    meta.writeInt((int) totalBytes);
    meta.writeInt(count);
    if (missing) {
      long start = data.getFilePointer();
      writeMissingBitset(values);
      meta.writeLong(start);
      meta.writeLong(data.getFilePointer() - start);
    } else {
      meta.writeLong(-1L);
    }
    
    int addr = 0;
    for (BytesRef v : values) {
      data.writeInt(addr);
      if (v != null) {
        addr += v.length;
      }
    }
    data.writeInt(addr);
  }
  
  // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
  // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
  void writeMissingBitset(Iterable values) throws IOException {
    long bits = 0;
    int count = 0;
    for (Object v : values) {
      if (count == 64) {
        data.writeLong(bits);
        count = 0;
        bits = 0;
      }
      if (v != null) {
        bits |= 1L << (count & 0x3f);
      }
      count++;
    }
    if (count > 0) {
      data.writeLong(bits);
    }
  }

  @Override
  public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    meta.writeVInt(field.number);
    meta.writeByte(SORTED);

    // write the ordinals as numerics
    addNumericFieldValues(field, LegacyDocValuesIterables.sortedOrdIterable(valuesProducer, field, maxDoc));
    // write the values as binary
    addBinaryFieldValues(field, LegacyDocValuesIterables.valuesIterable(valuesProducer.getSorted(field)));
  }
  
  @Override
  public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer) throws IOException {

    final Iterable docToValueCount = LegacyDocValuesIterables.sortedNumericToDocCount(valuesProducer, field, maxDoc);
    final Iterable values = LegacyDocValuesIterables.sortedNumericToValues(valuesProducer, field);

    meta.writeVInt(field.number);
    if (isSingleValued(docToValueCount)) {
      meta.writeByte(SORTED_NUMERIC_SINGLETON);
      addNumericFieldValues(field, singletonView(docToValueCount, values, null));
    } else {
      meta.writeByte(SORTED_NUMERIC);

      // First write docToValueCounts, except we "aggregate" the
      // counts so they turn into addresses, and add a final
      // value = the total aggregate:
      addNumericFieldValues(field, countToAddressIterator(docToValueCount));

      // Write values for all docs, appended into one big
      // numerics:
      addNumericFieldValues(field, values);
    }
  }

  // note: this might not be the most efficient... but it's fairly simple
  @Override
  public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    Iterable values = LegacyDocValuesIterables.valuesIterable(valuesProducer.getSortedSet(field));
    Iterable docToOrdCount = LegacyDocValuesIterables.sortedSetOrdCountIterable(valuesProducer, field, maxDoc);
    Iterable ords = LegacyDocValuesIterables.sortedSetOrdsIterable(valuesProducer, field);
    
    meta.writeVInt(field.number);
    
    if (isSingleValued(docToOrdCount)) {
      meta.writeByte(SORTED_SET_SINGLETON);
      // Write ordinals for all docs, appended into one big
      // numerics:
      addNumericFieldValues(field, singletonView(docToOrdCount, ords, -1L));
      
      // write the values as binary
      addBinaryFieldValues(field, values);
    } else {
      meta.writeByte(SORTED_SET);

      // First write docToOrdCounts, except we "aggregate" the
      // counts so they turn into addresses, and add a final
      // value = the total aggregate:
      addNumericFieldValues(field, countToAddressIterator(docToOrdCount));

      // Write ordinals for all docs, appended into one big
      // numerics:
      addNumericFieldValues(field, ords);
      
      // write the values as binary
      addBinaryFieldValues(field, values);
    }
  }
  
  /** 
   * Just aggregates the count values so they become
   * "addresses", and adds one more value in the end
   * (the final sum)
   */ 
  private Iterable countToAddressIterator(final Iterable counts) {
    return new Iterable() {
      @Override
      public Iterator iterator() {
        final Iterator iter = counts.iterator();
        
        return new Iterator() {
          
          long sum;
          boolean ended;
          
          @Override
          public boolean hasNext() {
            return iter.hasNext() || !ended;
          }
          
          @Override
          public Number next() {
            long toReturn = sum;
            
            if (iter.hasNext()) {
              Number n = iter.next();
              if (n != null) {
                sum += n.longValue();
              }
            } else if (!ended) {
              ended = true;
            } else {
              assert false;
            }
            
            return toReturn;
          }
          
          @Override
          public void remove() {
            throw new UnsupportedOperationException();
          }
        };
      }
    };
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy