/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.codecs.compressing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsReader.ChunkIterator;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
 * {@link StoredFieldsWriter} impl for {@link CompressingStoredFieldsFormat}.
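 * <p>
 * Documents are buffered in memory and flushed as a single compressed chunk
 * once the buffer reaches {@code chunkSize} bytes or
 * {@code MAX_DOCUMENTS_PER_CHUNK} documents, whichever comes first.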
 * @lucene.experimental
 */
public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {

  // hard limit on the maximum number of documents per chunk
  static final int MAX_DOCUMENTS_PER_CHUNK = 128;

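  // type tags for stored values, packed into the low TYPE_BITS bits of the
  // per-field header written by writeField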
  static final int         STRING = 0x00;
  static final int       BYTE_ARR = 0x01;
  static final int    NUMERIC_INT = 0x02;
  static final int  NUMERIC_FLOAT = 0x03;
  static final int   NUMERIC_LONG = 0x04;
  static final int NUMERIC_DOUBLE = 0x05;

  static final int TYPE_BITS = PackedInts.bitsRequired(NUMERIC_DOUBLE);
  static final int TYPE_MASK = (int) PackedInts.maxValue(TYPE_BITS);

  static final String CODEC_SFX_IDX = "Index";
  static final String CODEC_SFX_DAT = "Data";
  static final int VERSION_START = 0;
  static final int VERSION_BIG_CHUNKS = 1;
  static final int VERSION_CURRENT = VERSION_BIG_CHUNKS;

  private final Directory directory;
  private final String segment;
  private final String segmentSuffix;
  private CompressingStoredFieldsIndexWriter indexWriter;
  private IndexOutput fieldsStream;

  private final CompressionMode compressionMode;
  private final Compressor compressor;
  private final int chunkSize;

  private final GrowableByteArrayDataOutput bufferedDocs;
  private int[] numStoredFields; // number of stored fields
  private int[] endOffsets; // end offsets in bufferedDocs
  private int docBase; // doc ID at the beginning of the chunk
  private int numBufferedDocs; // docBase + numBufferedDocs == current doc ID

  /** Sole constructor. */
  public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
      String formatName, CompressionMode compressionMode, int chunkSize) throws IOException {
    assert directory != null;
    this.directory = directory;
    this.segment = si.name;
    this.segmentSuffix = segmentSuffix;
    this.compressionMode = compressionMode;
    this.compressor = compressionMode.newCompressor();
    this.chunkSize = chunkSize;
    this.docBase = 0;
    this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
    this.numStoredFields = new int[16];
    this.endOffsets = new int[16];
    this.numBufferedDocs = 0;

    boolean success = false;
    IndexOutput indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION), context);
    try {
      fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION), context);

      final String codecNameIdx = formatName + CODEC_SFX_IDX;
      final String codecNameDat = formatName + CODEC_SFX_DAT;
      CodecUtil.writeHeader(indexStream, codecNameIdx, VERSION_CURRENT);
      CodecUtil.writeHeader(fieldsStream, codecNameDat, VERSION_CURRENT);
      assert CodecUtil.headerLength(codecNameDat) == fieldsStream.getFilePointer();
      assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();

      indexWriter = new CompressingStoredFieldsIndexWriter(indexStream);
      indexStream = null;

      fieldsStream.writeVInt(chunkSize);
      fieldsStream.writeVInt(PackedInts.VERSION_CURRENT);

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(indexStream);
        abort();
      }
    }
  }

  @Override
  public void close() throws IOException {
    try {
      IOUtils.close(fieldsStream, indexWriter);
    } finally {
      fieldsStream = null;
      indexWriter = null;
    }
  }

  @Override
  public void startDocument(int numStoredFields) throws IOException {
    if (numBufferedDocs == this.numStoredFields.length) {
      final int newLength = ArrayUtil.oversize(numBufferedDocs + 1, 4);
      this.numStoredFields = Arrays.copyOf(this.numStoredFields, newLength);
      endOffsets = Arrays.copyOf(endOffsets, newLength);
    }
    this.numStoredFields[numBufferedDocs] = numStoredFields;
    ++numBufferedDocs;
  }

  @Override
  public void finishDocument() throws IOException {
    endOffsets[numBufferedDocs - 1] = bufferedDocs.length;
    if (triggerFlush()) {
      flush();
    }
  }

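  /**
   * Writes {@code length} values from {@code values} to {@code out}: a single
   * VInt when there is only one value, a 0 token followed by the shared value
   * when all values are equal, otherwise the required bit width followed by a
   * packed block of all values.
   */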
  private static void saveInts(int[] values, int length, DataOutput out) throws IOException {
    assert length > 0;
    if (length == 1) {
      out.writeVInt(values[0]);
    } else {
      boolean allEqual = true;
      for (int i = 1; i < length; ++i) {
        if (values[i] != values[0]) {
          allEqual = false;
          break;
        }
      }
      if (allEqual) {
        out.writeVInt(0);
        out.writeVInt(values[0]);
      } else {
        long max = 0;
        for (int i = 0; i < length; ++i) {
          max |= values[i];
        }
        final int bitsRequired = PackedInts.bitsRequired(max);
        out.writeVInt(bitsRequired);
        final PackedInts.Writer w = PackedInts.getWriterNoHeader(out, PackedInts.Format.PACKED, length, bitsRequired, 1);
        for (int i = 0; i < length; ++i) {
          w.add(values[i]);
        }
        w.finish();
      }
    }
  }

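  /** Writes the chunk header: doc base, number of buffered docs, field counts and doc lengths. */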
  private void writeHeader(int docBase, int numBufferedDocs, int[] numStoredFields, int[] lengths) throws IOException {
    // save docBase and numBufferedDocs
    fieldsStream.writeVInt(docBase);
    fieldsStream.writeVInt(numBufferedDocs);

    // save numStoredFields
    saveInts(numStoredFields, numBufferedDocs, fieldsStream);

    // save lengths
    saveInts(lengths, numBufferedDocs, fieldsStream);
  }

  private boolean triggerFlush() {
    return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
        numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK;
  }

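  /**
   * Flushes the buffered documents as one chunk: records the chunk start in the
   * index, writes the chunk header, compresses the buffered bytes (sliced into
   * {@code chunkSize} blocks when the buffer holds at least twice the chunk
   * size) and resets the buffer.
   */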
  private void flush() throws IOException {
    indexWriter.writeIndex(numBufferedDocs, fieldsStream.getFilePointer());

    // transform end offsets into lengths
    final int[] lengths = endOffsets;
    for (int i = numBufferedDocs - 1; i > 0; --i) {
      lengths[i] = endOffsets[i] - endOffsets[i - 1];
      assert lengths[i] >= 0;
    }
    writeHeader(docBase, numBufferedDocs, numStoredFields, lengths);

    // compress stored fields to fieldsStream
    if (bufferedDocs.length >= 2 * chunkSize) {
      // big chunk, slice it
      for (int compressed = 0; compressed < bufferedDocs.length; compressed += chunkSize) {
        compressor.compress(bufferedDocs.bytes, compressed, Math.min(chunkSize, bufferedDocs.length - compressed), fieldsStream);
      }
    } else {
      compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
    }

    // reset
    docBase += numBufferedDocs;
    numBufferedDocs = 0;
    bufferedDocs.length = 0;
  }

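  // Each stored field is written as a single VLong header (field number plus
  // type tag) followed by its value: length-prefixed bytes, a string, or a
  // fixed-width numeric.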
  @Override
  public void writeField(FieldInfo info, IndexableField field)
      throws IOException {
    int bits = 0;
    final BytesRef bytes;
    final String string;

    Number number = field.numericValue();
    if (number != null) {
      if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
        bits = NUMERIC_INT;
      } else if (number instanceof Long) {
        bits = NUMERIC_LONG;
      } else if (number instanceof Float) {
        bits = NUMERIC_FLOAT;
      } else if (number instanceof Double) {
        bits = NUMERIC_DOUBLE;
      } else {
        throw new IllegalArgumentException("cannot store numeric type " + number.getClass());
      }
      string = null;
      bytes = null;
    } else {
      bytes = field.binaryValue();
      if (bytes != null) {
        bits = BYTE_ARR;
        string = null;
      } else {
        bits = STRING;
        string = field.stringValue();
        if (string == null) {
          throw new IllegalArgumentException("field " + field.name() + " is stored but does not have binaryValue, stringValue nor numericValue");
        }
      }
    }

    final long infoAndBits = (((long) info.number) << TYPE_BITS) | bits;
    bufferedDocs.writeVLong(infoAndBits);

    if (bytes != null) {
      bufferedDocs.writeVInt(bytes.length);
      bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
    } else if (string != null) {
      bufferedDocs.writeString(field.stringValue());
    } else {
      if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
        bufferedDocs.writeInt(number.intValue());
      } else if (number instanceof Long) {
        bufferedDocs.writeLong(number.longValue());
      } else if (number instanceof Float) {
        bufferedDocs.writeInt(Float.floatToIntBits(number.floatValue()));
      } else if (number instanceof Double) {
        bufferedDocs.writeLong(Double.doubleToLongBits(number.doubleValue()));
      } else {
        throw new AssertionError("Cannot get here");
      }
    }
  }

  @Override
  public void abort() {
    IOUtils.closeWhileHandlingException(this);
    IOUtils.deleteFilesIgnoringExceptions(directory,
        IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION),
        IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION));
  }

  @Override
  public void finish(FieldInfos fis, int numDocs) throws IOException {
    if (numBufferedDocs > 0) {
      flush();
    } else {
      assert bufferedDocs.length == 0;
    }
    if (docBase != numDocs) {
      throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
    }
    indexWriter.finish(numDocs);
    assert bufferedDocs.length == 0;
  }

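  // Bulk merge optimization: when the matching reader uses the same format
  // version, compression mode and chunk size, whole chunks with no deletions
  // and a compatible size are copied in compressed form; everything else is
  // re-buffered and re-compressed document by document.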
  @Override
  public int merge(MergeState mergeState) throws IOException {
    int docCount = 0;
    int idx = 0;

    for (AtomicReader reader : mergeState.readers) {
      final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
      CompressingStoredFieldsReader matchingFieldsReader = null;
      if (matchingSegmentReader != null) {
        final StoredFieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
        // we can only bulk-copy if the matching reader is also a CompressingStoredFieldsReader
        if (fieldsReader != null && fieldsReader instanceof CompressingStoredFieldsReader) {
          matchingFieldsReader = (CompressingStoredFieldsReader) fieldsReader;
        }
      }

      final int maxDoc = reader.maxDoc();
      final Bits liveDocs = reader.getLiveDocs();

      if (matchingFieldsReader == null
          || matchingFieldsReader.getVersion() != VERSION_CURRENT // means reader version is not the same as the writer version
          || matchingFieldsReader.getCompressionMode() != compressionMode
          || matchingFieldsReader.getChunkSize() != chunkSize) { // the way data is decompressed depends on the chunk size
        // naive merge...
        for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
          Document doc = reader.document(i);
          addDocument(doc, mergeState.fieldInfos);
          ++docCount;
          mergeState.checkAbort.work(300);
        }
      } else {
        int docID = nextLiveDoc(0, liveDocs, maxDoc);
        if (docID < maxDoc) {
          // not all docs were deleted
          final ChunkIterator it = matchingFieldsReader.chunkIterator(docID);
          int[] startOffsets = new int[0];
          do {
            // go to the next chunk that contains docID
            it.next(docID);
            // transform lengths into offsets
            if (startOffsets.length < it.chunkDocs) {
              startOffsets = new int[ArrayUtil.oversize(it.chunkDocs, 4)];
            }
            for (int i = 1; i < it.chunkDocs; ++i) {
              startOffsets[i] = startOffsets[i - 1] + it.lengths[i - 1];
            }

            if (numBufferedDocs == 0 // starting a new chunk
                && startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
                && startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
                && nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk
              assert docID == it.docBase;

              // no need to decompress, just copy data
              indexWriter.writeIndex(it.chunkDocs, fieldsStream.getFilePointer());
              writeHeader(this.docBase, it.chunkDocs, it.numStoredFields, it.lengths);
              it.copyCompressedData(fieldsStream);
              this.docBase += it.chunkDocs;
              docID = nextLiveDoc(it.docBase + it.chunkDocs, liveDocs, maxDoc);
              docCount += it.chunkDocs;
              mergeState.checkAbort.work(300 * it.chunkDocs);
            } else {
              // decompress
              it.decompress();
              if (startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] != it.bytes.length) {
                throw new CorruptIndexException("Corrupted: expected chunk size=" + (startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1]) + ", got " + it.bytes.length);
              }
              // copy non-deleted docs
              for (; docID < it.docBase + it.chunkDocs; docID = nextLiveDoc(docID + 1, liveDocs, maxDoc)) {
                final int diff = docID - it.docBase;
                startDocument(it.numStoredFields[diff]);
                bufferedDocs.writeBytes(it.bytes.bytes, it.bytes.offset + startOffsets[diff], it.lengths[diff]);
                finishDocument();
                ++docCount;
                mergeState.checkAbort.work(300);
              }
            }
          } while (docID < maxDoc);
        }
      }
    }
    finish(mergeState.fieldInfos, docCount);
    return docCount;
  }

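  /** Returns the first live doc on or after {@code doc}, or {@code maxDoc} if none remains. */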
  private static int nextLiveDoc(int doc, Bits liveDocs, int maxDoc) {
    if (liveDocs == null) {
      return doc;
    }
    while (doc < maxDoc && !liveDocs.get(doc)) {
      ++doc;
    }
    return doc;
  }

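  /** Returns the first deleted doc on or after {@code doc}, or {@code maxDoc} if none remains. */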
  private static int nextDeletedDoc(int doc, Bits liveDocs, int maxDoc) {
    if (liveDocs == null) {
      return maxDoc;
    }
    while (doc < maxDoc && liveDocs.get(doc)) {
      ++doc;
    }
    return doc;
  }

}