org.apache.lucene.index.FreqProxTermsWriter

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntBlockPool;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LSBRadixSorter;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.TimSorter;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.packed.PackedInts;

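/**
 * In-memory postings writer: collects each field's terms, frequencies and proximity data during
 * indexing and, at flush time, hands the sorted per-field postings to the codec's {@link
 * FieldsConsumer}.
 */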
final class FreqProxTermsWriter extends TermsHash {

  FreqProxTermsWriter(
      final IntBlockPool.Allocator intBlockAllocator,
      final ByteBlockPool.Allocator byteBlockAllocator,
      Counter bytesUsed,
      TermsHash termVectors) {
    super(intBlockAllocator, byteBlockAllocator, bytesUsed, termVectors);
  }

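  /**
   * Applies buffered term deletes to the newly flushed segment: for each deleted term, documents
   * whose doc ID is below the limit recorded when the delete was buffered are cleared from {@code
   * state.liveDocs}.
   */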
  private void applyDeletes(SegmentWriteState state, Fields fields) throws IOException {
    // Process any pending Term deletes for this newly
    // flushed segment:
    if (state.segUpdates != null && state.segUpdates.deleteTerms.size() > 0) {

      BufferedUpdates.DeletedTerms segDeletes = state.segUpdates.deleteTerms;
      FrozenBufferedUpdates.TermDocsIterator iterator =
          new FrozenBufferedUpdates.TermDocsIterator(fields, true);

      segDeletes.forEachOrdered(
          (term, docId) -> {
            DocIdSetIterator postings = iterator.nextTerm(term.field(), term.bytes());
            if (postings != null) {
              assert docId < PostingsEnum.NO_MORE_DOCS;
              int doc;
              while ((doc = postings.nextDoc()) < docId) {
                if (state.liveDocs == null) {
                  state.liveDocs = new FixedBitSet(state.segmentInfo.maxDoc());
                  state.liveDocs.set(0, state.segmentInfo.maxDoc());
                }
                if (state.liveDocs.getAndClear(doc)) {
                  state.delCountOnFlush++;
                }
              }
            }
          });
    }
  }

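  /**
   * Sorts each flushed field's terms, applies pending deletes, optionally remaps doc IDs through
   * the index-sort {@code sortMap}, and writes the postings through the codec's postings format.
   */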
  @Override
  public void flush(
      Map<String, TermsHashPerField> fieldsToFlush,
      final SegmentWriteState state,
      Sorter.DocMap sortMap,
      NormsProducer norms)
      throws IOException {
    super.flush(fieldsToFlush, state, sortMap, norms);

    // Gather all fields that saw any postings:
    List<FreqProxTermsWriterPerField> allFields = new ArrayList<>();

    for (TermsHashPerField f : fieldsToFlush.values()) {
      final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
      if (perField.getNumTerms() > 0) {
        perField.sortTerms();
        assert perField.indexOptions != IndexOptions.NONE;
        allFields.add(perField);
      }
    }

    if (!state.fieldInfos.hasPostings()) {
      assert allFields.isEmpty();
      return;
    }

    // Sort by field name
    CollectionUtil.introSort(allFields);

    Fields fields = new FreqProxFields(allFields);
    applyDeletes(state, fields);
    if (sortMap != null) {
      final Sorter.DocMap docMap = sortMap;
      final FieldInfos infos = state.fieldInfos;
      fields =
          new FilterLeafReader.FilterFields(fields) {

            @Override
            public Terms terms(final String field) throws IOException {
              Terms terms = in.terms(field);
              if (terms == null) {
                return null;
              } else {
                return new SortingTerms(terms, infos.fieldInfo(field).getIndexOptions(), docMap);
              }
            }
          };
    }

    try (FieldsConsumer consumer =
        state.segmentInfo.getCodec().postingsFormat().fieldsConsumer(state)) {
      consumer.write(fields, norms);
    }
  }

  @Override
  public TermsHashPerField addField(FieldInvertState invertState, FieldInfo fieldInfo) {
    return new FreqProxTermsWriterPerField(
        invertState, this, fieldInfo, nextTermsHash.addField(invertState, fieldInfo));
  }

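  /** Wraps {@link Terms} so that the returned enums remap doc IDs according to the index sort. */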
  static class SortingTerms extends FilterLeafReader.FilterTerms {

    private final Sorter.DocMap docMap;
    private final IndexOptions indexOptions;

    SortingTerms(final Terms in, IndexOptions indexOptions, final Sorter.DocMap docMap) {
      super(in);
      this.docMap = docMap;
      this.indexOptions = indexOptions;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      return new SortingTermsEnum(in.iterator(), docMap, indexOptions);
    }

    @Override
    public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
      return new SortingTermsEnum(in.intersect(compiled, startTerm), docMap, indexOptions);
    }
  }

  private static class SortingTermsEnum extends FilterLeafReader.FilterTermsEnum {

    final Sorter.DocMap docMap; // pkg-protected to avoid synthetic accessor methods
    private final IndexOptions indexOptions;

    SortingTermsEnum(final TermsEnum in, Sorter.DocMap docMap, IndexOptions indexOptions) {
      super(in);
      this.docMap = docMap;
      this.indexOptions = indexOptions;
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, final int flags) throws IOException {

      if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0
          && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS)) {
        final PostingsEnum inReuse;
        final SortingPostingsEnum wrapReuse;
        if (reuse instanceof SortingPostingsEnum) {
          // if we're asked to reuse the given PostingsEnum and it is sorting, pass
          // the wrapped enum down for reuse, since some Codecs expect their own enum.
          wrapReuse = (SortingPostingsEnum) reuse;
          inReuse = wrapReuse.getWrapped();
        } else {
          wrapReuse = new SortingPostingsEnum();
          inReuse = reuse;
        }

        final PostingsEnum inDocsAndPositions = in.postings(inReuse, flags);
        // we ignore the fact that positions/offsets may be stored but not asked for,
        // since this code is expected to be used during addIndexes which will
        // ask for everything. if that assumption changes in the future, we can
        // factor in whether 'flags' says offsets are not required.
        final boolean storePositions =
            indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
        final boolean storeOffsets =
            indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
        wrapReuse.reset(docMap, inDocsAndPositions, storePositions, storeOffsets);
        return wrapReuse;
      }

      final PostingsEnum inReuse;
      final SortingDocsEnum wrapReuse;
      if (reuse instanceof SortingDocsEnum) {
        // if we're asked to reuse the given PostingsEnum and it is sorting, pass
        // the wrapped enum down for reuse, since some Codecs expect their own enum.
        wrapReuse = (SortingDocsEnum) reuse;
        inReuse = wrapReuse.getWrapped();
      } else {
        wrapReuse = new SortingDocsEnum();
        inReuse = reuse;
      }

      final PostingsEnum inDocs = in.postings(inReuse, flags);
      wrapReuse.reset(docMap, inDocs);
      return wrapReuse;
    }
  }

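  /**
   * A docs-only {@link PostingsEnum} that buffers all doc IDs of the wrapped enum, remaps them
   * through the {@link Sorter.DocMap} and radix-sorts them before iteration.
   */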
  static class SortingDocsEnum extends PostingsEnum {

    private final LSBRadixSorter sorter;
    private PostingsEnum in;
    private int[] docs = IntsRef.EMPTY_INTS;
    private int docIt;
    private int upTo;

    SortingDocsEnum() {
      sorter = new LSBRadixSorter();
    }

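    /** Buffers and remaps all doc IDs from {@code in}, appends a sentinel, then sorts them. */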
    void reset(Sorter.DocMap docMap, PostingsEnum in) throws IOException {
      this.in = in;
      int i = 0;
      for (int doc = in.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = in.nextDoc()) {
        if (docs.length <= i) {
          docs = ArrayUtil.grow(docs);
        }
        docs[i++] = docMap.oldToNew(doc);
      }
      upTo = i;
      if (docs.length == upTo) {
        docs = ArrayUtil.grow(docs);
      }
      docs[upTo] = DocIdSetIterator.NO_MORE_DOCS;
      final int maxDoc = docMap.size();
      final int numBits = PackedInts.bitsRequired(Math.max(0, maxDoc - 1));
      // Even though LSBRadixSorter cannot take advantage of partial ordering like TimSorter it is
      // often still faster for nearly-sorted inputs.
      sorter.sort(numBits, docs, upTo);
      docIt = -1;
    }

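    /** Returns the wrapped {@link PostingsEnum}. */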
    PostingsEnum getWrapped() {
      return in;
    }

    @Override
    public int advance(final int target) throws IOException {
      // need to support it for checkIndex, but in practice it won't be called, so
      // don't bother to implement efficiently for now.
      return slowAdvance(target);
    }

    @Override
    public int docID() {
      return docIt < 0 ? -1 : docs[docIt];
    }

    @Override
    public int nextDoc() throws IOException {
      return docs[++docIt];
    }

    @Override
    public long cost() {
      return upTo;
    }

    @Override
    public int freq() throws IOException {
      return 1;
    }

    @Override
    public int nextPosition() throws IOException {
      return -1;
    }

    @Override
    public int startOffset() throws IOException {
      return -1;
    }

    @Override
    public int endOffset() throws IOException {
      return -1;
    }

    @Override
    public BytesRef getPayload() throws IOException {
      return null;
    }
  }

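  /**
   * A {@link PostingsEnum} that serializes the wrapped enum's postings (freqs, positions, offsets,
   * payloads) into an in-memory buffer, then replays them in index-sort doc ID order.
   */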
  static class SortingPostingsEnum extends PostingsEnum {

    /**
     * A {@link TimSorter} which sorts two parallel arrays of doc IDs and offsets in one go. Every
     * time a doc ID is 'swapped', its corresponding offset is swapped too.
     */
    private static final class DocOffsetSorter extends TimSorter {

      private int[] docs;
      private long[] offsets;
      private int[] tmpDocs;
      private long[] tmpOffsets;

      public DocOffsetSorter(int numTempSlots) {
        super(numTempSlots);
        this.tmpDocs = IntsRef.EMPTY_INTS;
        this.tmpOffsets = LongsRef.EMPTY_LONGS;
      }

      public void reset(int[] docs, long[] offsets) {
        this.docs = docs;
        this.offsets = offsets;
      }

      @Override
      protected int compare(int i, int j) {
        return docs[i] - docs[j];
      }

      @Override
      protected void swap(int i, int j) {
        int tmpDoc = docs[i];
        docs[i] = docs[j];
        docs[j] = tmpDoc;

        long tmpOffset = offsets[i];
        offsets[i] = offsets[j];
        offsets[j] = tmpOffset;
      }

      @Override
      protected void copy(int src, int dest) {
        docs[dest] = docs[src];
        offsets[dest] = offsets[src];
      }

      @Override
      protected void save(int i, int len) {
        if (tmpDocs.length < len) {
          tmpDocs = new int[ArrayUtil.oversize(len, Integer.BYTES)];
          tmpOffsets = new long[tmpDocs.length];
        }
        System.arraycopy(docs, i, tmpDocs, 0, len);
        System.arraycopy(offsets, i, tmpOffsets, 0, len);
      }

      @Override
      protected void restore(int i, int j) {
        docs[j] = tmpDocs[i];
        offsets[j] = tmpOffsets[i];
      }

      @Override
      protected int compareSaved(int i, int j) {
        return tmpDocs[i] - docs[j];
      }
    }

    private DocOffsetSorter sorter;
    private int[] docs = IntsRef.EMPTY_INTS;
    private long[] offsets = LongsRef.EMPTY_LONGS;
    private int upto;

    private ByteBuffersDataInput postingInput;
    private PostingsEnum in;
    private boolean storePositions, storeOffsets;

    private int docIt;
    private int pos;
    private int startOffset;
    private int endOffset;
    private final BytesRef payload = new BytesRef();
    private int currFreq;

    private final ByteBuffersDataOutput buffer = ByteBuffersDataOutput.newResettableInstance();

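    /**
     * Consumes {@code in} entirely: for each doc, records the remapped doc ID and the byte offset
     * of its encoded postings in {@code buffer}, then sorts the two parallel arrays by doc ID.
     */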
    void reset(Sorter.DocMap docMap, PostingsEnum in, boolean storePositions, boolean storeOffsets)
        throws IOException {
      this.in = in;
      this.storePositions = storePositions;
      this.storeOffsets = storeOffsets;
      if (sorter == null) {
        final int numTempSlots = docMap.size() / 8;
        sorter = new DocOffsetSorter(numTempSlots);
      }
      docIt = -1;
      startOffset = -1;
      endOffset = -1;

      buffer.reset();
      int doc;
      int i = 0;
      while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (i == docs.length) {
          final int newLength = ArrayUtil.oversize(i + 1, 4);
          docs = ArrayUtil.growExact(docs, newLength);
          offsets = ArrayUtil.growExact(offsets, newLength);
        }
        docs[i] = docMap.oldToNew(doc);
        offsets[i] = buffer.size();
        addPositions(in, buffer);
        i++;
      }
      upto = i;
      sorter.reset(docs, offsets);
      sorter.sort(0, upto);

      this.postingInput = buffer.toDataInput();
    }

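    /**
     * Encodes one document's postings: a vInt freq, then per position a token whose upper bits
     * hold the position delta and whose low bit flags a payload, optionally followed by
     * delta-encoded offsets and the payload bytes.
     */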
    private void addPositions(final PostingsEnum in, final DataOutput out) throws IOException {
      int freq = in.freq();
      out.writeVInt(freq);
      if (storePositions) {
        int previousPosition = 0;
        int previousEndOffset = 0;
        for (int i = 0; i < freq; i++) {
          final int pos = in.nextPosition();
          final BytesRef payload = in.getPayload();
          // The low-order bit of token is set only if there is a payload, the
          // previous bits are the delta-encoded position.
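          // e.g. a position delta of 5 with a payload encodes as (5 << 1) | 1 = 11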
          final int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
          out.writeVInt(token);
          previousPosition = pos;
          if (storeOffsets) { // don't encode offsets if they are not stored
            final int startOffset = in.startOffset();
            final int endOffset = in.endOffset();
            out.writeVInt(startOffset - previousEndOffset);
            out.writeVInt(endOffset - startOffset);
            previousEndOffset = endOffset;
          }
          if (payload != null) {
            out.writeVInt(payload.length);
            out.writeBytes(payload.bytes, payload.offset, payload.length);
          }
        }
      }
    }

    @Override
    public int advance(final int target) throws IOException {
      // need to support it for checkIndex, but in practice it won't be called, so
      // don't bother to implement efficiently for now.
      return slowAdvance(target);
    }

    @Override
    public int docID() {
      return docIt < 0 ? -1 : docIt >= upto ? NO_MORE_DOCS : docs[docIt];
    }

    @Override
    public int endOffset() throws IOException {
      return endOffset;
    }

    @Override
    public int freq() throws IOException {
      return currFreq;
    }

    @Override
    public BytesRef getPayload() throws IOException {
      return payload.length == 0 ? null : payload;
    }

    @Override
    public int nextDoc() throws IOException {
      if (++docIt >= upto) return DocIdSetIterator.NO_MORE_DOCS;
      postingInput.seek(offsets[docIt]);
      currFreq = postingInput.readVInt();
      // reset variables used in nextPosition
      pos = 0;
      endOffset = 0;
      return docs[docIt];
    }

    @Override
    public int nextPosition() throws IOException {
      if (storePositions == false) {
        return -1;
      }
      final int token = postingInput.readVInt();
      pos += token >>> 1;
      if (storeOffsets) {
        startOffset = endOffset + postingInput.readVInt();
        endOffset = startOffset + postingInput.readVInt();
      }
      if ((token & 1) != 0) {
        payload.offset = 0;
        payload.length = postingInput.readVInt();
        if (payload.length > payload.bytes.length) {
          payload.bytes = new byte[ArrayUtil.oversize(payload.length, 1)];
        }
        postingInput.readBytes(payload.bytes, 0, payload.length);
      } else {
        payload.length = 0;
      }
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return startOffset;
    }

    /** Returns the wrapped {@link PostingsEnum}. */
    PostingsEnum getWrapped() {
      return in;
    }

    @Override
    public long cost() {
      return in.cost();
    }
  }
}