// Source listing: org.apache.lucene.search.suggest.document.CompletionFieldsConsumer
// Artifact: lucene-suggest -- Apache Lucene (module: suggest); available via Maven / Gradle / Ivy.
// Note: there is a newer version of this artifact: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

/**
 * Weighted FSTs for any indexed {@link SuggestField} are built on {@link
 * #write(Fields,NormsProducer)}. A weighted FST maps the analyzed forms of a field to its surface
 * form and document id. FSTs are stored in the CompletionDictionary (.lkp).
 *
 * <p>The file offsets of a field's FST are stored in the CompletionIndex (.cmp) along with the
 * field's internal number {@link FieldInfo#number} on {@link #close()}.
 */
final class CompletionFieldsConsumer extends FieldsConsumer {

  private final String delegatePostingsFormatName;
  /** Per-field metadata, keyed by field name, for every field that had an FST stored. */
  private final Map<String, CompletionMetaData> seenFields = new HashMap<>();

  private final SegmentWriteState state;
  private IndexOutput dictOut;
  private FieldsConsumer delegateFieldsConsumer;
  private final String codecName;
  /** Set on the first {@link #close()} call so that later calls are no-ops. */
  private boolean closed = false;

  /**
   * Opens the delegate consumer and the dictionary output, and writes the dictionary header.
   *
   * @param codecName codec name written into the index headers of both files
   * @param delegatePostingsFormat postings format that receives the regular postings
   * @param state write state of the segment being flushed or merged
   * @throws IOException if the delegate or the dictionary output cannot be created; anything
   *     opened so far is closed before the exception propagates
   */
  CompletionFieldsConsumer(
      String codecName, PostingsFormat delegatePostingsFormat, SegmentWriteState state)
      throws IOException {
    this.codecName = codecName;
    this.delegatePostingsFormatName = delegatePostingsFormat.getName();
    this.state = state;
    String dictFile =
        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
    boolean success = false;
    try {
      this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
      dictOut = state.directory.createOutput(dictFile, state.context);
      CodecUtil.writeIndexHeader(
          dictOut,
          codecName,
          COMPLETION_VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      success = true;
    } finally {
      if (success == false) {
        // do not leak partially opened resources when construction fails
        IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
      }
    }
  }

  /**
   * Forwards the postings to the delegate consumer, then builds one lookup structure per field
   * and appends it to the dictionary output. Fields for which something was stored are
   * remembered, together with their start offset in the dictionary, for {@link #close()}.
   */
  @Override
  public void write(Fields fields, NormsProducer norms) throws IOException {
    delegateFieldsConsumer.write(fields, norms);

    for (String field : fields) {
      Terms terms = fields.terms(field);
      if (terms == null) {
        // this can happen from ghost fields, where the incoming Fields iterator claims a field
        // exists but it does not
        continue;
      }
      // only allocate a writer once we know the field really has terms
      CompletionTermWriter termWriter = new CompletionTermWriter();
      TermsEnum termsEnum = terms.iterator();

      // write terms
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        termWriter.write(term, termsEnum);
      }

      // store lookup, if needed; the offset must be captured before finish() writes anything
      long filePointer = dictOut.getFilePointer();
      if (termWriter.finish(dictOut)) {
        seenFields.put(
            field,
            new CompletionMetaData(
                filePointer, termWriter.minWeight, termWriter.maxWeight, termWriter.type));
      }
    }
  }

  /**
   * Closes the delegate consumer, writes the index file (header, delegate postings format name,
   * one metadata record per seen field, footer), then writes the dictionary footer and closes the
   * dictionary output. Calls after the first one return immediately.
   */
  @Override
  public void close() throws IOException {
    if (closed) {
      return;
    }
    closed = true;
    String indexFile =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
    boolean success = false;
    try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
      delegateFieldsConsumer.close();
      CodecUtil.writeIndexHeader(
          indexOut,
          codecName,
          COMPLETION_VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      /*
       * we write the delegate postings format name so we can load it
       * without getting an instance in the ctor
       */
      indexOut.writeString(delegatePostingsFormatName);
      // write # of seen fields
      indexOut.writeVInt(seenFields.size());
      // write field numbers and dictOut offsets
      for (Map.Entry<String, CompletionMetaData> seenField : seenFields.entrySet()) {
        FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
        indexOut.writeVInt(fieldInfo.number);
        CompletionMetaData metaData = seenField.getValue();
        indexOut.writeVLong(metaData.filePointer);
        indexOut.writeVLong(metaData.minWeight);
        indexOut.writeVLong(metaData.maxWeight);
        indexOut.writeByte(metaData.type);
      }
      CodecUtil.writeFooter(indexOut);
      CodecUtil.writeFooter(dictOut);
      IOUtils.close(dictOut);
      success = true;
    } finally {
      if (success == false) {
        // best-effort cleanup; the original exception keeps propagating
        IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
      }
    }
  }

  /** Immutable per-field record written to the index file by {@link #close()}. */
  private static class CompletionMetaData {
    /** Dictionary file pointer captured just before the field's lookup was stored. */
    private final long filePointer;

    private final long minWeight;
    private final long maxWeight;
    /** Type byte read from the field's payloads. */
    private final byte type;

    private CompletionMetaData(long filePointer, long minWeight, long maxWeight, byte type) {
      this.filePointer = filePointer;
      this.minWeight = minWeight;
      this.maxWeight = maxWeight;
      this.type = type;
    }
  }

  // builds an FST based on the terms written
  private static class CompletionTermWriter {

    private PostingsEnum postingsEnum = null;
    // Stays 0 until a document is visited; only ever compared against 0 in finish().
    private int docCount = 0;
    private long maxWeight = 0;
    private long minWeight = Long.MAX_VALUE;
    private byte type;
    // true until the first payload has been decoded and its type recorded
    private boolean first;

    private final BytesRefBuilder scratch = new BytesRefBuilder();
    private final NRTSuggesterBuilder builder;

    public CompletionTermWriter() throws IOException {
      builder = new NRTSuggesterBuilder();
      first = true;
    }

    /**
     * Stores the built FST in {@code output}.
     *
     * @return {@code true} if there was anything stored, {@code false} otherwise
     */
    public boolean finish(IndexOutput output) throws IOException {
      boolean stored = builder.store(output);
      assert stored || docCount == 0
          : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
      if (docCount == 0) {
        // no document was visited, so replace the Long.MAX_VALUE sentinel
        minWeight = 0;
      }
      return stored;
    }

    /**
     * Writes all postings (surface form, weight, document id) for {@code term}.
     *
     * @throws IllegalArgumentException if a payload's type byte differs from the first one seen
     *     by this writer
     */
    public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.PAYLOADS);
      builder.startTerm(term);
      int docFreq = 0;
      while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int docID = postingsEnum.docID();
        for (int i = 0; i < postingsEnum.freq(); i++) {
          postingsEnum.nextPosition();
          assert postingsEnum.getPayload() != null;
          BytesRef payload = postingsEnum.getPayload();
          ByteArrayDataInput input =
              new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
          // payload is decoded as: vInt length, that many bytes, vInt (weight + 1), type byte
          int len = input.readVInt();
          scratch.growNoCopy(len);
          scratch.setLength(len);
          input.readBytes(scratch.bytes(), 0, scratch.length());
          long weight = input.readVInt() - 1;
          maxWeight = Math.max(maxWeight, weight);
          minWeight = Math.min(minWeight, weight);
          byte entryType = input.readByte();
          if (first) {
            this.type = entryType;
            first = false;
          } else if (this.type != entryType) {
            throw new IllegalArgumentException("single field name has mixed types");
          }
          builder.addEntry(docID, scratch.get(), weight);
        }
        docFreq++;
        docCount = Math.max(docCount, docFreq + 1);
      }
      builder.finishTerm();
    }
  }
}