com.yelp.nrtsearch.server.handler.AddDocumentHandler
/*
 * Copyright 2020 Yelp Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.yelp.nrtsearch.server.handler;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.protobuf.ProtocolStringList;
import com.yelp.nrtsearch.server.field.FieldDef;
import com.yelp.nrtsearch.server.field.IdFieldDef;
import com.yelp.nrtsearch.server.field.IndexableFieldDef;
import com.yelp.nrtsearch.server.grpc.AddDocumentRequest;
import com.yelp.nrtsearch.server.grpc.AddDocumentResponse;
import com.yelp.nrtsearch.server.grpc.DeadlineUtils;
import com.yelp.nrtsearch.server.grpc.FacetHierarchyPath;
import com.yelp.nrtsearch.server.index.IndexState;
import com.yelp.nrtsearch.server.index.ShardState;
import com.yelp.nrtsearch.server.state.GlobalState;
import io.grpc.Context;
import io.grpc.Status;
import io.grpc.stub.StreamObserver;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.RejectedExecutionException;
import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AddDocumentHandler extends Handler<AddDocumentRequest, AddDocumentResponse> {
  private static final Logger logger = LoggerFactory.getLogger(AddDocumentHandler.class);

  public AddDocumentHandler(GlobalState globalState) {
    super(globalState);
  }

  @Override
  public StreamObserver<AddDocumentRequest> handle(
      StreamObserver<AddDocumentResponse> responseObserver) {
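    // Requests are buffered per index and flushed to the indexing threadpool in chunks once the
    // configured buffer length is reached; the final partial chunk is indexed on completion.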
    return new StreamObserver<>() {
      Multimap<String, Future<Long>> futures = HashMultimap.create();
      // Map of {indexName: addDocumentRequestQueue}
      Map<String, ArrayBlockingQueue<AddDocumentRequest>> addDocumentRequestQueueMap =
          new ConcurrentHashMap<>();
      // Map of {indexName: count}
      Map<String, Long> countMap = new ConcurrentHashMap<>();

      private int getAddDocumentsMaxBufferLen(String indexName) {
        try {
          return getGlobalState().getIndex(indexName).getAddDocumentsMaxBufferLen();
        } catch (Exception e) {
          String error =
              String.format("Index %s does not exist, unable to add documents", indexName);
          logger.error(error, e);
          throw Status.INVALID_ARGUMENT.withDescription(error).withCause(e).asRuntimeException();
        }
      }

      private ArrayBlockingQueue<AddDocumentRequest> getAddDocumentRequestQueue(String indexName) {
        if (addDocumentRequestQueueMap.containsKey(indexName)) {
          return addDocumentRequestQueueMap.get(indexName);
        } else {
          int addDocumentsMaxBufferLen = getAddDocumentsMaxBufferLen(indexName);
          ArrayBlockingQueue<AddDocumentRequest> addDocumentRequestQueue =
              new ArrayBlockingQueue<>(addDocumentsMaxBufferLen);
          addDocumentRequestQueueMap.put(indexName, addDocumentRequestQueue);
          return addDocumentRequestQueue;
        }
      }

      private long getCount(String indexName) {
        return countMap.getOrDefault(indexName, 0L);
      }

      private void incrementCount(String indexName) {
        if (countMap.containsKey(indexName)) {
          countMap.put(indexName, countMap.get(indexName) + 1);
        } else {
          countMap.put(indexName, 1L);
        }
      }

      @Override
      public void onNext(AddDocumentRequest addDocumentRequest) {
        String indexName = addDocumentRequest.getIndexName();
        ArrayBlockingQueue<AddDocumentRequest> addDocumentRequestQueue;
        try {
          addDocumentRequestQueue = getAddDocumentRequestQueue(indexName);
        } catch (Exception e) {
          onError(e);
          return;
        }
        logger.debug(
            String.format(
                "onNext, index: %s, addDocumentRequestQueue size: %s",
                indexName, addDocumentRequestQueue.size()));
        incrementCount(indexName);
        addDocumentRequestQueue.add(addDocumentRequest);
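        // Once the per-index buffer is full, submit the buffered requests as an async indexing
        // task and clear the buffer for the next chunk.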
        if (addDocumentRequestQueue.remainingCapacity() == 0) {
          logger.debug(
              String.format(
                  "indexing addDocumentRequestQueue size: %s, total: %s",
                  addDocumentRequestQueue.size(), getCount(indexName)));
          try {
            DeadlineUtils.checkDeadline("addDocuments: onNext", "INDEXING");

            List<AddDocumentRequest> addDocRequestList = new ArrayList<>(addDocumentRequestQueue);
            Future<Long> future =
                getGlobalState()
                    .submitIndexingTask(
                        Context.current()
                            .wrap(
                                new DocumentIndexer(
                                    getGlobalState(), addDocRequestList, indexName)));
            futures.put(indexName, future);
          } catch (Exception e) {
            responseObserver.onError(e);
          } finally {
            addDocumentRequestQueue.clear();
          }
        }
      }

      @Override
      public void onError(Throwable t) {
        logger.warn("addDocuments Cancelled", t);
        responseObserver.onError(t);
      }

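      /**
       * Indexes any documents still buffered for this index, waits for all previously submitted
       * indexing chunks to complete, and returns the highest completed sequence number as a
       * string.
       */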
      private String onCompletedForIndex(String indexName) {
        ArrayBlockingQueue<AddDocumentRequest> addDocumentRequestQueue =
            getAddDocumentRequestQueue(indexName);
        logger.debug(
            String.format(
                "onCompleted, addDocumentRequestQueue: %s", addDocumentRequestQueue.size()));
        long highestGen = -1;
        try {
          DeadlineUtils.checkDeadline("addDocuments: onCompletedForIndex", "INDEXING");

          // index the left over docs
          if (!addDocumentRequestQueue.isEmpty()) {
            logger.debug(
                String.format(
                    "indexing left over addDocumentRequestQueue of size: %s",
                    addDocumentRequestQueue.size()));
            List<AddDocumentRequest> addDocRequestList = new ArrayList<>(addDocumentRequestQueue);
            // Since we are already running in the indexing threadpool, run the indexing job for
            // the remaining documents directly. This serializes indexing remaining documents for
            // multiple indices, but avoids deadlocking if there aren't more threads than the
            // maximum number of parallel addDocuments calls.
            long gen =
                new DocumentIndexer(getGlobalState(), addDocRequestList, indexName)
                    .runIndexingJob();
            if (gen > highestGen) {
              highestGen = gen;
            }
          }
          // collect futures, block if needed
          int numIndexingChunks = futures.size();
          long t0 = System.nanoTime();
          for (Future<Long> result : futures.get(indexName)) {
            Long gen = result.get();
            logger.debug("Indexing returned sequence-number {}", gen);
            if (gen > highestGen) {
              highestGen = gen;
            }
          }
          long t1 = System.nanoTime();
          logger.debug(
              "Indexing job completed for {} docs, in {} chunks, with latest sequence number: {}, took: {} micro seconds",
              getCount(indexName),
              numIndexingChunks,
              highestGen,
              ((t1 - t0) / 1000));
          return String.valueOf(highestGen);
        } catch (Exception e) {
          logger.warn("error while trying to addDocuments", e);
          throw Status.INTERNAL
              .withDescription("error while trying to addDocuments ")
              .augmentDescription(e.getMessage())
              .withCause(e)
              .asRuntimeException();
        } finally {
          addDocumentRequestQueue.clear();
          countMap.put(indexName, 0L);
        }
      }

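      // The completion work is submitted to the indexing threadpool so that any remaining
      // buffered documents are flushed before the response with the final genId is sent back.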
      @Override
      public void onCompleted() {
        try {
          getGlobalState()
              .submitIndexingTask(
                  Context.current()
                      .wrap(
                          () -> {
                            try {
                              // TODO: this should return a map of index name to genId in the response
                              String genId = "-1";
                              for (String indexName : addDocumentRequestQueueMap.keySet()) {
                                genId = onCompletedForIndex(indexName);
                              }
                              responseObserver.onNext(
                                  AddDocumentResponse.newBuilder()
                                      .setGenId(genId)
                                      .setPrimaryId(getGlobalState().getEphemeralId())
                                      .build());
                              responseObserver.onCompleted();
                            } catch (Throwable t) {
                              responseObserver.onError(t);
                            }
                            return null;
                          }));
        } catch (RejectedExecutionException e) {
          logger.error("Threadpool is full, unable to submit indexing completion job");
          responseObserver.onError(
              Status.RESOURCE_EXHAUSTED
                  .withDescription("Threadpool is full, unable to submit indexing completion job")
                  .augmentDescription(e.getMessage())
                  .asRuntimeException());
        }
      }
    };
  }

  /**
   * DocumentsContext is created for each gRPC AddDocumentRequest. It holds all Lucene document
   * context for the AddDocumentRequest, including the root document and optional child documents
   * if the schema contains nested objects.
   */
  public static class DocumentsContext {
    private final Document rootDocument;
    private final Map<String, List<Document>> childDocuments;

    public DocumentsContext() {
      this.rootDocument = new Document();
      this.childDocuments = new HashMap<>();
    }

    public Document getRootDocument() {
      return rootDocument;
    }

    public Map<String, List<Document>> getChildDocuments() {
      return childDocuments;
    }

    public void addChildDocuments(String key, List<Document> documents) {
      childDocuments.put(key, documents);
    }

    public boolean hasNested() {
      return !childDocuments.isEmpty();
    }
  }

  public static class LuceneDocumentBuilder {

    public static DocumentsContext getDocumentsContext(
        AddDocumentRequest addDocumentRequest, IndexState indexState)
        throws AddDocumentHandlerException {
      DocumentsContext documentsContext = new DocumentsContext();
      Map<String, AddDocumentRequest.MultiValuedField> fields = addDocumentRequest.getFieldsMap();
      for (Map.Entry<String, AddDocumentRequest.MultiValuedField> entry : fields.entrySet()) {
        parseOneField(entry.getKey(), entry.getValue(), documentsContext, indexState);
      }

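      // Mark the root document with the nested path meta-field so it can be distinguished from
      // nested child documents.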
      ((IndexableFieldDef) (IndexState.getMetaField(IndexState.NESTED_PATH)))
          .parseDocumentField(
              documentsContext.getRootDocument(), List.of(IndexState.ROOT), List.of());

      // Include all fields and meta-fields in field names
      extractFieldNames(documentsContext);
      return documentsContext;
    }

    /** Extracts all field names for each document and stores them in a hidden field. */
    private static void extractFieldNames(DocumentsContext documentsContext) {
      extractFieldNamesForDocument(documentsContext.getRootDocument());
      documentsContext.getChildDocuments().values().stream()
          .flatMap(List::stream)
          .forEach(LuceneDocumentBuilder::extractFieldNamesForDocument);
    }

    /** Extracts all field names in the document and stores them in a hidden field. */
    private static void extractFieldNamesForDocument(Document document) {
      IndexableFieldDef fieldNamesFieldDef =
          (IndexableFieldDef) IndexState.getMetaField(IndexState.FIELD_NAMES);

      List<String> fieldNames =
          document.getFields().stream()
              .map(IndexableField::name)
              .distinct()
              .collect(Collectors.toList());

      fieldNamesFieldDef.parseDocumentField(document, fieldNames, List.of());
    }

    /** Parses a field's value, which is a MultiValuedField in all cases */
    private static void parseOneField(
        String fieldName,
        AddDocumentRequest.MultiValuedField value,
        DocumentsContext documentsContext,
        IndexState indexState)
        throws AddDocumentHandlerException {
      parseMultiValueField(indexState.getField(fieldName), value, documentsContext);
    }

    /** Parse MultiValuedField for a single field, which is always a List. */
    private static void parseMultiValueField(
        FieldDef field,
        AddDocumentRequest.MultiValuedField value,
        DocumentsContext documentsContext)
        throws AddDocumentHandlerException {
      ProtocolStringList fieldValues = value.getValueList();
      List<FacetHierarchyPath> facetHierarchyPaths = value.getFaceHierarchyPathsList();
      List<List<String>> facetHierarchyPathValues =
          facetHierarchyPaths.stream().map(fp -> fp.getValueList()).collect(Collectors.toList());
      if (!facetHierarchyPathValues.isEmpty()) {
        if (facetHierarchyPathValues.size() != fieldValues.size()) {
          throw new AddDocumentHandlerException(
              String.format(
                  "Field: %s, fieldValues.size(): %s !=  "
                      + "facetHierarchyPaths.size(): %s, must have same list size for "
                      + "fieldValues and facetHierarchyPaths",
                  field.getName(), fieldValues.size(), facetHierarchyPathValues.size()));
        }
      }
      if (!(field instanceof IndexableFieldDef)) {
        throw new AddDocumentHandlerException(
            String.format("Field: %s is not indexable", field.getName()));
      }
      IndexableFieldDef indexableFieldDef = (IndexableFieldDef) field;
      indexableFieldDef.parseFieldWithChildren(
          documentsContext, fieldValues, facetHierarchyPathValues);
    }
  }

  public static class AddDocumentHandlerException extends Handler.HandlerException {
    public AddDocumentHandlerException(String errorMessage) {
      super(errorMessage);
    }

    public AddDocumentHandlerException(String errorMessage, Throwable err) {
      super(errorMessage, err);
    }

    public AddDocumentHandlerException(Throwable err) {
      super(err);
    }
  }

  public static class DocumentIndexer implements Callable<Long> {
    private final GlobalState globalState;
    private final List<AddDocumentRequest> addDocumentRequestList;
    private final String indexName;

    public DocumentIndexer(
        GlobalState globalState,
        List<AddDocumentRequest> addDocumentRequestList,
        String indexName) {
      this.globalState = globalState;
      this.addDocumentRequestList = addDocumentRequestList;
      this.indexName = indexName;
    }

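    /**
     * Parses each buffered AddDocumentRequest into Lucene documents and writes them to the shard's
     * IndexWriter. Plain documents are collected in a queue and written in order; a request with
     * nested child documents first flushes the queue, then writes its document block. Returns the
     * writer's max completed sequence number after all documents have been written.
     */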
    public long runIndexingJob() throws Exception {
      DeadlineUtils.checkDeadline("DocumentIndexer: runIndexingJob", "INDEXING");

      logger.debug(
          String.format(
              "running indexing job on threadId: %s",
              Thread.currentThread().getName() + Thread.currentThread().getId()));
      Queue<Document> documents = new LinkedBlockingDeque<>();
      IndexState indexState;
      ShardState shardState;
      IdFieldDef idFieldDef;

      try {
        indexState = globalState.getIndex(this.indexName);
        shardState = indexState.getShard(0);
        idFieldDef = indexState.getIdFieldDef().orElse(null);
        for (AddDocumentRequest addDocumentRequest : addDocumentRequestList) {
          DocumentsContext documentsContext =
              AddDocumentHandler.LuceneDocumentBuilder.getDocumentsContext(
                  addDocumentRequest, indexState);
          if (documentsContext.hasNested()) {
            try {
              if (idFieldDef != null) {
                // update documents in the queue to keep order
                updateDocuments(documents, idFieldDef, indexState, shardState);
                updateNestedDocuments(documentsContext, idFieldDef, indexState, shardState);
              } else {
                // add documents in the queue to keep order
                addDocuments(documents, indexState, shardState);
                addNestedDocuments(documentsContext, indexState, shardState);
              }
              documents.clear();
            } catch (IOException e) {
              // This exception should be caught in the parent, which sets
              // responseObserver.onError(e) so the client knows the job failed
              logger.warn(
                  String.format(
                      "ThreadId: %s, IndexWriter.addDocuments failed",
                      Thread.currentThread().getName() + Thread.currentThread().getId()));
              throw new IOException(e);
            }
          } else {
            documents.add(documentsContext.getRootDocument());
          }
        }
      } catch (Exception e) {
        logger.warn("addDocuments Cancelled", e);
        throw e; // parent thread should catch and send error back to client
      }

      try {
        if (idFieldDef != null) {
          updateDocuments(documents, idFieldDef, indexState, shardState);
        } else {
          addDocuments(documents, indexState, shardState);
        }
      } catch (IOException e) {
        // This exception should be caught in the parent, which sets
        // responseObserver.onError(e) so the client knows the job failed
        logger.warn(
            String.format(
                "ThreadId: %s, IndexWriter.addDocuments failed",
                Thread.currentThread().getName() + Thread.currentThread().getId()));
        throw new IOException(e);
      }
      logger.debug(
          String.format(
              "indexing job on threadId: %s done with SequenceId: %s",
              Thread.currentThread().getName() + Thread.currentThread().getId(),
              shardState.writer.getMaxCompletedSequenceNumber()));
      return shardState.writer.getMaxCompletedSequenceNumber();
    }

    /**
     * Updates documents that have nested child objects. The child documents are indexed in the
     * same block as the root document.
     *
     * @param documentsContext documents context holding the root and child documents
     * @param idFieldDef id field definition for the index
     * @param indexState index state
     * @param shardState shard state
     * @throws IOException on error writing to the index
     */
    private void updateNestedDocuments(
        DocumentsContext documentsContext,
        IdFieldDef idFieldDef,
        IndexState indexState,
        ShardState shardState)
        throws IOException {
      List<Document> documents = new ArrayList<>();
      for (Map.Entry<String, List<Document>> e : documentsContext.getChildDocuments().entrySet()) {
        documents.addAll(
            e.getValue().stream()
                .map(v -> handleFacets(indexState, shardState, v))
                .collect(Collectors.toList()));
      }
      Document rootDoc = handleFacets(indexState, shardState, documentsContext.getRootDocument());

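      // Copy the root document's id field onto each child so that a later update with the same id
      // term replaces the child documents along with the root.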
      for (Document doc : documents) {
        for (IndexableField f : rootDoc.getFields(idFieldDef.getName())) {
          doc.add(f);
        }
      }

      documents.add(rootDoc);
      shardState.writer.updateDocuments(idFieldDef.getTerm(rootDoc), documents);
    }

    /**
     * Adds documents that have nested child objects. The child documents are indexed in the same
     * block as the root document.
     *
     * @param documentsContext documents context holding the root and child documents
     * @param indexState index state
     * @param shardState shard state
     * @throws IOException on error writing to the index
     */
    private void addNestedDocuments(
        DocumentsContext documentsContext, IndexState indexState, ShardState shardState)
        throws IOException {
      List<Document> documents = new ArrayList<>();
      for (Map.Entry<String, List<Document>> e : documentsContext.getChildDocuments().entrySet()) {
        documents.addAll(
            e.getValue().stream()
                .map(v -> handleFacets(indexState, shardState, v))
                .collect(Collectors.toList()));
      }
      Document rootDoc = handleFacets(indexState, shardState, documentsContext.getRootDocument());
      documents.add(rootDoc);
      shardState.writer.addDocuments(documents);
    }

    private void updateDocuments(
        Queue<Document> documents,
        IdFieldDef idFieldDef,
        IndexState indexState,
        ShardState shardState)
        throws IOException {
      for (Document nextDoc : documents) {
        nextDoc = handleFacets(indexState, shardState, nextDoc);
        shardState.writer.updateDocument(idFieldDef.getTerm(nextDoc), nextDoc);
      }
    }

    private void addDocuments(
        Queue<Document> documents, IndexState indexState, ShardState shardState)
        throws IOException {
      if (shardState.isReplica()) {
        throw new IllegalStateException(
            "Adding documents to an index on a replica node is not supported");
      }
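      // Documents are handed to the writer through a lazy Iterable so facet processing happens as
      // each document is polled from the queue, rather than materializing a second list up front.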
      shardState.writer.addDocuments(
          (Iterable<Document>)
              () ->
                  new Iterator<>() {
                    private Document nextDoc;

                    @Override
                    public boolean hasNext() {
                      if (!documents.isEmpty()) {
                        nextDoc = documents.poll();
                        nextDoc = handleFacets(indexState, shardState, nextDoc);
                        return true;
                      } else {
                        nextDoc = null;
                        return false;
                      }
                    }

                    @Override
                    public Document next() {
                      return nextDoc;
                    }
                  });
    }

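    /**
     * If the index has facet fields, runs the document through the index's FacetsConfig, which may
     * rewrite facet fields and record ordinals in the shard's taxonomy writer.
     */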
    private Document handleFacets(IndexState indexState, ShardState shardState, Document nextDoc) {
      final boolean hasFacets = indexState.hasFacets();
      if (hasFacets) {
        try {
          nextDoc = indexState.getFacetsConfig().build(shardState.taxoWriter, nextDoc);
        } catch (IOException ioe) {
          throw new RuntimeException(
              String.format("document: %s hit exception building facets", nextDoc), ioe);
        }
      }
      return nextDoc;
    }

    @Override
    public Long call() throws Exception {
      return runIndexingJob();
    }
  }
}