org.apache.drill.exec.physical.resultSet.ResultSetCopier Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.resultSet;

import org.apache.drill.exec.physical.impl.aggregate.BatchIterator;
import org.apache.drill.exec.record.VectorContainer;

/**
 * Copies rows from an input batch to an output batch. The input
 * batch is assumed to have a selection vector, or the caller
 * will pick the rows to copy.
 * 
 * Works to create full output batches to minimize per-batch
 * overhead and to eliminate unnecessary empty batches if no
 * rows are copied.
 * 

 * The output batches are assumed to have the same schema as
 * input batches. (No projection occurs.) The output schema will
 * change each time the input schema changes. (For an SV4, then
 * the upstream operator must have ensured all batches covered
 * by the SV4 have the same schema.)
 * 

 * This implementation works with a single stream of batches which,
 * following Drill's rules, must consist of the same set of vectors on
 * each non-schema-change batch.
 *
 * 
Protocol
 *
 * Overall lifecycle:
 * 
 * Create an instance of the
 *     {@link org.apache.drill.exec.physical.resultSet.impl.ResultSetCopierImpl
 *      ResultSetCopierImpl} class, passing the input row set reader
 *      to the constructor.
 * Loop to process each output batch as shown below. That is, continually
 *     process calls to the {@link BatchIterator#next()} method.
 * Call {@link #close()}.
 * 
 * 
 *
 * To build each output batch:
 *
 * 

 * public IterOutcome next() {
 *   copier.startOutputBatch();
 *   while (!copier.isFull() {
 *     IterOutcome innerResult = inner.next();
 *     if (innerResult == DONE) { break; }
 *     copier.startInputBatch();
 *     copier.copyAllRows();
 *   }
 *   if (copier.hasRows()) {
 *     outputContainer = copier.harvest();
 *     return outputContainer.isSchemaChanged() ? OK_NEW_SCHEMA ? OK;
 *   } else { return DONE; }
 * }
 * 
 * 
 * The above assumes that the upstream operator can be polled
 * multiple times in the DONE state. The extra polling is
 * needed to handle any in-flight copies when the input
 * exhausts its batches.
 * 

 * The above also shows that the copier handles and reports
 * schema changes by setting the schema change flag in the
 * output container. Real code must handle multiple calls to
 * next() in the DONE state, and work around lack of such support
 * in its input (perhaps by tracking a state.)
 * 

 * An input batch is processed by copying the rows. Copying can be done
 * row-by row, via a row range, or by copying the entire input batch as
 * shown in the example.
 * Copying the entire batch make sense when the input batch carries as
 * selection vector that identifies which rows to copy, in which
 * order.
 * 

 * Because we wish to fill the output batch, we may be able to copy
 * part of a batch, the whole batch, or multiple batches to the output.
 */
public interface ResultSetCopier {

  /**
   * Start the next output batch.
   */
  void startOutputBatch();

  /**
   * Start the next input batch. The input batch must be held
   * by the {@code ResultSetReader} passed into the constructor.
   */
  boolean nextInputBatch();

  /**
   * If copying rows one by one, copy the next row from the
   * input.
   *
   * @return true if more rows remain on the input, false
   * if all rows are exhausted
   */
  boolean copyNextRow();

  /**
   * Copy a row at the given position. For those cases in
   * which random copying is needed, but a selection vector
   * is not available. Note that this version is slow because
   * of the need to reset indexes for every row. Better to
   * use a selection vector, then copy sequentially.
   *
   * @param inputRowIndex the input row position. If a selection vector
   * is attached, then this is the selection vector position
   */
  void copyRow(int inputRowIndex);

  /**
   * Copy all (remaining) input rows to the output.
   * If insufficient space exists in the output, does a partial
   * copy, and {@link #isCopyPending()} will return true.
   */
  void copyAllRows();

  /**
   * Reports if the output batch has rows. Useful after the end
   * of input to determine if a partial output batch exists to
   * send downstream.
   * @return true if the output batch has one or more rows
   */
  boolean hasOutputRows();

  /**
   * Reports if the output batch is full and must be sent
   * downstream. The output batch can be full in the middle
   * of a copy, in which case {@link #isCopyPending()} will
   * also return true.
   * 
   * This function also returns true if a schema change
   * occurred on the latest input row, in which case the
   * partially-completed batch of the old schema must be
   * flushed downstream.
   *
   * @return true if the output is full and must be harvested
   * and sent downstream
   */
  boolean isOutputFull();

  /**
   * Helper method to determine if a copy is pending: more rows
   * remain to be copied. If so, start a new output batch, which
   * will finish the copy. Do that before start a new input
   * batch.
   * @return
   */
  boolean isCopyPending();

  /**
   * Obtain the output batch. Returned as a vector container
   * since the output will not have a selection vector.
   *
   * @return a vector container holding the output batch
   */
  VectorContainer harvest();

  /**
   * Release resources, including any pending input batch
   * and any non-harvested output batch.
   */
  void close();
}