// org.apache.drill.exec.physical.resultSet.ResultSetLoader
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.resultSet;
import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.vector.BaseValueVector;
import org.apache.drill.exec.vector.complex.impl.VectorContainerWriter;
/**
 * Builds a result set (series of zero or more row sets) based on a defined
 * schema which may evolve (expand) over time. Automatically rolls "overflow"
 * rows over when a batch fills.
 * <p>
 * Many of the methods in this interface verify that the loader is
 * in the proper state. For example, an exception is thrown if the caller
 * attempts to save a row before starting a batch. However, the per-column
 * write methods are checked only through assertions that should be enabled
 * during testing, but will be disabled during production.
 *
 * @see VectorContainerWriter the class which this class replaces
 */
public interface ResultSetLoader {

  /**
   * Default row count, equal to
   * {@link BaseValueVector#INITIAL_VALUE_ALLOCATION}.
   */
  int DEFAULT_ROW_COUNT = BaseValueVector.INITIAL_VALUE_ALLOCATION;

  /**
   * Context for error messages.
   *
   * @return the error context
   */
  CustomErrorContext errorContext();

  /**
   * Current schema version. The version increments by one each time
   * a column is added.
   *
   * @return the current schema version
   */
  int schemaVersion();

  /**
   * Adjust the number of rows to produce in the next batch. Takes
   * effect after the next call to {@link #startBatch()}.
   *
   * @param count target batch row count
   */
  void setTargetRowCount(int count);

  /**
   * The number of rows produced by this loader (as configured in the loader
   * options).
   *
   * @return the target row count for batches that this loader produces
   */
  int targetRowCount();

  /**
   * The maximum number of rows for the present batch. Will be the lesser
   * of the {@link #targetRowCount()} and the overall scan limit remaining.
   *
   * @return the maximum number of rows for the present batch
   */
  int maxBatchSize();

  /**
   * The largest vector size produced by this loader (as specified by
   * the value vector limit).
   *
   * @return the largest vector size. Attempting to extend a vector beyond
   * this limit causes automatic vector overflow and terminates the
   * in-flight batch, even if the batch has not yet reached the target
   * row count
   */
  int targetVectorSize();

  /**
   * Total number of batches created. Includes the current batch if
   * the row count in this batch is non-zero.
   *
   * @return the number of batches produced including the current
   * one
   */
  int batchCount();

  /**
   * Total number of rows loaded for all previous batches and the
   * current batch.
   *
   * @return total row count
   */
  long totalRowCount();

  /**
   * Report whether the loader currently holds rows. If within a batch,
   * reports if at least one row has been read (which might be a look-ahead
   * row). If between batches, reports if a look-ahead row is available.
   *
   * @return {@code true} if at least one row is available to harvest,
   * {@code false} otherwise
   */
  boolean hasRows();

  /**
   * Start a new row batch. Valid only when first started, or after the
   * previous batch has been harvested.
   *
   * @return {@code true} if another batch can be read, {@code false} if
   * the reader has reached the given scan limit.
   */
  boolean startBatch();

  /**
   * Writer for the top-level tuple (the entire row). Valid only when
   * the mutator is actively writing a batch (after {@link #startBatch()}
   * but before {@link #harvest()}).
   *
   * @return writer for the top-level columns
   */
  RowSetLoader writer();

  /**
   * Reports whether the loader is in a writable state. The writable state
   * occurs only when a batch has been started, and before that batch
   * becomes full.
   *
   * @return {@code true} if the client can add a row to the loader,
   * {@code false} if not
   */
  boolean writeable();

  /**
   * Load a row using column values passed as variable-length arguments.
   * Expects map values to be represented as an array.
   * A schema of {@code (a:int, b:map(c:varchar))} would be set as<br>
   * {@code setRow(10, new Object[] {"foo"});}<br>
   * Values of arrays can be expressed as a Java array. A schema of
   * {@code (a:int, b:int[])} can be set as<br>
   * {@code setRow(10, new int[] {100, 200});}<br>
   * Primarily for testing, too slow for production code.
   * <p>
   * If the row consists of a single map or list, then the one value will be
   * an {@code Object} array, creating an ambiguity. Use
   * {@code writer().set(0, value);} in this case.
   *
   * @param values column values in column index order
   * @return this loader
   */
  ResultSetLoader setRow(Object... values);

  /**
   * Requests to skip the given number of rows. Returns the number of rows
   * actually skipped (which is limited by batch count).
   * <p>
   * Used in {@code SELECT COUNT(*)} style queries when the downstream
   * operators want just record count, but no actual rows.
   * <p>
   * Also used to fill in a batch of only null values (such as filling
   * in a set of null vectors for unprojected columns).
   *
   * @param requestedCount
   *          the number of rows to skip
   * @return the actual number of rows skipped, which may be less than the
   *         requested amount. If less, the client should call this method for
   *         multiple batches until the requested count is reached
   */
  int skipRows(int requestedCount);

  /**
   * Reports if this is an empty projection such as occurs in a
   * {@code SELECT COUNT(*)} query. If the projection is empty, then
   * the downstream needs only the row count set in each batch, but no
   * actual vectors will be created. In this case, the client can do
   * the work to populate rows (the data will be discarded), or can call
   * {@link #skipRows(int)} to skip over the number of rows that would
   * have been read if any data had been projected.
   * <p>
   * Note that the empty schema case can also occur if the project list
   * from the {@code SELECT} clause is disjoint from the table schema.
   * For example, {@code SELECT a, b} from a table with schema
   * {@code (c, d)}.
   *
   * @return {@code true} if no columns are actually projected,
   * {@code false} if at least one column is projected
   */
  boolean isProjectionEmpty();

  /**
   * Returns the active output schema; the schema used by the writers,
   * minus any unprojected columns. This is usually the same as the
   * output schema, but may differ if the writer adds columns during
   * an overflow row. Unlike the output schema, this schema is defined
   * as long as the loader is open.
   *
   * @return the active output schema
   */
  TupleMetadata activeSchema();

  /**
   * Returns the output container which holds (or will hold) batches
   * from this loader. For use when the container is needed prior
   * to "harvesting" a batch. The data is not valid until
   * {@link #harvest()} is called, and is no longer valid once
   * {@link #startBatch()} is called.
   *
   * @return container used to publish results from this loader
   */
  VectorContainer outputContainer();

  /**
   * Harvest the current row batch, and reset the mutator
   * to the start of the next row batch (which may already contain
   * an overflow row).
   * <p>
   * The schema of the returned container is defined as:
   * <ul>
   * <li>The schema as passed in via the loader options, plus</li>
   * <li>Columns added dynamically during write, minus</li>
   * <li>Any columns not included in the project list, minus</li>
   * <li>Any columns added in the overflow row.</li>
   * </ul>
   * That is, column order is as defined by the initial schema and column
   * additions. In particular, the schema order is <b>not</b> defined by
   * the projection list. (Another mechanism is required to reorder columns
   * for the actual projection.)
   *
   * @return the row batch to send downstream
   */
  VectorContainer harvest();

  /**
   * After a {@link #harvest()} call, call this method to determine if
   * the scan limit has been hit. If so, treat this as the final batch
   * for the reader, even if more data is available to read.
   *
   * @return {@code true} if the scan has reached a set scan row limit,
   * {@code false} if there is no limit, or more rows can be read.
   */
  boolean atLimit();

  /**
   * The schema of the harvested batch. Valid until the start of the
   * next batch.
   *
   * @return the extended schema of the harvested batch which includes
   * any allocation hints used when creating the batch
   */
  TupleMetadata outputSchema();

  /**
   * Peek at the internal vector cache for readers that need a bit of help
   * resolving types based on what was previously seen.
   *
   * @return real or dummy vector cache
   */
  ResultVectorCache vectorCache();

  /**
   * Called after all rows are returned, whether because no more data is
   * available, or the caller wishes to cancel the current row batch
   * and complete.
   */
  void close();
}