Please wait. This can take some minutes ...
Downloading a project requires significant server resources. Please understand that we have to cover our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
org.apache.drill.exec.physical.impl.ScanBatch Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.impl;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.drill.common.AutoCloseables;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.map.CaseInsensitiveMap;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.base.PhysicalOperator;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.CloseableRecordBatch;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.TypedFieldId;
import org.apache.drill.exec.record.VectorAccessibleUtilities;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.WritableBatch;
import org.apache.drill.exec.record.selection.SelectionVector2;
import org.apache.drill.exec.record.selection.SelectionVector4;
import org.apache.drill.exec.store.RecordReader;
import org.apache.drill.exec.testing.ControlsInjector;
import org.apache.drill.exec.testing.ControlsInjectorFactory;
import org.apache.drill.exec.util.CallBack;
import org.apache.drill.exec.util.record.RecordBatchStats;
import org.apache.drill.exec.util.record.RecordBatchStats.RecordBatchIOType;
import org.apache.drill.exec.util.record.RecordBatchStats.RecordBatchStatsContext;
import org.apache.drill.exec.vector.AllocationHelper;
import org.apache.drill.exec.vector.NullableVarCharVector;
import org.apache.drill.exec.vector.SchemaChangeCallBack;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.shaded.guava.com.google.common.annotations.VisibleForTesting;
import org.apache.drill.shaded.guava.com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.netty.buffer.DrillBuf;
/**
* Record batch used for a particular scan. Operators against one or more
*/
public class ScanBatch implements CloseableRecordBatch {
private static final Logger logger = LoggerFactory.getLogger(ScanBatch.class);
private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(ScanBatch.class);
/** Main collection of fields' value vectors. */
private final VectorContainer container = new VectorContainer();
private int recordCount;
private final FragmentContext context;
private final OperatorContext oContext;
private Iterator extends RecordReader> readers;
private RecordReader currentReader;
private BatchSchema schema;
private final Mutator mutator;
private boolean done;
private final Iterator> implicitColumns;
private Map implicitValues;
private final BufferAllocator allocator;
private final List> implicitColumnList;
private String currentReaderClassName;
private final RecordBatchStatsContext batchStatsContext;
// Represents last outcome of next(). If an Exception is thrown
// during the method's execution a value IterOutcome.STOP will be assigned.
private IterOutcome lastOutcome;
private List readerList; // needed for repeatable scanners
private boolean isRepeatableScan; // needed for repeatable scanners
/**
*
* @param context
* @param oContext
* @param readerList
* @param implicitColumnList : either an empty list when all the readers do not have implicit
* columns, or there is a one-to-one mapping between reader and implicitColumns.
*/
public ScanBatch(FragmentContext context,
OperatorContext oContext, List extends RecordReader> readerList,
List> implicitColumnList) {
this.context = context;
this.readers = readerList.iterator();
this.implicitColumns = implicitColumnList.iterator();
if (!readers.hasNext()) {
throw UserException.internalError(
new ExecutionSetupException("A scan batch must contain at least one reader."))
.build(logger);
}
this.oContext = oContext;
allocator = oContext.getAllocator();
mutator = new Mutator(oContext, allocator, container);
oContext.getStats().startProcessing();
try {
if (!verifyImplcitColumns(readerList.size(), implicitColumnList)) {
Exception ex = new ExecutionSetupException("Either implicit column list does not have same cardinality as reader list, "
+ "or implicit columns are not same across all the record readers!");
throw UserException.internalError(ex)
.addContext("Setup failed for", readerList.get(0).getClass().getSimpleName())
.build(logger);
}
this.implicitColumnList = implicitColumnList;
addImplicitVectors();
currentReader = null;
batchStatsContext = new RecordBatchStatsContext(context, oContext);
} finally {
oContext.getStats().stopProcessing();
}
}
public ScanBatch(PhysicalOperator subScanConfig, FragmentContext context,
List readers)
throws ExecutionSetupException {
this(context, context.newOperatorContext(subScanConfig),
readers, Collections.> emptyList());
}
public ScanBatch(PhysicalOperator subScanConfig, FragmentContext context,
List readerList, boolean isRepeatableScan)
throws ExecutionSetupException {
this(context, context.newOperatorContext(subScanConfig),
readerList, Collections.> emptyList());
this.readerList = readerList;
this.isRepeatableScan = isRepeatableScan;
}
@Override
public FragmentContext getContext() {
return context;
}
public OperatorContext getOperatorContext() { return oContext; }
@Override
public BatchSchema getSchema() {
return schema;
}
@Override
public int getRecordCount() {
return recordCount;
}
@Override
public void cancel() {
done = true;
releaseAssets();
}
/**
* This method is to perform scan specific actions when the scan needs to
* clean/reset readers and return NONE status
*
* @return NONE
*/
private IterOutcome cleanAndReturnNone() {
if (isRepeatableScan) {
readers = readerList.iterator();
} else {
releaseAssets(); // All data has been read. Release resource.
done = true;
}
return IterOutcome.NONE;
}
/**
* When receive zero record from current reader, update reader accordingly,
* and return the decision whether the iteration should continue
* @return whether we could continue iteration
* @throws Exception
*/
private boolean shouldContinueAfterNoRecords() {
logger.trace("scan got 0 record.");
if (isRepeatableScan) {
if (!currentReader.hasNext()) {
currentReader = null;
readers = readerList.iterator();
return false;
}
return true;
} else { // Regular scan
closeCurrentReader();
return true; // In regular case, we always continue the iteration, if no more reader, we will break out at the head of loop
}
}
private void closeCurrentReader() {
AutoCloseables.closeSilently(currentReader);
currentReader = null;
}
private IterOutcome internalNext() {
while (true) {
if (currentReader == null && !getNextReaderIfHas()) {
logger.trace("currentReader is null");
return cleanAndReturnNone();
}
injector.injectChecked(context.getExecutionControls(), "next-allocate", OutOfMemoryException.class);
currentReader.allocate(mutator.fieldVectorMap());
recordCount = currentReader.next();
logger.trace("currentReader.next return recordCount={}", recordCount);
Preconditions.checkArgument(recordCount >= 0, "recordCount from RecordReader.next() should not be negative");
boolean isNewSchema = mutator.isNewSchema();
// If scan is done for collecting metadata, additional implicit column `$project_metadata$`
// will be projected to handle the case when scan may return empty results (scan on empty file or row group).
// Scan will return single row for the case when empty file or row group is present with correct
// values of other implicit columns (like `fqn`, `rgi`), so this metadata will be stored to the Metastore.
if (implicitValues != null) {
String projectMetadataColumn = context.getOptions().getOption(ExecConstants.IMPLICIT_PROJECT_METADATA_COLUMN_LABEL_VALIDATOR);
if (recordCount > 0) {
// Sets the implicit value to null to signal that some results were returned and there is no need for creating an additional record.
implicitValues.replace(projectMetadataColumn, null);
} else if (Boolean.FALSE.toString().equals(implicitValues.get(projectMetadataColumn))) {
recordCount++;
// Sets implicit value to true to avoid affecting resulting count value.
implicitValues.put(projectMetadataColumn, Boolean.TRUE.toString());
}
}
populateImplicitVectors();
mutator.container.setValueCount(recordCount);
oContext.getStats().batchReceived(0, recordCount, isNewSchema);
boolean toContinueIter = true;
if (recordCount == 0) {
// If we got 0 record, we may need to clean and exit, but we may need to return a new schema in below code block,
// so we use toContinueIter to mark the decision whether we should continue the iteration
toContinueIter = shouldContinueAfterNoRecords();
}
if (isNewSchema) {
// Even when recordCount = 0, we should return return OK_NEW_SCHEMA if current reader presents a new schema.
// This could happen when data sources have a non-trivial schema with 0 row.
container.buildSchema(SelectionVectorMode.NONE);
schema = container.getSchema();
lastOutcome = IterOutcome.OK_NEW_SCHEMA;
return lastOutcome;
}
// Handle case of same schema.
if (recordCount == 0) {
if (toContinueIter) {
continue; // Skip to next loop iteration if reader returns 0 row and has same schema.
} else {
// Return NONE if recordCount == 0 && !isNewSchema
lastOutcome = IterOutcome.NONE;
return lastOutcome;
}
} else {
// return OK if recordCount > 0 && ! isNewSchema
lastOutcome = IterOutcome.OK;
return lastOutcome;
}
}
}
@Override
public IterOutcome next() {
if (done) {
lastOutcome = IterOutcome.NONE;
return lastOutcome;
}
oContext.getStats().startProcessing();
try {
return internalNext();
} catch (OutOfMemoryException ex) {
clearFieldVectorMap();
throw UserException.memoryError(ex).build(logger);
} catch (UserException ex) {
throw ex;
} catch (Exception ex) {
throw UserException.internalError(ex).build(logger);
} finally {
oContext.getStats().stopProcessing();
}
}
private void releaseAssets() {
container.zeroVectors();
}
private void clearFieldVectorMap() {
VectorAccessibleUtilities.clear(mutator.fieldVectorMap().values());
VectorAccessibleUtilities.clear(mutator.implicitFieldVectorMap.values());
}
private boolean getNextReaderIfHas() {
if (!readers.hasNext()) {
return false;
}
currentReader = readers.next();
if (!isRepeatableScan && readers.hasNext()) {
readers.remove();
}
implicitValues = implicitColumns.hasNext() ? implicitColumns.next() : null;
currentReaderClassName = currentReader.getClass().getSimpleName();
try {
currentReader.setup(oContext, mutator);
} catch (ExecutionSetupException e) {
closeCurrentReader();
throw UserException.executionError(e)
.addContext("Failed to setup reader", currentReaderClassName)
.build(logger);
}
return true;
}
private void addImplicitVectors() {
try {
if (!implicitColumnList.isEmpty()) {
for (String column : implicitColumnList.get(0).keySet()) {
final MaterializedField field = MaterializedField.create(column, Types.optional(MinorType.VARCHAR));
mutator.addField(field, NullableVarCharVector.class, true /*implicit field*/);
}
}
} catch(SchemaChangeException e) {
// No exception should be thrown here.
throw UserException.internalError(e)
.addContext("Failure while allocating implicit vectors")
.build(logger);
}
}
private void populateImplicitVectors() {
mutator.populateImplicitVectors(implicitValues, recordCount);
}
@Override
public SelectionVector2 getSelectionVector2() {
return null;
}
@Override
public SelectionVector4 getSelectionVector4() {
throw new UnsupportedOperationException();
}
@Override
public TypedFieldId getValueVectorId(SchemaPath path) {
return container.getValueVectorId(path);
}
@Override
public VectorWrapper> getValueAccessorById(Class> clazz, int... ids) {
return container.getValueAccessorById(clazz, ids);
}
@SuppressWarnings("unused")
private void logRecordBatchStats() {
final int MAX_FQN_LENGTH = 50;
if (recordCount == 0) {
return; // NOOP
}
RecordBatchStats.logRecordBatchStats(RecordBatchIOType.OUTPUT, getFQNForLogging(MAX_FQN_LENGTH), this, batchStatsContext);
}
/** Might truncate the FQN if too long */
private String getFQNForLogging(int maxLength) {
final String FQNKey = "FQN";
final ValueVector v = mutator.implicitFieldVectorMap.get(FQNKey);
final Object fqnObj;
if (v == null
|| v.getAccessor().getValueCount() == 0
|| (fqnObj = ((NullableVarCharVector) v).getAccessor().getObject(0)) == null) {
return "NA";
}
String fqn = fqnObj.toString();
if (fqn != null && fqn.length() > maxLength) {
fqn = fqn.substring(fqn.length() - maxLength, fqn.length());
}
return fqn;
}
/**
* Row set mutator implementation provided to record readers created by
* this scan batch. Made visible so that tests can create this mutator
* without also needing a ScanBatch instance. (This class is really independent
* of the ScanBatch, but resides here for historical reasons. This is,
* in turn, the only use of the generated vector readers in the vector
* package.)
*/
@VisibleForTesting
public static class Mutator implements OutputMutator {
/** Flag keeping track whether top-level schema has changed since last inquiry (via #isNewSchema}).
* It's initialized to false, or reset to false after #isNewSchema or after #clear, until a new value vector
* or a value vector with different type is added to fieldVectorMap.
**/
private boolean schemaChanged;
/** Regular fields' value vectors indexed by fields' keys. */
private final CaseInsensitiveMap regularFieldVectorMap =
CaseInsensitiveMap.newHashMap();
/** Implicit fields' value vectors index by fields' keys. */
private final CaseInsensitiveMap implicitFieldVectorMap =
CaseInsensitiveMap.newHashMap();
private final SchemaChangeCallBack callBack = new SchemaChangeCallBack();
private final BufferAllocator allocator;
private final VectorContainer container;
private final OperatorContext oContext;
public Mutator(OperatorContext oContext, BufferAllocator allocator, VectorContainer container) {
this.oContext = oContext;
this.allocator = allocator;
this.container = container;
this.schemaChanged = false;
}
public Map fieldVectorMap() {
return regularFieldVectorMap;
}
public Map implicitFieldVectorMap() {
return implicitFieldVectorMap;
}
@Override
public T addField(MaterializedField field,
Class clazz) throws SchemaChangeException {
return addField(field, clazz, false);
}
@Override
public void allocate(int recordCount) {
for (final ValueVector v : regularFieldVectorMap.values()) {
AllocationHelper.allocate(v, recordCount, 50, 10);
}
}
/**
* Reports whether schema has changed (field was added or re-added) since
* last call to {@link #isNewSchema}. Returns true at first call.
*/
@Override
public boolean isNewSchema() {
// Check if top-level schema or any of the deeper map schemas has changed.
// Note: Callback's getSchemaChangedAndReset() must get called in order
// to reset it and avoid false reports of schema changes in future. (Be
// careful with short-circuit OR (||) operator.)
final boolean deeperSchemaChanged = callBack.getSchemaChangedAndReset();
if (schemaChanged || deeperSchemaChanged) {
schemaChanged = false;
return true;
}
return false;
}
@Override
public DrillBuf getManagedBuffer() {
return oContext.getManagedBuffer();
}
@Override
public CallBack getCallBack() {
return callBack;
}
@Override
public void clear() {
regularFieldVectorMap.clear();
implicitFieldVectorMap.clear();
container.clear();
schemaChanged = false;
}
private T addField(MaterializedField field,
Class clazz, boolean isImplicitField) throws SchemaChangeException {
Map fieldVectorMap;
if (isImplicitField) {
fieldVectorMap = implicitFieldVectorMap;
} else {
fieldVectorMap = regularFieldVectorMap;
}
if (!isImplicitField && implicitFieldVectorMap.containsKey(field.getName()) ||
isImplicitField && regularFieldVectorMap.containsKey(field.getName())) {
throw new SchemaChangeException(
String.format(
"It's not allowed to have regular field and implicit field share common name %s. "
+ "Either change regular field name in datasource, or change the default implicit field names.",
field.getName()));
}
// Check if the field exists.
ValueVector v = fieldVectorMap.get(field.getName());
// for the cases when fields have a different scale or precision,
// the new vector should be used to handle the value correctly
if (v == null || !v.getField().getType().equals(field.getType())) {
// Field does not exist--add it to the map and the output container.
v = TypeHelper.getNewVector(field, allocator, callBack);
if (!clazz.isAssignableFrom(v.getClass())) {
throw new SchemaChangeException(
String.format(
"The class that was provided, %s, does not correspond to the "
+ "expected vector type of %s.",
clazz.getSimpleName(), v.getClass().getSimpleName()));
}
final ValueVector old = fieldVectorMap.put(field.getName(), v);
if (old != null) {
old.clear();
container.remove(old);
}
container.add(v);
// Only mark schema change for regular vectors added to the container; implicit schema is constant.
if (!isImplicitField) {
schemaChanged = true;
}
}
return clazz.cast(v);
}
private void populateImplicitVectors(Map implicitValues, int recordCount) {
if (implicitValues != null) {
for (Map.Entry entry : implicitValues.entrySet()) {
final NullableVarCharVector v = (NullableVarCharVector) implicitFieldVectorMap.get(entry.getKey());
String val;
if ((val = entry.getValue()) != null) {
AllocationHelper.allocate(v, recordCount, val.length());
final byte[] bytes = val.getBytes(StandardCharsets.UTF_8);
for (int j = 0; j < recordCount; j++) {
v.getMutator().setSafe(j, bytes, 0, bytes.length);
}
v.getMutator().setValueCount(recordCount);
} else {
AllocationHelper.allocate(v, recordCount, 0);
v.getMutator().setValueCount(recordCount);
}
}
}
}
}
@Override
public Iterator> iterator() {
return container.iterator();
}
@Override
public WritableBatch getWritableBatch() {
return WritableBatch.get(this);
}
@Override
public void close() throws Exception {
container.clear();
mutator.clear();
closeCurrentReader();
}
@Override
public VectorContainer getOutgoingContainer() {
throw new UnsupportedOperationException(
String.format("You should not call getOutgoingContainer() for class %s",
this.getClass().getCanonicalName()));
}
@Override
public VectorContainer getContainer() {
return container;
}
/**
* Verify list of implicit column values is valid input:
* - Either implicit column list is empty;
* - Or implicit column list has same sie as reader list, and the key set is same across all the readers.
* @param numReaders
* @param implicitColumnList
* @return return true if
*/
private boolean verifyImplcitColumns(int numReaders, List> implicitColumnList) {
if (implicitColumnList.isEmpty()) {
return true;
}
if (numReaders != implicitColumnList.size()) {
return false;
}
Map firstMap = implicitColumnList.get(0);
for (int i = 1; i< implicitColumnList.size(); i++) {
Map nonFirstMap = implicitColumnList.get(i);
if (!firstMap.keySet().equals(nonFirstMap.keySet())) {
return false;
}
}
return true;
}
@Override
public void dump() {
logger.error("ScanBatch[container={}, currentReader={}, schema={}]", container, currentReader, schema);
}
}