
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.impl;

import io.netty.buffer.DrillBuf;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.OperatorContext;
import org.apache.drill.exec.physical.base.PhysicalOperator;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.CloseableRecordBatch;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.TypedFieldId;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.WritableBatch;
import org.apache.drill.exec.record.selection.SelectionVector2;
import org.apache.drill.exec.record.selection.SelectionVector4;
import org.apache.drill.exec.server.options.OptionValue;
import org.apache.drill.exec.store.RecordReader;
import org.apache.drill.exec.testing.ControlsInjector;
import org.apache.drill.exec.testing.ControlsInjectorFactory;
import org.apache.drill.exec.util.CallBack;
import org.apache.drill.exec.vector.AllocationHelper;
import org.apache.drill.exec.vector.NullableVarCharVector;
import org.apache.drill.exec.vector.SchemaChangeCallBack;
import org.apache.drill.exec.vector.ValueVector;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Record batch used for a particular scan. Operates against one or more
 * {@link RecordReader}s.
 */
public class ScanBatch implements CloseableRecordBatch {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ScanBatch.class);
  private static final ControlsInjector injector = ControlsInjectorFactory.getInjector(ScanBatch.class);

  /** Main collection of fields' value vectors. */
  private final VectorContainer container = new VectorContainer();

  /** Fields' value vectors indexed by fields' keys. */
  private final Map<String, ValueVector> fieldVectorMap =
      Maps.newHashMap();

  private int recordCount;
  private final FragmentContext context;
  private final OperatorContext oContext;
  private Iterator<RecordReader> readers;
  private RecordReader currentReader;
  private BatchSchema schema;
  private final Mutator mutator = new Mutator();
  private Iterator<String[]> partitionColumns;
  private String[] partitionValues;
  private List<ValueVector> partitionVectors;
  private List<Integer> selectedPartitionColumns;
  private String partitionColumnDesignator;
  private boolean done = false;
  private SchemaChangeCallBack callBack = new SchemaChangeCallBack();
  private boolean hasReadNonEmptyFile = false;


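  /**
   * Creates a scan batch over the given readers.
   *
   * @param subScanConfig physical operator configuration for this scan
   * @param context fragment context
   * @param oContext operator context used for allocation and stats
   * @param readers readers to scan; must contain at least one reader
   * @param partitionColumns per-reader directory (partition) values, consumed in step with {@code readers}
   * @param selectedPartitionColumns indexes of the partition columns to materialize as output vectors
   * @throws ExecutionSetupException if no reader is supplied or the first reader fails to set up
   */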
  public ScanBatch(PhysicalOperator subScanConfig, FragmentContext context,
                   OperatorContext oContext, Iterator<RecordReader> readers,
                   List<String[]> partitionColumns,
                   List<Integer> selectedPartitionColumns) throws ExecutionSetupException {
    this.context = context;
    this.readers = readers;
    if (!readers.hasNext()) {
      throw new ExecutionSetupException("A scan batch must contain at least one reader.");
    }
    currentReader = readers.next();
    this.oContext = oContext;

    boolean setup = false;
    try {
      oContext.getStats().startProcessing();
      currentReader.setup(oContext, mutator);
      setup = true;
    } finally {
      // if we had an exception during setup, make sure to release existing data.
      if (!setup) {
        try {
          currentReader.close();
        } catch(final Exception e) {
          throw new ExecutionSetupException(e);
        }
      }
      oContext.getStats().stopProcessing();
    }
    this.partitionColumns = partitionColumns.iterator();
    partitionValues = this.partitionColumns.hasNext() ? this.partitionColumns.next() : null;
    this.selectedPartitionColumns = selectedPartitionColumns;

    // TODO Remove null check after DRILL-2097 is resolved. That JIRA refers to test cases that do not initialize
    // options; so labelValue = null.
    final OptionValue labelValue = context.getOptions().getOption(ExecConstants.FILESYSTEM_PARTITION_COLUMN_LABEL);
    partitionColumnDesignator = labelValue == null ? "dir" : labelValue.string_val;

    addPartitionVectors();
  }

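  /**
   * Convenience constructor for scans without partition (directory) columns; a new
   * OperatorContext is created from the fragment context.
   */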
  public ScanBatch(PhysicalOperator subScanConfig, FragmentContext context,
                   Iterator<RecordReader> readers)
      throws ExecutionSetupException {
    this(subScanConfig, context,
        context.newOperatorContext(subScanConfig),
        readers, Collections.<String[]> emptyList(), Collections.<Integer> emptyList());
  }

  @Override
  public FragmentContext getContext() {
    return context;
  }

  @Override
  public BatchSchema getSchema() {
    return schema;
  }

  @Override
  public int getRecordCount() {
    return recordCount;
  }

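  /**
   * Leaf operator, so there is no upstream batch to forward the kill to. When
   * {@code sendUpstream} is set, the batch is marked done so future calls to next()
   * return NONE; otherwise the vector buffers are released immediately.
   */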
  @Override
  public void kill(boolean sendUpstream) {
    if (sendUpstream) {
      done = true;
    } else {
      releaseAssets();
    }
  }

  private void releaseAssets() {
    container.zeroVectors();
  }

  private void clearFieldVectorMap() {
    for (final ValueVector v : fieldVectorMap.values()) {
      v.clear();
    }
  }

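  /**
   * Reads the next batch from the current reader, closing exhausted readers and advancing
   * to the next one as needed. Returns OK_NEW_SCHEMA when the output schema has changed,
   * OK for a normal batch, NONE once all readers are exhausted, OUT_OF_MEMORY if vector
   * allocation fails, and STOP after a failure has been reported to the fragment context.
   */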
  @Override
  public IterOutcome next() {
    if (done) {
      return IterOutcome.NONE;
    }
    oContext.getStats().startProcessing();
    try {
      try {
        injector.injectChecked(context.getExecutionControls(), "next-allocate", OutOfMemoryException.class);

        currentReader.allocate(fieldVectorMap);
      } catch (OutOfMemoryException e) {
        logger.debug("Caught Out of Memory Exception", e);
        clearFieldVectorMap();
        return IterOutcome.OUT_OF_MEMORY;
      }
      while ((recordCount = currentReader.next()) == 0) {
        try {
          if (!readers.hasNext()) {
            // We're on the last reader, and it has no (more) rows.
            currentReader.close();
            releaseAssets();
            done = true;  // have any future call to next() return NONE

            if (mutator.isNewSchema()) {
              // This last reader has a new schema (e.g., we have a zero-row
              // file or other source).  (Note that some sources have a non-
              // null/non-trivial schema even when there are no rows.)

              container.buildSchema(SelectionVectorMode.NONE);
              schema = container.getSchema();

              return IterOutcome.OK_NEW_SCHEMA;
            }
            return IterOutcome.NONE;
          }
          // At this point, the reader that hit its end is not the last reader.

          // If all the files we have read so far are just empty, the schema is not useful
          if (! hasReadNonEmptyFile) {
            container.clear();
            for (ValueVector v : fieldVectorMap.values()) {
              v.clear();
            }
            fieldVectorMap.clear();
          }

          currentReader.close();
          currentReader = readers.next();
          partitionValues = partitionColumns.hasNext() ? partitionColumns.next() : null;
          currentReader.setup(oContext, mutator);
          try {
            currentReader.allocate(fieldVectorMap);
          } catch (OutOfMemoryException e) {
            logger.debug("Caught OutOfMemoryException");
            clearFieldVectorMap();
            return IterOutcome.OUT_OF_MEMORY;
          }
          addPartitionVectors();

        } catch (ExecutionSetupException e) {
          this.context.fail(e);
          releaseAssets();
          return IterOutcome.STOP;
        }
      }
      // At this point, the current reader has read 1 or more rows.

      hasReadNonEmptyFile = true;
      populatePartitionVectors();

      for (VectorWrapper<?> w : container) {
        w.getValueVector().getMutator().setValueCount(recordCount);
      }


      // this is a slight misuse of this metric but it will allow Readers to report how many records they generated.
      final boolean isNewSchema = mutator.isNewSchema();
      oContext.getStats().batchReceived(0, getRecordCount(), isNewSchema);

      if (isNewSchema) {
        container.buildSchema(SelectionVectorMode.NONE);
        schema = container.getSchema();
        return IterOutcome.OK_NEW_SCHEMA;
      } else {
        return IterOutcome.OK;
      }
    } catch (OutOfMemoryException ex) {
      context.fail(UserException.memoryError(ex).build(logger));
      return IterOutcome.STOP;
    } catch (Exception ex) {
      logger.debug("Failed to read the batch. Stopping...", ex);
      context.fail(ex);
      return IterOutcome.STOP;
    } finally {
      oContext.getStats().stopProcessing();
    }
  }

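  /**
   * Creates (or re-creates) one nullable VARCHAR vector per selected partition column, named
   * with the partition column designator followed by the column index (e.g. "dir0", "dir1").
   * Any previously created partition vectors are cleared first.
   */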
  private void addPartitionVectors() throws ExecutionSetupException {
    try {
      if (partitionVectors != null) {
        for (ValueVector v : partitionVectors) {
          v.clear();
        }
      }
      partitionVectors = Lists.newArrayList();
      for (int i : selectedPartitionColumns) {
        final MaterializedField field =
            MaterializedField.create(SchemaPath.getSimplePath(partitionColumnDesignator + i).getAsUnescapedPath(),
                                     Types.optional(MinorType.VARCHAR));
        final ValueVector v = mutator.addField(field, NullableVarCharVector.class);
        partitionVectors.add(v);
      }
    } catch(SchemaChangeException e) {
      throw new ExecutionSetupException(e);
    }
  }

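  /**
   * Writes the current reader's partition value into each selected partition vector,
   * repeated once per record in the batch. Partition columns for which this reader has no
   * value are allocated but left as null entries.
   */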
  private void populatePartitionVectors() {
    for (int index = 0; index < selectedPartitionColumns.size(); index++) {
      final int i = selectedPartitionColumns.get(index);
      final NullableVarCharVector v = (NullableVarCharVector) partitionVectors.get(index);
      if (partitionValues.length > i) {
        final String val = partitionValues[i];
        AllocationHelper.allocate(v, recordCount, val.length());
        final byte[] bytes = val.getBytes();
        for (int j = 0; j < recordCount; j++) {
          v.getMutator().setSafe(j, bytes, 0, bytes.length);
        }
        v.getMutator().setValueCount(recordCount);
      } else {
        AllocationHelper.allocate(v, recordCount, 0);
        v.getMutator().setValueCount(recordCount);
      }
    }
  }

  @Override
  public SelectionVector2 getSelectionVector2() {
    throw new UnsupportedOperationException();
  }

  @Override
  public SelectionVector4 getSelectionVector4() {
    throw new UnsupportedOperationException();
  }

  @Override
  public TypedFieldId getValueVectorId(SchemaPath path) {
    return container.getValueVectorId(path);
  }

  @Override
  public VectorWrapper<?> getValueAccessorById(Class<?> clazz, int... ids) {
    return container.getValueAccessorById(clazz, ids);
  }

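  /**
   * OutputMutator implementation handed to each RecordReader. Vectors are registered by
   * field path, and adding or replacing a vector marks the schema as changed until the
   * next call to isNewSchema().
   */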
  private class Mutator implements OutputMutator {
    /** Whether schema has changed since last inquiry (via {@link #isNewSchema}).  Is
     *  true before first inquiry. */
    private boolean schemaChanged = true;


    @SuppressWarnings("unchecked")
    @Override
    public <T extends ValueVector> T addField(MaterializedField field,
                                              Class<T> clazz) throws SchemaChangeException {
      // Check if the field exists.
      ValueVector v = fieldVectorMap.get(field.getPath());
      if (v == null || v.getClass() != clazz) {
        // Field does not exist--add it to the map and the output container.
        v = TypeHelper.getNewVector(field, oContext.getAllocator(), callBack);
        if (!clazz.isAssignableFrom(v.getClass())) {
          throw new SchemaChangeException(
              String.format(
                  "The class that was provided, %s, does not correspond to the "
                  + "expected vector type of %s.",
                  clazz.getSimpleName(), v.getClass().getSimpleName()));
        }

        final ValueVector old = fieldVectorMap.put(field.getPath(), v);
        if (old != null) {
          old.clear();
          container.remove(old);
        }

        container.add(v);
        // Added new vectors to the container--mark that the schema has changed.
        schemaChanged = true;
      }

      return clazz.cast(v);
    }

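    /**
     * Allocates memory for all registered field vectors. The sizing hints passed to
     * AllocationHelper (roughly 50 bytes per value and 10 child values per entry for
     * repeated vectors) are fixed heuristics, not reader-supplied estimates.
     */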
    @Override
    public void allocate(int recordCount) {
      for (final ValueVector v : fieldVectorMap.values()) {
        AllocationHelper.allocate(v, recordCount, 50, 10);
      }
    }

    /**
     * Reports whether schema has changed (field was added or re-added) since
     * last call to {@link #isNewSchema}.  Returns true at first call.
     */
    @Override
    public boolean isNewSchema() {
      // Check if top-level schema or any of the deeper map schemas has changed.

      // Note:  Callback's getSchemaChangedAndReset() must get called in order
      // to reset it and avoid false reports of schema changes in future.  (Be
      // careful with short-circuit OR (||) operator.)

      final boolean deeperSchemaChanged = callBack.getSchemaChangedAndReset();
      if (schemaChanged || deeperSchemaChanged) {
        schemaChanged = false;
        return true;
      }
      return false;
    }

    @Override
    public DrillBuf getManagedBuffer() {
      return oContext.getManagedBuffer();
    }

    @Override
    public CallBack getCallBack() {
      return callBack;
    }
  }

  @Override
  public Iterator<VectorWrapper<?>> iterator() {
    return container.iterator();
  }

  @Override
  public WritableBatch getWritableBatch() {
    return WritableBatch.get(this);
  }

  @Override
  public void close() throws Exception {
    container.clear();
    for (final ValueVector v : partitionVectors) {
      v.clear();
    }
    fieldVectorMap.clear();
    currentReader.close();
  }

  @Override
  public VectorContainer getOutgoingContainer() {
    throw new UnsupportedOperationException(
        String.format("You should not call getOutgoingContainer() for class %s",
                      this.getClass().getCanonicalName()));
  }
}