com.databricks.jdbc.api.impl.arrow.ArrowResultChunk

package com.databricks.jdbc.api.impl.arrow;

import static com.databricks.jdbc.common.DatabricksJdbcConstants.IS_FAKE_SERVICE_TEST_PROP;
import static com.databricks.jdbc.common.util.DatabricksThriftUtil.createExternalLink;
import static com.databricks.jdbc.common.util.ValidationUtil.checkHTTPError;

import com.databricks.jdbc.common.CompressionCodec;
import com.databricks.jdbc.common.util.DecompressionUtil;
import com.databricks.jdbc.dbclient.IDatabricksHttpClient;
import com.databricks.jdbc.exception.DatabricksParsingException;
import com.databricks.jdbc.exception.DatabricksSQLException;
import com.databricks.jdbc.log.JdbcLogger;
import com.databricks.jdbc.log.JdbcLoggerFactory;
import com.databricks.jdbc.model.client.thrift.generated.TSparkArrowResultLink;
import com.databricks.jdbc.model.core.ExternalLink;
import com.databricks.sdk.service.sql.BaseChunkInfo;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.io.InputStream;
import java.net.SocketException;
import java.net.URISyntaxException;
import java.nio.channels.ClosedByInterruptException;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.util.TransferPair;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;

public class ArrowResultChunk {

  private static final JdbcLogger LOGGER = JdbcLoggerFactory.getLogger(ArrowResultChunk.class);

  /**
   * The status of a chunk would proceed along the following path:
   *
   * <ul>
   *   <li>Create placeholder for chunk, along with the chunk cardinal
   *   <li>Fetch chunk url
   *   <li>Submit task for data download
   *       <ul>
   *         <li>Download has completed
   *         <li>Download has failed and we will retry
   *         <li>Download has failed and we gave up
   *       </ul>
   *   <li>Data has been consumed and chunk is free to be released from memory
   * </ul>
   */
  enum ChunkStatus {
    /** Default status, though for the ArrowChunk, it should be initialized with Pending state */
    UNKNOWN,
    /** This is a placeholder for chunk, we don't even have the Url */
    PENDING,
    /** We have the Url for the chunk, and it is ready for download */
    URL_FETCHED,
    /** Download task has been submitted */
    DOWNLOAD_IN_PROGRESS,
    /** Data has been downloaded and ready for consumption */
    DOWNLOAD_SUCCEEDED,
    /** Result Chunk was of type inline arrow and extract is successful */
    EXTRACT_SUCCEEDED,
    /** Download has failed and it would be retried */
    DOWNLOAD_FAILED,
    /** Result Chunk was of type inline arrow and extract has failed */
    EXTRACT_FAILED,
    /** Download has failed and we have given up */
    DOWNLOAD_FAILED_ABORTED,
    /** Download has been cancelled */
    CANCELLED,
    /** Chunk memory has been consumed and released */
    CHUNK_RELEASED,
    DOWNLOAD_RETRY
  }

  private static final Integer SECONDS_BUFFER_FOR_EXPIRY = 60;

  final long numRows;
  long rowOffset;
  List<List<ValueVector>> recordBatchList;
  private final long chunkIndex;
  private ExternalLink chunkLink;
  private final String statementId;
  private Instant expiryTime;
  private ChunkStatus status;
  private final BufferAllocator rootAllocator;
  private String errorMessage;
  private boolean isDataInitialized;
  private static boolean injectError = false;
  private static int errorInjectionCountMaxValue = 0;
  private int errorInjectionCount = 0;

  private ArrowResultChunk(Builder builder) throws DatabricksParsingException {
    this.chunkIndex = builder.chunkIndex;
    this.numRows = builder.numRows;
    this.rowOffset = builder.rowOffset;
    this.chunkLink = builder.chunkLink;
    this.statementId = builder.statementId;
    this.expiryTime = builder.expiryTime;
    this.status = builder.status;
    this.rootAllocator = new RootAllocator(/* limit= */ Integer.MAX_VALUE);
    if (builder.inputStream != null) {
      // Data is already available
      try {
        initializeData(builder.inputStream);
        this.status = ChunkStatus.EXTRACT_SUCCEEDED;
      } catch (DatabricksSQLException | IOException e) {
        handleFailure(e, ChunkStatus.EXTRACT_FAILED);
      }
    }
  }

  public static Builder builder() {
    return new Builder();
  }

  public static class ArrowResultChunkIterator {
    private final ArrowResultChunk resultChunk;

    // total number of record batches in the chunk
    private final int recordBatchesInChunk;

    // index of record batch in chunk
    private int recordBatchCursorInChunk;

    // total number of rows in record batch under consideration
    private int rowsInRecordBatch;

    // current row index in current record batch
    private int rowCursorInRecordBatch;

    // total number of rows read
    private int rowsReadByIterator;

    ArrowResultChunkIterator(ArrowResultChunk resultChunk) {
      this.resultChunk = resultChunk;
      this.recordBatchesInChunk = resultChunk.getRecordBatchCountInChunk();
      // start before first batch
      this.recordBatchCursorInChunk = -1;
      // initialize to -1
      this.rowsInRecordBatch = -1;
      // start before first row
      this.rowCursorInRecordBatch = -1;
      // initialize rows read to 0
      this.rowsReadByIterator = 0;
    }

    /**
     * Moves iterator to the next row of the chunk. Returns false if it is at the last row in the
     * chunk.
     */
    boolean nextRow() {
      if (!hasNextRow()) {
        return false;
      }
      // Either not initialized or crossed record batch boundary
      if (rowsInRecordBatch < 0 || ++rowCursorInRecordBatch == rowsInRecordBatch) {
        // reset rowCursor to 0
        rowCursorInRecordBatch = 0;
        // Fetches number of rows in the record batch using the number of values in the first
        // column vector
        recordBatchCursorInChunk++;
        while (recordBatchCursorInChunk < recordBatchesInChunk
            && resultChunk.recordBatchList.get(recordBatchCursorInChunk).get(0).getValueCount()
                == 0) {
          recordBatchCursorInChunk++;
        }
        rowsInRecordBatch =
            resultChunk.recordBatchList.get(recordBatchCursorInChunk).get(0).getValueCount();
      }
      rowsReadByIterator++;
      return true;
    }

    /** Returns whether the next row in the chunk exists. */
    boolean hasNextRow() {
      if (rowsReadByIterator >= resultChunk.numRows) return false;
      // If there are more rows in record batch
      return (rowCursorInRecordBatch < rowsInRecordBatch - 1)
          // or there are more record batches to be processed
          || (recordBatchCursorInChunk < recordBatchesInChunk - 1);
    }

    /** Returns object in the current row at the specified columnIndex. */
    Object getColumnObjectAtCurrentRow(int columnIndex) {
      return this.resultChunk
          .getColumnVector(this.recordBatchCursorInChunk, columnIndex)
          .getObject(this.rowCursorInRecordBatch);
    }
  }

  @VisibleForTesting
  void setIsDataInitialized(boolean isDataInitialized) {
    this.isDataInitialized = isDataInitialized;
  }

  /** Sets link details for the given chunk. */
  void setChunkLink(ExternalLink chunk) {
    this.chunkLink = chunk;
    this.expiryTime = Instant.parse(chunk.getExpiration());
    this.status = ChunkStatus.URL_FETCHED;
  }

  /** Updates status for the chunk */
  void setStatus(ChunkStatus status) {
    this.status = status;
  }

  /** Checks if the link is valid */
  boolean isChunkLinkInvalid() {
    return status == ChunkStatus.PENDING
        || (!Boolean.parseBoolean(System.getProperty(IS_FAKE_SERVICE_TEST_PROP))
            && expiryTime.minusSeconds(SECONDS_BUFFER_FOR_EXPIRY).isBefore(Instant.now()));
  }

  /** Returns the status for the chunk */
  ChunkStatus getStatus() {
    return this.status;
  }

  void addHeaders(HttpGet getRequest, Map<String, String> headers) {
    if (headers != null) {
      headers.forEach(getRequest::addHeader);
    } else {
      LOGGER.debug(
          String.format(
              "No encryption headers present for chunk index [%s] and statement [%s]",
              chunkIndex, statementId));
    }
  }

  String getErrorMessage() {
    return this.errorMessage;
  }

  void downloadData(IDatabricksHttpClient httpClient, CompressionCodec compressionCodec)
      throws DatabricksParsingException, IOException {
    // Inject error if enabled for testing
    if (injectError && errorInjectionCount < errorInjectionCountMaxValue) {
      errorInjectionCount++;
      setStatus(ChunkStatus.DOWNLOAD_FAILED);
      throw new DatabricksParsingException(
          "Injected connection reset", new SocketException("Connection reset"));
    }
    CloseableHttpResponse response = null;
    try {
      URIBuilder uriBuilder = new URIBuilder(chunkLink.getExternalLink());
      HttpGet getRequest = new HttpGet(uriBuilder.build());
      addHeaders(getRequest, chunkLink.getHttpHeaders());
      // Retry would be done in http client, we should not bother about that here
      response = httpClient.execute(getRequest);
      checkHTTPError(response);
      String context =
          String.format(
              "Data decompression for chunk index [%d] and statement [%s]",
              this.chunkIndex, this.statementId);
      InputStream uncompressedStream =
          DecompressionUtil.decompress(
              response.getEntity().getContent(), compressionCodec, context);
      initializeData(uncompressedStream);
      setStatus(ChunkStatus.DOWNLOAD_SUCCEEDED);
    } catch (IOException | DatabricksSQLException | URISyntaxException e) {
      handleFailure(e, ChunkStatus.DOWNLOAD_FAILED);
    } finally {
      if (response != null) {
        response.close();
      }
    }
  }

  /**
   * Decompresses the given {@link InputStream} and initializes {@link #recordBatchList} from the
   * decompressed stream.
   *
   * @param inputStream the input stream to decompress
   * @throws DatabricksSQLException if decompression fails
   * @throws IOException if reading from the stream fails
   */
  void initializeData(InputStream inputStream) throws DatabricksSQLException, IOException {
    LOGGER.debug(
        String.format(
            "Parsing data for chunk index [%s] and statement [%s]",
            this.chunkIndex, this.statementId));
    this.recordBatchList =
        getRecordBatchList(inputStream, this.rootAllocator, this.statementId, this.chunkIndex);
    LOGGER.debug(
        String.format(
            "Data parsed for chunk index [%s] and statement [%s]",
            this.chunkIndex, this.statementId));
    this.isDataInitialized = true;
  }

  void handleFailure(Exception exception, ChunkStatus failedStatus)
      throws DatabricksParsingException {
    this.errorMessage =
        String.format(
            "Data parsing failed for chunk index [%d] and statement [%s]. Exception [%s]",
            this.chunkIndex, this.statementId, exception);
    LOGGER.error(this.errorMessage);
    setStatus(failedStatus);
    throw new DatabricksParsingException(this.errorMessage, exception);
  }

  /**
   * Releases chunk from memory.
   *
   * @return true if chunk is released, false if it was already released
   */
  synchronized boolean releaseChunk() {
    if (status == ChunkStatus.CHUNK_RELEASED) {
      return false;
    }
    if (isDataInitialized) {
      logAllocatorStats("BeforeRelease");
      purgeArrowData(this.recordBatchList);
      rootAllocator.close();
    }
    setStatus(ChunkStatus.CHUNK_RELEASED);
    return true;
  }

  /** Returns number of recordBatches in the chunk. */
  int getRecordBatchCountInChunk() {
    return this.isDataInitialized ? this.recordBatchList.size() : 0;
  }

  ArrowResultChunkIterator getChunkIterator() {
    return new ArrowResultChunkIterator(this);
  }

  /** Returns the chunk download link */
  String getChunkUrl() {
    return chunkLink.getExternalLink();
  }

  /** Returns index for current chunk */
  Long getChunkIndex() {
    return this.chunkIndex;
  }

  private ValueVector getColumnVector(int recordBatchIndex, int columnIndex) {
    return this.recordBatchList.get(recordBatchIndex).get(columnIndex);
  }

  private static List<List<ValueVector>> getRecordBatchList(
      InputStream inputStream, BufferAllocator rootAllocator, String statementId, long chunkIndex)
      throws IOException {
    List<List<ValueVector>> recordBatchList = new ArrayList<>();
    try (ArrowStreamReader arrowStreamReader = new ArrowStreamReader(inputStream, rootAllocator)) {
      VectorSchemaRoot vectorSchemaRoot = arrowStreamReader.getVectorSchemaRoot();
      while (arrowStreamReader.loadNextBatch()) {
        recordBatchList.add(getVectorsFromSchemaRoot(vectorSchemaRoot, rootAllocator));
        vectorSchemaRoot.clear();
      }
    } catch (ClosedByInterruptException e) {
      // release resources if thread is interrupted when reading arrow data
      LOGGER.error(
          e,
          "Data parsing interrupted for chunk index [%s] and statement [%s]. Error [%s]",
          chunkIndex,
          statementId,
          e.getMessage());
      purgeArrowData(recordBatchList);
    } catch (IOException e) {
      LOGGER.error(
          "Error while reading arrow data, purging the local list and rethrowing the exception.");
      purgeArrowData(recordBatchList);
      throw e;
    }
    return recordBatchList;
  }

  private static List<ValueVector> getVectorsFromSchemaRoot(
      VectorSchemaRoot vectorSchemaRoot, BufferAllocator rootAllocator) {
    return vectorSchemaRoot.getFieldVectors().stream()
        .map(
            fieldVector -> {
              TransferPair transferPair = fieldVector.getTransferPair(rootAllocator);
              transferPair.transfer();
              return transferPair.getTo();
            })
        .collect(Collectors.toList());
  }

  private static void purgeArrowData(List<List<ValueVector>> recordBatchList) {
    recordBatchList.forEach(vectors -> vectors.forEach(ValueVector::close));
    recordBatchList.clear();
  }

  private void logAllocatorStats(String event) {
    long allocatedMemory = rootAllocator.getAllocatedMemory();
    long peakMemory = rootAllocator.getPeakMemoryAllocation();
    long headRoom = rootAllocator.getHeadroom();
    long initReservation = rootAllocator.getInitReservation();
    String allocatorStatsLog =
        String.format(
            "Chunk allocator stats Log - Event: %s, Chunk Index: %s, Allocated Memory: %s, Peak Memory: %s, Headroom: %s, Init Reservation: %s",
            event, chunkIndex, allocatedMemory, peakMemory, headRoom, initReservation);
    LOGGER.debug(allocatorStatsLog);
  }

  public static class Builder {
    private long chunkIndex;
    private long numRows;
    private long rowOffset;
    private ExternalLink chunkLink;
    private String statementId;
    private Instant expiryTime;
    private ChunkStatus status;
    private InputStream inputStream;

    public Builder statementId(String statementId) {
      this.statementId = statementId;
      return this;
    }

    public Builder withChunkInfo(BaseChunkInfo baseChunkInfo) {
      this.chunkIndex = baseChunkInfo.getChunkIndex();
      this.numRows = baseChunkInfo.getRowCount();
      this.rowOffset = baseChunkInfo.getRowOffset();
      this.status = ChunkStatus.PENDING;
      return this;
    }

    public Builder withInputStream(InputStream stream, long rowCount) {
      this.numRows = rowCount;
      this.inputStream = stream;
      this.status = ChunkStatus.PENDING;
      return this;
    }

    public Builder withThriftChunkInfo(long chunkIndex, TSparkArrowResultLink chunkInfo) {
      this.chunkIndex = chunkIndex;
      this.numRows = chunkInfo.getRowCount();
      this.rowOffset = chunkInfo.getStartRowOffset();
      this.expiryTime = Instant.ofEpochMilli(chunkInfo.getExpiryTime());
      this.status = ChunkStatus.URL_FETCHED; // URL has always been fetched in case of thrift
      this.chunkLink = createExternalLink(chunkInfo, chunkIndex);
      return this;
    }

    public ArrowResultChunk build() throws DatabricksParsingException {
      return new ArrowResultChunk(this);
    }
  }

  /** Method to enable error injection for testing */
  public static void enableErrorInjection() {
    injectError = true;
  }

  /** Method to disable error injection after testing */
  public static void disableErrorInjection() {
    injectError = false;
  }

  public static void setErrorInjectionCountMaxValue(int errorInjectionCountMaxValue) {
    ArrowResultChunk.errorInjectionCountMaxValue = errorInjectionCountMaxValue;
  }
}



