/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.prestosql.plugin.hive.s3select;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.CompressionType;
import com.amazonaws.services.s3.model.SelectObjectContentRequest;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.io.Closer;
import io.airlift.units.Duration;
import io.prestosql.plugin.hive.s3.HiveS3Config;
import io.prestosql.spi.PrestoException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.LineReader;

import javax.annotation.concurrent.ThreadSafe;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Properties;

import static com.google.common.base.Throwables.throwIfInstanceOf;
import static com.google.common.base.Throwables.throwIfUnchecked;
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_BACKOFF_TIME;
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_CLIENT_RETRIES;
import static io.prestosql.plugin.hive.s3.PrestoS3FileSystem.S3_MAX_RETRY_TIME;
import static io.prestosql.plugin.hive.util.RetryDriver.retry;
import static io.prestosql.spi.StandardErrorCode.NOT_SUPPORTED;
import static java.lang.String.format;
import static java.net.HttpURLConnection.HTTP_BAD_REQUEST;
import static java.net.HttpURLConnection.HTTP_FORBIDDEN;
import static java.net.HttpURLConnection.HTTP_NOT_FOUND;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.hadoop.hive.serde.serdeConstants.FIELD_DELIM;
import static org.apache.hadoop.hive.serde.serdeConstants.LINE_DELIM;
import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT;
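
/**
 * Line-oriented {@link RecordReader} that pushes a query down to Amazon S3 Select
 * and streams the matching records back as delimited lines. Failed reads are
 * retried with exponential backoff, and records already delivered before a retry
 * are skipped when the response stream is replayed.
 */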
@ThreadSafe
public abstract class S3SelectLineRecordReader
        implements RecordReader<LongWritable, Text>
{
    private InputStream selectObjectContent;
    private long processedRecords;
    private long recordsFromS3;
    private long position;
    private LineReader reader;
    private boolean isFirstLine;

    private static final Duration BACKOFF_MIN_SLEEP = new Duration(1, SECONDS);
    private final PrestoS3SelectClient selectClient;
    private final long start;
    private final long end;
    private final int maxAttempts;
    private final Duration maxBackoffTime;
    private final Duration maxRetryTime;
    private final Closer closer = Closer.create();
    private final SelectObjectContentRequest selectObjectContentRequest;
    protected final CompressionCodecFactory compressionCodecFactory;
    protected final String lineDelimiter;

    S3SelectLineRecordReader(
            Configuration configuration,
            Path path,
            long start,
            long length,
            Properties schema,
            String ionSqlQuery,
            PrestoS3ClientFactory s3ClientFactory)
    {
        requireNonNull(configuration, "configuration is null");
        requireNonNull(schema, "schema is null");
        requireNonNull(path, "path is null");
        requireNonNull(ionSqlQuery, "ionSqlQuery is null");
        requireNonNull(s3ClientFactory, "s3ClientFactory is null");
        this.lineDelimiter = schema.getProperty(LINE_DELIM, "\n");
        this.processedRecords = 0;
        this.recordsFromS3 = 0;
        this.start = start;
        this.position = this.start;
        this.end = this.start + length;
        this.isFirstLine = true;
        this.compressionCodecFactory = new CompressionCodecFactory(configuration);
        this.selectObjectContentRequest = buildSelectObjectRequest(schema, ionSqlQuery, path);

        HiveS3Config defaults = new HiveS3Config();
        // S3_MAX_CLIENT_RETRIES configures the retry count, so total attempts is retries + 1
        this.maxAttempts = configuration.getInt(S3_MAX_CLIENT_RETRIES, defaults.getS3MaxClientRetries()) + 1;
        this.maxBackoffTime = Duration.valueOf(configuration.get(S3_MAX_BACKOFF_TIME, defaults.getS3MaxBackoffTime().toString()));
        this.maxRetryTime = Duration.valueOf(configuration.get(S3_MAX_RETRY_TIME, defaults.getS3MaxRetryTime().toString()));
        this.selectClient = new PrestoS3SelectClient(configuration, s3ClientFactory);
        closer.register(selectClient);
    }
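
    /**
     * Builds the S3 Select request for the given query and object path;
     * format-specific subclasses supply the input and output serialization.
     */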
    public abstract SelectObjectContentRequest buildSelectObjectRequest(Properties schema, String query, Path path);
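
    /**
     * Maps the Hadoop compression codec for the path to the corresponding
     * S3 Select compression type; only gzip and bzip2 are supported.
     */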
    protected CompressionType getCompressionType(Path path)
    {
        CompressionCodec codec = compressionCodecFactory.getCodec(path);
        if (codec == null) {
            return CompressionType.NONE;
        }
        if (codec instanceof GzipCodec) {
            return CompressionType.GZIP;
        }
        if (codec instanceof BZip2Codec) {
            return CompressionType.BZIP2;
        }
        throw new PrestoException(NOT_SUPPORTED, "Compression extension not supported for S3 Select: " + path);
    }
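
    /**
     * Reads the next line from the S3 Select response stream, opening the stream
     * on first use. Transient failures are retried with exponential backoff by
     * reissuing the request, while 400/403/404 responses abort the retry loop.
     */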
    private int readLine(Text value)
            throws IOException
    {
        try {
            return retry()
                    .maxAttempts(maxAttempts)
                    .exponentialBackoff(BACKOFF_MIN_SLEEP, maxBackoffTime, maxRetryTime, 2.0)
                    .stopOn(InterruptedException.class, UnrecoverableS3OperationException.class)
                    .run("readRecordsContentStream", () -> {
                        // Open the response stream on first use, or reopen it after a failed attempt
                        if (isFirstLine) {
                            recordsFromS3 = 0;
                            selectObjectContent = selectClient.getRecordsContent(selectObjectContentRequest);
                            closer.register(selectObjectContent);
                            reader = new LineReader(selectObjectContent, lineDelimiter.getBytes(StandardCharsets.UTF_8));
                            closer.register(reader);
                            isFirstLine = false;
                        }
                        try {
                            return reader.readLine(value);
                        }
                        catch (RuntimeException e) {
                            // Force the next attempt to reissue the request from the beginning
                            isFirstLine = true;
                            recordsFromS3 = 0;
                            if (e instanceof AmazonS3Exception) {
                                switch (((AmazonS3Exception) e).getStatusCode()) {
                                    case HTTP_FORBIDDEN:
                                    case HTTP_NOT_FOUND:
                                    case HTTP_BAD_REQUEST:
                                        // These status codes will not succeed on retry
                                        throw new UnrecoverableS3OperationException(selectClient.getBucketName(), selectClient.getKeyName(), e);
                                }
                            }
                            throw e;
                        }
                    });
        }
        catch (Exception e) {
            throwIfInstanceOf(e, IOException.class);
            throwIfUnchecked(e);
            throw new RuntimeException(e);
        }
    }
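
    /**
     * Emits the next record, skipping any records already counted before a retry
     * replayed the response stream from the beginning.
     */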
    @Override
    public synchronized boolean next(LongWritable key, Text value)
            throws IOException
    {
        while (true) {
            int bytes = readLine(value);
            if (bytes <= 0) {
                if (!selectClient.isRequestComplete()) {
                    throw new IOException("S3 Select request was incomplete as End Event was not received");
                }
                return false;
            }
            recordsFromS3++;
            // After a retry the stream is replayed from the start; only emit records not yet processed
            if (recordsFromS3 > processedRecords) {
                position += bytes;
                processedRecords++;
                key.set(processedRecords);
                return true;
            }
        }
    }

    @Override
    public LongWritable createKey()
    {
        return new LongWritable();
    }

    @Override
    public Text createValue()
    {
        return new Text();
    }

    @Override
    public long getPos()
    {
        return position;
    }

    @Override
    public void close()
            throws IOException
    {
        closer.close();
    }

    @Override
    public float getProgress()
    {
        return ((float) (position - start)) / (end - start);
    }
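
    /**
     * Returns the field delimiter from the table schema, falling back to the
     * serialization format property when no explicit delimiter is set.
     */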
    String getFieldDelimiter(Properties schema)
    {
        return schema.getProperty(FIELD_DELIM, schema.getProperty(SERIALIZATION_FORMAT));
    }

    /**
     * This exception is for stopping retries for S3 Select calls that shouldn't be retried.
     * For example, "Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden (Service: Amazon S3; Status Code: 403 ..."
     */
    @VisibleForTesting
    static class UnrecoverableS3OperationException
            extends RuntimeException
    {
        public UnrecoverableS3OperationException(String bucket, String key, Throwable cause)
        {
            // append bucket and key to the message
            super(format("%s (Bucket: %s, Key: %s)", cause, bucket, key));
        }
    }
}