io.trino.plugin.hive.s3select.S3SelectCsvRecordReader Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive.s3select;
import com.amazonaws.services.s3.model.CSVInput;
import com.amazonaws.services.s3.model.CSVOutput;
import com.amazonaws.services.s3.model.ExpressionType;
import com.amazonaws.services.s3.model.InputSerialization;
import com.amazonaws.services.s3.model.OutputSerialization;
import com.amazonaws.services.s3.model.SelectObjectContentRequest;
import io.trino.plugin.hive.s3.TrinoS3FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import java.net.URI;
import java.util.Properties;
import static org.apache.hadoop.hive.serde.serdeConstants.ESCAPE_CHAR;
import static org.apache.hadoop.hive.serde.serdeConstants.QUOTE_CHAR;
class S3SelectCsvRecordReader
extends S3SelectLineRecordReader
{
/*
* Sentinel unicode comment character (http://www.unicode.org/faq/private_use.html#nonchar_codes).
* It is expected that \uFDD0 sentinel comment character is not the first character in any row of user's CSV S3 object.
* The rows starting with \uFDD0 will be skipped by S3Select and will not be a part of the result set or aggregations.
* To process CSV objects that may contain \uFDD0 as first row character please disable S3SelectPushdown.
* TODO: Remove this proxy logic when S3Select API supports disabling of row level comments.
*/
private static final String COMMENTS_CHAR_STR = "\uFDD0";
public S3SelectCsvRecordReader(
Configuration configuration,
Path path,
long start,
long length,
Properties schema,
String ionSqlQuery,
TrinoS3ClientFactory s3ClientFactory)
{
super(configuration, path, start, length, schema, ionSqlQuery, s3ClientFactory);
}
@Override
public SelectObjectContentRequest buildSelectObjectRequest(Properties schema, String query, Path path)
{
SelectObjectContentRequest selectObjectRequest = new SelectObjectContentRequest();
URI uri = path.toUri();
selectObjectRequest.setBucketName(TrinoS3FileSystem.extractBucketName(uri));
selectObjectRequest.setKey(TrinoS3FileSystem.keyFromPath(path));
selectObjectRequest.setExpression(query);
selectObjectRequest.setExpressionType(ExpressionType.SQL);
String fieldDelimiter = getFieldDelimiter(schema);
String quoteChar = schema.getProperty(QUOTE_CHAR, null);
String escapeChar = schema.getProperty(ESCAPE_CHAR, null);
CSVInput selectObjectCSVInputSerialization = new CSVInput();
selectObjectCSVInputSerialization.setRecordDelimiter(lineDelimiter);
selectObjectCSVInputSerialization.setFieldDelimiter(fieldDelimiter);
selectObjectCSVInputSerialization.setComments(COMMENTS_CHAR_STR);
selectObjectCSVInputSerialization.setQuoteCharacter(quoteChar);
selectObjectCSVInputSerialization.setQuoteEscapeCharacter(escapeChar);
InputSerialization selectObjectInputSerialization = new InputSerialization();
selectObjectInputSerialization.setCompressionType(getCompressionType(path));
selectObjectInputSerialization.setCsv(selectObjectCSVInputSerialization);
selectObjectRequest.setInputSerialization(selectObjectInputSerialization);
OutputSerialization selectObjectOutputSerialization = new OutputSerialization();
CSVOutput selectObjectCSVOutputSerialization = new CSVOutput();
selectObjectCSVOutputSerialization.setRecordDelimiter(lineDelimiter);
selectObjectCSVOutputSerialization.setFieldDelimiter(fieldDelimiter);
selectObjectCSVOutputSerialization.setQuoteCharacter(quoteChar);
selectObjectCSVOutputSerialization.setQuoteEscapeCharacter(escapeChar);
selectObjectOutputSerialization.setCsv(selectObjectCSVOutputSerialization);
selectObjectRequest.setOutputSerialization(selectObjectOutputSerialization);
return selectObjectRequest;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy