// org.apache.gobblin.compaction.hive.AvroExternalTable (source listing: Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.compaction.hive;
import java.io.IOException;
import java.io.InputStream;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.gobblin.util.HiveJdbcConnector;
/**
 * A class for managing Hive external tables created on Avro files.
 *
 * <p>An instance is created through {@link Builder}. The data location is either used as given or,
 * when {@link Builder#withMoveDataToTmpHdfsDir(String)} was called, replaced by a freshly generated
 * directory into which the selected files are moved. The table attributes are read from the schema
 * file when a schema location is supplied; otherwise the schema is taken from the first Avro data
 * file in the data location and written out to a generated {@code .avsc} file.
 *
 * <p>Locations generated by this class are flagged so that {@link #deleteTmpFilesIfNeeded()} removes
 * them; locations supplied by the caller are left alone.
 */
public class AvroExternalTable extends HiveTable {

  private static final Logger LOG = LoggerFactory.getLogger(AvroExternalTable.class);

  /** Job property naming the directory that receives generated schema files. */
  private static final String HIVE_TMPSCHEMA_DIR = "hive.tmpschema.dir";

  /** Job property naming the parent directory that receives moved data files. */
  private static final String HIVE_TMPDATA_DIR = "hive.tmpdata.dir";

  /** Value used for {@link #HIVE_TMPDATA_DIR} when the job property is not set. */
  private static final String HIVE_TMPDATA_DIR_DEFAULT = "/";

  /**
   * Statement template: {@code %1$s} is the table name, {@code %2$s} the data location and
   * {@code %3$s} the schema URL.
   */
  private static final String CREATE_TABLE_STMT =
      "CREATE EXTERNAL TABLE %1$s " + " ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'" + " STORED AS"
          + " INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'"
          + " OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'" + " LOCATION '%2$s'"
          + " TBLPROPERTIES ('avro.schema.url'='%3$s')";

  private final String dataLocationInHdfs;
  private final String schemaLocationInHdfs;

  /** True when the schema file was generated by this instance and should be removed afterwards. */
  private final boolean deleteSchemaAfterDone;

  /** True when the data was moved to a generated directory and should be removed afterwards. */
  private final boolean deleteDataAfterDone;

  /**
   * Builder for {@link AvroExternalTable}.
   *
   * <p>NOTE(review): if {@code HiveTable.Builder} declares a self-type parameter, confirm that it is
   * supplied here so that inherited setters chain back to this builder type.
   */
  public static class Builder extends HiveTable.Builder {
    private String dataLocationInHdfs = "";
    private String schemaLocationInHdfs = "";
    private boolean moveDataToTmpHdfsDir = false;
    private String extensionToBeMoved;

    /**
     * Sets the location of the Avro data.
     *
     * @param dataLocationInHdfs the data location
     * @return this builder
     */
    public Builder withDataLocation(String dataLocationInHdfs) {
      this.dataLocationInHdfs = dataLocationInHdfs;
      return this;
    }

    /**
     * Sets the location of the Avro schema file. When left blank, the schema is extracted from the
     * data files instead.
     *
     * @param schemaLocationInHdfs the schema file location
     * @return this builder
     */
    public Builder withSchemaLocation(String schemaLocationInHdfs) {
      this.schemaLocationInHdfs = schemaLocationInHdfs;
      return this;
    }

    /**
     * Requests that data files be moved to a generated directory when the table is built.
     *
     * @param extensionToBeMoved the extension passed on to select the files to move
     * @return this builder
     */
    public Builder withMoveDataToTmpHdfsDir(String extensionToBeMoved) {
      this.moveDataToTmpHdfsDir = true;
      this.extensionToBeMoved = extensionToBeMoved;
      return this;
    }

    /**
     * Creates the table object. This may move data files and read or write a schema file.
     *
     * @return the new {@link AvroExternalTable}
     * @throws IOException if moving data or reading/writing the schema fails
     */
    public AvroExternalTable build() throws IOException {
      return new AvroExternalTable(this);
    }
  }

  private AvroExternalTable(AvroExternalTable.Builder builder) throws IOException {
    super(builder);

    // Resolve the data location first: the schema fallback below reads from it.
    if (builder.moveDataToTmpHdfsDir) {
      this.dataLocationInHdfs = moveDataFileToSeparateHdfsDir(builder.dataLocationInHdfs, builder.extensionToBeMoved);
      this.deleteDataAfterDone = true;
    } else {
      this.dataLocationInHdfs = builder.dataLocationInHdfs;
      this.deleteDataAfterDone = false;
    }

    if (StringUtils.isNotBlank(builder.schemaLocationInHdfs)) {
      // Caller-provided schema file: use it as is and never delete it.
      this.schemaLocationInHdfs = builder.schemaLocationInHdfs;
      this.attributes = getAttributesFromAvroSchemaFile();
      this.deleteSchemaAfterDone = false;
    } else {
      // No schema file given: derive the schema from the data and persist it to a generated file.
      Schema schema = getSchemaFromAvroDataFile();
      this.attributes = parseSchema(schema);
      this.schemaLocationInHdfs = writeSchemaToHdfs(schema);
      this.deleteSchemaAfterDone = true;
    }
  }

  /**
   * Parses the schema file at {@link #schemaLocationInHdfs} and converts its fields to attributes.
   *
   * @return one attribute per schema field, in schema order
   * @throws IOException if the schema file cannot be read
   */
  private List<HiveAttribute> getAttributesFromAvroSchemaFile() throws IOException {
    try (InputStream schemaInputStream = new HdfsReader(this.schemaLocationInHdfs).getInputStream()) {
      Schema schema = new Schema.Parser().parse(schemaInputStream);
      return parseSchema(schema);
    }
  }

  /**
   * Reads the schema embedded in the first data file found in {@link #dataLocationInHdfs}.
   *
   * @return the schema stored in that Avro data file
   * @throws IOException if the file cannot be opened or is not a readable Avro data file
   */
  private Schema getSchemaFromAvroDataFile() throws IOException {
    String firstDataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
    LOG.info("Extracting schema for table {} from avro data file {}", this.name, firstDataFilePath);

    // The input is declared as its own resource so that it is closed even when the
    // DataFileReader constructor throws before taking ownership of it.
    try (SeekableInput sin = new HdfsReader(firstDataFilePath).getFsInput();
        DataFileReader<Object> dfr = new DataFileReader<>(sin, new GenericDatumReader<Object>())) {
      return dfr.getSchema();
    }
  }

  /**
   * Writes the schema, pretty-printed, to a generated {@code .avsc} file.
   *
   * <p>The target directory is the {@link #HIVE_TMPSCHEMA_DIR} job property, defaulting to the
   * parent of the data location.
   *
   * @param schema the schema to write
   * @return the path of the generated schema file
   * @throws IOException if writing fails
   */
  private String writeSchemaToHdfs(Schema schema) throws IOException {
    String defaultTmpSchemaDir = getParentDir(this.dataLocationInHdfs);
    String tmpSchemaDir = addSlash(CompactionRunner.jobProperties.getProperty(HIVE_TMPSCHEMA_DIR, defaultTmpSchemaDir));
    String tmpSchemaPath = tmpSchemaDir + UUID.randomUUID().toString() + ".avsc";

    HdfsWriter writer = new HdfsWriter(tmpSchemaPath);
    LOG.info("writing schema to HDFS location {}", tmpSchemaPath);
    writer.write(schema.toString(true));
    return tmpSchemaPath;
  }

  /**
   * Returns the parent directory of the given path.
   *
   * @param filePathInHdfs the path whose parent is wanted
   * @return the parent path, or the path itself when it has no parent (root)
   */
  private static String getParentDir(String filePathInHdfs) {
    Path path = new Path(filePathInHdfs);
    Path parent = path.getParent();
    // getParent() yields null at the root; fall back to the path itself instead of failing with an NPE.
    return (parent != null ? parent : path).toString();
  }

  /**
   * Converts every field of the schema to a Hive attribute.
   *
   * @param schema the Avro schema whose fields are converted
   * @return one attribute per field, in schema order
   */
  private static List<HiveAttribute> parseSchema(Schema schema) {
    List<Schema.Field> fields = schema.getFields();
    List<HiveAttribute> attributes = new ArrayList<>(fields.size());
    for (Schema.Field field : fields) {
      attributes.add(convertAvroSchemaFieldToHiveAttribute(field));
    }
    return attributes;
  }

  /**
   * Converts a single Avro field to a Hive attribute. For a union field the first non-null member
   * type is used.
   *
   * @param field the Avro field
   * @return the attribute carrying the field name and the mapped type
   * @throws RuntimeException if the Avro type has no mapping
   */
  private static HiveAttribute convertAvroSchemaFieldToHiveAttribute(Schema.Field field) {
    Schema.Type fieldType = field.schema().getType();
    String avroFieldType = fieldType == Schema.Type.UNION ? extractAvroTypeFromUnion(field) : fieldType.toString();

    if (HiveAttribute.fromAvroType(avroFieldType) == null) {
      throw new RuntimeException("Hive does not support attribute type '" + avroFieldType + "'");
    }
    return new HiveAttribute(field.name(), HiveAttribute.fromAvroType(avroFieldType));
  }

  /**
   * Picks the first non-null member type of a union field. A warning is logged when the union has
   * three or more members.
   *
   * @param field a field whose schema is a union
   * @return the name of the first non-null member type
   * @throws RuntimeException if every member of the union is the null type
   */
  private static String extractAvroTypeFromUnion(Schema.Field field) {
    List<Schema> memberTypes = field.schema().getTypes();
    if (memberTypes.size() >= 3) {
      LOG.warn("Avro schema field {} has 3 or more types: using the first non-null type", field.name());
    }

    for (Schema memberType : memberTypes) {
      if (memberType.getType() != Schema.Type.NULL) {
        return memberType.getType().toString();
      }
    }

    String message =
        "Avro schema field " + field.name() + " is a union, but it does not contain a non-null field type.";
    LOG.error(message);
    throw new RuntimeException(message);
  }

  /**
   * @return the data location backing this table (the generated directory when data was moved)
   */
  public String getDataLocationInHdfs() {
    return this.dataLocationInHdfs;
  }

  /**
   * @return the schema file location (the generated file when no schema location was supplied)
   */
  public String getSchemaLocationInHdfs() {
    return this.schemaLocationInHdfs;
  }

  /**
   * Executes a drop statement followed by a create-external-table statement for this table, using
   * the table name combined with the job id.
   *
   * @param conn the connector used to execute the statements
   * @param jobID the job id used to derive the table name
   * @throws SQLException if executing the statements fails
   */
  @Override
  public void createTable(HiveJdbcConnector conn, String jobID) throws SQLException {
    String tableName = getNameWithJobId(jobID);
    String hdfsUri = HdfsIO.getHdfsUri();

    String dropTableStmt = String.format(DROP_TABLE_STMT, tableName);
    String createTableStmt = String.format(CREATE_TABLE_STMT, tableName, hdfsUri + this.dataLocationInHdfs,
        hdfsUri + this.schemaLocationInHdfs);

    conn.executeStatements(dropTableStmt, createTableStmt);
  }

  /**
   * Returns this table when {@code table} brings no new column; otherwise delegates to a managed
   * table built from this table's name, primary keys and attributes.
   *
   * @param conn not forwarded by this implementation
   * @param table the table whose columns are compared against this one
   * @param jobId the job id passed on to the managed table
   * @return this table, or the result of the managed table's {@code addNewColumnsInSchema}
   * @throws SQLException if the delegate fails
   */
  @Override
  public HiveTable addNewColumnsInSchema(HiveJdbcConnector conn, HiveTable table, String jobId) throws SQLException {
    if (hasNoNewColumn(table)) {
      return this;
    }
    HiveManagedTable managedTable = new HiveManagedTable.Builder().withName(this.name).withPrimaryKeys(this.primaryKeys)
        .withAttributes(this.attributes).build();
    // NOTE(review): null is passed instead of conn; presumably the managed table does not need the
    // connection for this call. Confirm against HiveManagedTable.
    return managedTable.addNewColumnsInSchema(null, table, jobId);
  }

  /**
   * Deletes the schema file and/or data location, but only those that this instance generated.
   *
   * @throws IllegalArgumentException propagated from the underlying writer
   * @throws IOException if a delete fails
   */
  protected void deleteTmpFilesIfNeeded() throws IllegalArgumentException, IOException {
    if (this.deleteSchemaAfterDone) {
      new HdfsWriter(this.schemaLocationInHdfs).delete();
    }
    if (this.deleteDataAfterDone) {
      new HdfsWriter(this.dataLocationInHdfs).delete();
    }
  }

  /**
   * Moves the selected files from {@code sourceDir} into a generated directory under the
   * {@link #HIVE_TMPDATA_DIR} job property (default {@link #HIVE_TMPDATA_DIR_DEFAULT}).
   *
   * @param sourceDir the directory holding the files
   * @param extension the extension passed on to select the files to move
   * @return the generated destination directory
   * @throws IOException if the move fails
   */
  private String moveDataFileToSeparateHdfsDir(String sourceDir, String extension) throws IOException {
    String parentDir = addSlash(CompactionRunner.jobProperties.getProperty(HIVE_TMPDATA_DIR, HIVE_TMPDATA_DIR_DEFAULT));
    String destination = parentDir + UUID.randomUUID().toString();

    LOG.info("Moving data file of table {} to {}", this.getName(), destination);
    HdfsWriter.moveSelectFiles(extension, sourceDir, destination);
    LOG.info("Moved data file of table {} to {}", this.getName(), destination);
    return destination;
  }

  /**
   * Appends a trailing {@code /} unless the directory already ends with a slash or a backslash.
   *
   * @param dir the directory string
   * @return the directory string ending with a separator
   */
  private static String addSlash(String dir) {
    if (!dir.endsWith("/") && !dir.endsWith("\\")) {
      return dir + "/";
    }
    return dir;
  }

  /**
   * Tells whether both tables declare the same primary keys, regardless of order.
   *
   * @param other the table to compare with
   * @return true when each table's primary keys contain all of the other's
   */
  public boolean hasSamePrimaryKey(AvroExternalTable other) {
    return this.primaryKeys.containsAll(other.primaryKeys) && other.primaryKeys.containsAll(this.primaryKeys);
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy