// com.bazaarvoice.emodb.hive.schema.RefreshSchema (Maven / Gradle / Ivy)
package com.bazaarvoice.emodb.hive.schema;
import com.bazaarvoice.emodb.hadoop.io.CloseableDataStore;
import com.bazaarvoice.emodb.hadoop.io.HadoopDataStoreManager;
import com.bazaarvoice.emodb.hadoop.io.LocationType;
import com.bazaarvoice.emodb.hadoop.io.LocationUtil;
import com.bazaarvoice.emodb.sor.api.DataStore;
import com.bazaarvoice.emodb.sor.api.Table;
import com.bazaarvoice.emodb.sor.client.DataStoreStreaming;
import com.codahale.metrics.MetricRegistry;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableMap;
import net.sourceforge.argparse4j.inf.ArgumentGroup;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.Namespace;
import javax.annotation.Nullable;
import javax.ws.rs.core.UriBuilder;
import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.Map;
import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.String.format;
import static net.sourceforge.argparse4j.impl.Arguments.storeTrue;
/**
* Updates all tables in Hive to match the current set of tables in EmoDB. Note that this will not delete Hive tables
* that were created in the past if the EmoDB table has been deleted.
*/
public class RefreshSchema extends HiveScriptGenerator {
/**
 * Command to create the schema. Format arguments: schema name, escaped location.
 * Terminated with a semicolon, like the table commands below, so the statements that follow it in the
 * generated script are parsed as separate statements.
 */
private final static String CREATE_SCHEMA_COMMAND =
        "CREATE SCHEMA IF NOT EXISTS %s COMMENT 'location %s';";

/**
 * Field name and type for generated tables when the EmoDB SerDe is used.
 * Columns are emitted in the order in which the entries of this map are iterated.
 */
private final static Map<String, String> EMO_FIELDS = ImmutableMap.<String, String>builder()
        .put("json", "STRING")
        .put("id", "STRING")
        .put("table", "STRING")
        .put("version", "INT")
        .put("signature", "STRING")
        .put("first_update_at", "TIMESTAMP")
        .put("last_update_at", "TIMESTAMP")
        .build();

/**
 * Field name and type for generated tables when no custom SerDe is used: a single JSON column.
 */
private final static Map<String, String> DEFAULT_FIELD = ImmutableMap.<String, String>builder()
        .put("json", "STRING")
        .build();

/**
 * Command to create a single table.
 */
private final static String CREATE_TABLE_COMMAND =
        "CREATE EXTERNAL TABLE%s %s.%s " +    // if_not_exists, schema, table_name
        "(%s) " +                             // fields
        "COMMENT '%s' " +                     // comment
        "%s " +                               // row_format_and_storage
        "LOCATION '%s';";                     // location

/** Optional clause inserted directly after "CREATE EXTERNAL TABLE"; note the leading space. */
private final static String IF_NOT_EXISTS = " IF NOT EXISTS";

/** Row format and storage clause used with the "hive" compatibility mode. */
private final static String EMO_FORMAT_AND_STORAGE =
        "ROW FORMAT SERDE 'com.bazaarvoice.emodb.hive.EmoSerDe' " +
        "STORED AS INPUTFORMAT 'com.bazaarvoice.emodb.hadoop.mapred.EmoInputFormat' OUTPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileOutputFormat'";

/** Row format clause used with the "general" compatibility mode. */
private final static String DEFAULT_FORMAT_AND_STORAGE =
        "ROW FORMAT DELIMITED";

/**
 * Command to drop a single table. Format arguments: schema name, table name.
 */
private final static String DROP_TABLE_COMMAND =
        "DROP TABLE IF EXISTS %s.%s;";

/** Metric registry handed to the data store created for script generation. */
private static final MetricRegistry METRIC_REGISTRY = new MetricRegistry();
/**
 * Command-line entry point: creates a generator instance and hands the raw arguments to
 * {@code doMain}.
 *
 * @param args the command-line arguments
 * @throws Exception if {@code doMain} fails
 */
public static void main(String[] args) throws Exception {
    new RefreshSchema().doMain(args);
}
/**
 * Registers the command-line options understood by this generator.
 * The registration order is kept as-is because it determines the order of the generated help text.
 *
 * @param parser the parser to which the options are added
 */
@Override
protected void addArguments(ArgumentParser parser) {
    // Target schemas; generateScript() requires at least one of the two to be provided.
    ArgumentGroup schemas = parser.addArgumentGroup("schemas");
    schemas.description("Schemas generated by this script");

    schemas.addArgument("--emoSchema")
            .dest("emoSchema")
            .metavar("NAME")
            .nargs("?")
            .help("Name of the schema where \"emodb\" located tables are generated");

    schemas.addArgument("--stashSchema")
            .dest("stashSchema")
            .metavar("NAME")
            .nargs("?")
            .help("Name of the schema where \"emostash\" located tables are generated");

    // Connection settings, both mandatory.
    parser.addArgument("--location")
            .dest("location")
            .metavar("EMOURL")
            .nargs("?")
            .required(true)
            .help("URI location of the EmoDB server, such as \"emodb://ci.us\"");

    parser.addArgument("--apiKey")
            .dest("apiKey")
            .metavar("KEY")
            .nargs("?")
            .required(true)
            .help("API key for connecting to EmoDB");

    // Selects the column layout and row format used for every generated table.
    parser.addArgument("--compatibility")
            .dest("compatibility")
            .nargs("?")
            .choices("hive", "general")
            .setDefault("hive")
            .help("hive = Use custom SerDe and InputFormat to provide efficiency and rich column names (default). " +
                    "general = No custom SerDe or InputFormat so is more compatible with Hive alternatives like " +
                    "Presto and Impala but is slightly less efficient and only provides a single JSON column.");

    // Boolean switches, off unless present on the command line.
    parser.addArgument("--disableIfNotExists")
            .dest("disableIfNotExists")
            .action(storeTrue())
            .setDefault(false)
            .help("Disables \"IF NOT EXISTS\" from being included in each \"CREATE TABLE\" command");

    parser.addArgument("--recreateTables")
            .dest("recreateTables")
            .action(storeTrue())
            .setDefault(false)
            .help("Drops each table if it exists prior to the \"CREATE TABLE\" command");
}
/**
 * Validates the parsed command-line options and writes the Hive script to {@code out}: one
 * "CREATE SCHEMA" statement per requested schema followed by the table statements for every table
 * listed by the data store.
 * <p>
 * Every failure path terminates the JVM with exit status -1 after reporting the problem on
 * {@code System.err}: neither schema supplied, an invalid or stash location, a stash schema requested
 * for a fixed host, or an {@link IOException} raised while the data store is in use or being closed.
 *
 * @param namespace the parsed command-line arguments
 * @param out       the stream that receives the generated script
 */
@Override
protected void generateScript(Namespace namespace, PrintStream out) {
    String emoSchema = namespace.getString("emoSchema");
    String stashSchema = namespace.getString("stashSchema");
    String locationArg = namespace.getString("location");
    String apiKey = namespace.getString("apiKey");
    // NOTE(review): "--compatibility" is declared with nargs("?"), so a bare flag presumably yields null
    // rather than the default; treat that as the default "hive" mode instead of failing with an NPE.
    String compatibility = namespace.getString("compatibility");
    boolean useEmoSerDe = compatibility == null || compatibility.equals("hive");
    boolean disableIfNotExists = namespace.getBoolean("disableIfNotExists");
    boolean recreateTables = namespace.getBoolean("recreateTables");

    if (emoSchema == null && stashSchema == null) {
        System.err.println("At least one of \"emoSchema\" or \"stashSchema\" is required.");
        System.exit(-1);
    }

    // Initialized to null because they are assigned inside the try block and read after it.
    URI emoLocation = null;
    URI stashLocation = null;

    // Validate the location
    try {
        emoLocation = URI.create(locationArg);
        LocationType locationType = LocationUtil.getLocationType(emoLocation);
        checkArgument(locationType != LocationType.STASH, "Cannot use stash location");

        if (locationType == LocationType.EMO_HOST_DISCOVERY) {
            // When using host discovery for a stash schema make sure it is supported by stash
            if (stashSchema != null) {
                stashLocation = UriBuilder.fromUri(emoLocation).scheme("emostash").build();
                // This will throw an exception if the stash location is invalid
                LocationUtil.getStashLocation(stashLocation);
            }
        } else {  // Fixed host
            // Fixed host cannot be used to generate stash
            checkArgument(stashSchema == null, "Cannot create an EmoStash schema from a fixed host.");
        }
    } catch (Exception e) {
        System.err.println(e.getMessage());
        System.exit(-1);
    }

    // Create the DataStore that will be used to get table information.
    try (CloseableDataStore dataStore = HadoopDataStoreManager.getInstance().getDataStore(emoLocation, apiKey, METRIC_REGISTRY)) {
        if (emoSchema != null) {
            createSchema(emoSchema, emoLocation, out);
        }
        if (stashSchema != null) {
            createSchema(stashSchema, stashLocation, out);
        }
        createTables(emoLocation, stashLocation, emoSchema, stashSchema, dataStore, useEmoSerDe, disableIfNotExists, recreateTables, out);
    } catch (IOException e) {
        System.err.println("Script generation failed");
        e.printStackTrace(System.err);
        // Report the failure through the exit status as well, consistent with the validation failures above.
        System.exit(-1);
    }
}
/**
 * Writes the statement that creates {@code schema}, recording the escaped {@code location} in the
 * schema comment, followed by an empty line.
 *
 * @param schema   name of the schema to create
 * @param location location recorded in the schema comment
 * @param out      the stream that receives the statement
 */
private void createSchema(String schema, URI location, PrintStream out) {
    String escapedLocation = escapeString(location.toString());
    String statement = format(CREATE_SCHEMA_COMMAND, schema, escapedLocation);
    out.println(statement);
    out.println();
}
private void createTables(URI emoLocation, @Nullable URI stashLocation, @Nullable String emoSchema, @Nullable String stashSchema,
DataStore dataStore, boolean useEmoSerDe, boolean disableIfNotExists, boolean recreateTables, PrintStream out) {
String ifNotExists = disableIfNotExists ? "" : IF_NOT_EXISTS;
String rowFormat = useEmoSerDe ? EMO_FORMAT_AND_STORAGE : DEFAULT_FORMAT_AND_STORAGE;
Map fieldMap = useEmoSerDe ? EMO_FIELDS : DEFAULT_FIELD;
String fields = Joiner.on(", ").join(
FluentIterable.from(fieldMap.entrySet())
.transform(new Function, String>() {
@Override
public String apply(Map.Entry entry) {
return format("%s %s", entry.getKey(), entry.getValue());
}
}));
UriBuilder protoEmoUriBuilder = null;
UriBuilder protoStashUriBuilder = null;
if (emoSchema != null) {
protoEmoUriBuilder = UriBuilder.fromUri(emoLocation).path(encode(emoLocation.getPath()));
}
if (stashSchema != null && stashLocation != null) {
protoStashUriBuilder = UriBuilder.fromUri(stashLocation).path(encode(stashLocation.getPath()));
}
for (Table table : DataStoreStreaming.listTables(dataStore)) {
String rawTableName = table.getName();
if (emoSchema != null) {
addTable(protoEmoUriBuilder, rawTableName, emoSchema, ifNotExists, fields, rowFormat, recreateTables, out);
}
if (stashSchema != null) {
addTable(protoStashUriBuilder, rawTableName, stashSchema, ifNotExists, fields, rowFormat, recreateTables, out);
}
}
}
/**
 * Writes the statements for a single table: an optional drop statement, the create statement and a
 * trailing empty line.
 *
 * @param protoUriBuilder prototype builder for the schema's base location; cloned, never modified
 * @param rawTableName    table name as reported by the data store
 * @param schema          schema in which the table is created
 * @param ifNotExists     text inserted after "CREATE EXTERNAL TABLE", possibly empty
 * @param fields          column list placed between the parentheses
 * @param rowFormat       row format and storage clause
 * @param recreateTable   true to emit a drop statement before the create statement
 * @param out             the stream that receives the statements
 */
private void addTable(UriBuilder protoUriBuilder, String rawTableName, String schema, String ifNotExists, String fields,
        String rowFormat, boolean recreateTable, PrintStream out) {
    // Work on a copy so the shared prototype can be reused for the next table.
    UriBuilder tableUriBuilder = protoUriBuilder.clone();
    URI tableLocation = tableUriBuilder.path(encode(rawTableName)).build();

    // Every character other than a lower-case letter or a digit becomes an underscore in the Hive name;
    // the original name is kept in the table comment.
    String hiveTableName = rawTableName.replaceAll("[^a-z0-9]", "_");
    String tableComment = "source table " + rawTableName;

    if (recreateTable) {
        String dropStatement = format(DROP_TABLE_COMMAND, schema, hiveTableName);
        out.println(dropStatement);
    }

    String createStatement = format(CREATE_TABLE_COMMAND,
            ifNotExists, schema, hiveTableName, fields, tableComment, rowFormat, tableLocation);
    out.println(createStatement);
    out.println();
}
/**
 * URL encodes the given string using UTF-8.
 *
 * @param str the text to encode, may be null
 * @return the encoded text, or null when {@code str} is null
 */
private String encode(String str) {
    try {
        return str == null ? null : URLEncoder.encode(str, Charsets.UTF_8.name());
    } catch (UnsupportedEncodingException impossible) {
        // Should never happen
        throw Throwables.propagate(impossible);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy