All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bazaarvoice.emodb.hive.schema.RefreshSchema Maven / Gradle / Ivy

There is a newer version: 6.2.3
Show newest version
package com.bazaarvoice.emodb.hive.schema;

import com.bazaarvoice.emodb.hadoop.io.CloseableDataStore;
import com.bazaarvoice.emodb.hadoop.io.HadoopDataStoreManager;
import com.bazaarvoice.emodb.hadoop.io.LocationType;
import com.bazaarvoice.emodb.hadoop.io.LocationUtil;
import com.bazaarvoice.emodb.sor.api.DataStore;
import com.bazaarvoice.emodb.sor.api.Table;
import com.bazaarvoice.emodb.sor.client.DataStoreStreaming;
import com.codahale.metrics.MetricRegistry;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableMap;
import net.sourceforge.argparse4j.inf.ArgumentGroup;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.Namespace;

import javax.annotation.Nullable;
import javax.ws.rs.core.UriBuilder;
import java.io.IOException;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URLEncoder;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.String.format;
import static net.sourceforge.argparse4j.impl.Arguments.storeTrue;

/**
 * Updates all tables in Hive to match the current set of tables in EmoDB.  Note that this will not delete Hive tables
 * that were created in the past if the EmoDB table has been deleted.
 */
public class RefreshSchema extends HiveScriptGenerator {

    /**
     * Command to create the schema.
     */
    private final static String CREATE_SCHEMA_COMMAND =
            "CREATE SCHEMA IF NOT EXISTS %s COMMENT 'location %s'";

    /**
     * Field name and type for generated tables.
     */
    private final static Map EMO_FIELDS = ImmutableMap.builder()
            .put("json", "STRING")
            .put("id", "STRING")
            .put("table", "STRING")
            .put("version", "INT")
            .put("signature", "STRING")
            .put("first_update_at", "TIMESTAMP")
            .put("last_update_at", "TIMESTAMP")
            .build();

    private final static Map DEFAULT_FIELD = ImmutableMap.builder()
            .put("json", "STRING")
            .build();

    /**
     * Command to create a single table.
     */
    private final static String CREATE_TABLE_COMMAND =
            "CREATE EXTERNAL TABLE%s %s.%s " +  // if_not_exists, schema, table_name
            "(%s) " +                           // fields
            "COMMENT '%s' " +                   // comment
            "%s " +                              // row_format_and_storage
            "LOCATION '%s';";                   // location

    private final static String IF_NOT_EXISTS = " IF NOT EXISTS";
    private final static String EMO_FORMAT_AND_STORAGE =
            "ROW FORMAT SERDE 'com.bazaarvoice.emodb.hive.EmoSerDe' " +
            "STORED AS INPUTFORMAT 'com.bazaarvoice.emodb.hadoop.mapred.EmoInputFormat' OUTPUTFORMAT 'org.apache.hadoop.mapred.SequenceFileOutputFormat'";

    private final static String DEFAULT_FORMAT_AND_STORAGE =
            "ROW FORMAT DELIMITED";

    /**
     * Command to drop a single table.
     */
    private final static String DROP_TABLE_COMMAND =
            "DROP TABLE IF EXISTS %s.%s;";

    private static final MetricRegistry METRIC_REGISTRY = new MetricRegistry();

    public static void main(String args[]) throws Exception {
        RefreshSchema instance = new RefreshSchema();
        instance.doMain(args);
    }

    @Override
    protected void addArguments(ArgumentParser parser) {
        ArgumentGroup schemaGroup = parser.addArgumentGroup("schemas");
        schemaGroup.description("Schemas generated by this script");

        schemaGroup.addArgument("--emoSchema")
                .dest("emoSchema")
                .metavar("NAME")
                .nargs("?")
                .help("Name of the schema where \"emodb\" located tables are generated");

        schemaGroup.addArgument("--stashSchema")
                .dest("stashSchema")
                .metavar("NAME")
                .nargs("?")
                .help("Name of the schema where \"emostash\" located tables are generated");

        parser.addArgument("--location")
                .dest("location")
                .metavar("EMOURL")
                .nargs("?")
                .required(true)
                .help("URI location of the EmoDB server, such as \"emodb://ci.us\"");

        parser.addArgument("--apiKey")
                .dest("apiKey")
                .metavar("KEY")
                .nargs("?")
                .required(true)
                .help("API key for connecting to EmoDB");

        parser.addArgument("--compatibility")
                .dest("compatibility")
                .nargs("?")
                .choices("hive", "general")
                .setDefault("hive")
                .help("hive = Use custom SerDe and InputFormat to provide efficiency and rich column names (default).  " +
                        "general = No custom SerDe or InputFormat so is more compatible with Hive alternatives like " +
                        "Presto and Impala but is slightly less efficient and only provides a single JSON column.");

        parser.addArgument("--disableIfNotExists")
                .dest("disableIfNotExists")
                .action(storeTrue())
                .setDefault(false)
                .help("Disables \"IF NOT EXISTS\" from being included in each \"CREATE TABLE\" command");

        parser.addArgument("--recreateTables")
                .dest("recreateTables")
                .action(storeTrue())
                .setDefault(false)
                .help("Drops each table if it exists prior to the \"CREATE TABLE\" command");
    }

    @Override
    protected void generateScript(Namespace namespace, PrintStream out) {
        String emoSchema = namespace.getString("emoSchema");
        String stashSchema = namespace.getString("stashSchema");
        String locationArg = namespace.getString("location");
        String apiKey = namespace.getString("apiKey");
        boolean useEmoSerDe = namespace.getString("compatibility").equals("hive");
        boolean disableIfNotExists = namespace.getBoolean("disableIfNotExists");
        boolean recreateTables = namespace.getBoolean("recreateTables");

        if (emoSchema == null && stashSchema == null) {
            System.err.println("At least one of \"emoSchema\" or \"stashSchema\" is required.");
            System.exit(-1);
        }

        URI emoLocation = null;
        URI stashLocation = null;

        // Validate the location
        try {
            emoLocation = URI.create(locationArg);
            LocationType locationType = LocationUtil.getLocationType(emoLocation);
            checkArgument(LocationUtil.getLocationType(emoLocation) != LocationType.STASH, "Cannot use stash location");

            if (locationType == LocationType.EMO_HOST_DISCOVERY) {
                // When using host discovery for a stash schema make sure it is supported by stash
                if (stashSchema != null) {
                    stashLocation = UriBuilder.fromUri(emoLocation).scheme("emostash").build();
                    // This will throw an exception is the stash location is invalid
                    LocationUtil.getStashLocation(stashLocation);
                }
            } else {  // Fixed host
                // Fixed host cannot be used to generate stash
                checkArgument(stashSchema == null, "Cannot crate an EmoStash schema from a fixed host.");
            }
        } catch (Exception e) {
            System.err.println(e.getMessage());
            System.exit(-1);
        }

        // Create the DataStore that will be used to get table information.
        try (CloseableDataStore dataStore = HadoopDataStoreManager.getInstance().getDataStore(emoLocation, apiKey, METRIC_REGISTRY)) {
            if (emoSchema != null) {
                createSchema(emoSchema, emoLocation, out);
            }
            if (stashSchema != null) {
                createSchema(stashSchema, stashLocation, out);
            }
            createTables(emoLocation, stashLocation, emoSchema, stashSchema, dataStore, useEmoSerDe, disableIfNotExists, recreateTables, out);
        } catch (IOException e) {
            System.err.println("Script generation failed");
            e.printStackTrace(System.err);
        }
    }

    private void createSchema(String schema, URI location, PrintStream out) {
        out.println(format(CREATE_SCHEMA_COMMAND, schema, escapeString(location.toString())));
        out.println();
    }

    private void createTables(URI emoLocation, @Nullable URI stashLocation, @Nullable String emoSchema, @Nullable String stashSchema,
                              DataStore dataStore, boolean useEmoSerDe, boolean disableIfNotExists, boolean recreateTables, PrintStream out) {

        String ifNotExists = disableIfNotExists ? "" : IF_NOT_EXISTS;
        String rowFormat = useEmoSerDe ? EMO_FORMAT_AND_STORAGE : DEFAULT_FORMAT_AND_STORAGE;
        Map fieldMap = useEmoSerDe ? EMO_FIELDS : DEFAULT_FIELD;

        String fields = Joiner.on(", ").join(
                FluentIterable.from(fieldMap.entrySet())
                        .transform(new Function, String>() {
                            @Override
                            public String apply(Map.Entry entry) {
                                return format("%s %s", entry.getKey(), entry.getValue());
                            }
                        }));

        UriBuilder protoEmoUriBuilder = null;
        UriBuilder protoStashUriBuilder = null;

        if (emoSchema != null) {
            protoEmoUriBuilder = UriBuilder.fromUri(emoLocation).path(encode(emoLocation.getPath()));
        }
        if (stashSchema != null && stashLocation != null) {
            protoStashUriBuilder = UriBuilder.fromUri(stashLocation).path(encode(stashLocation.getPath()));
        }

        for (Table table : DataStoreStreaming.listTables(dataStore)) {
            String rawTableName = table.getName();
            if (emoSchema != null) {
                addTable(protoEmoUriBuilder, rawTableName, emoSchema, ifNotExists, fields, rowFormat, recreateTables, out);
            }
            if (stashSchema != null) {
                addTable(protoStashUriBuilder, rawTableName, stashSchema, ifNotExists, fields, rowFormat, recreateTables, out);
            }
        }
    }

    private void addTable(UriBuilder protoUriBuilder, String rawTableName, String schema, String ifNotExists, String fields,
                          String rowFormat, boolean recreateTable, PrintStream out) {
        URI uri = protoUriBuilder.clone()
                .path(encode(rawTableName))
                .build();

        String tableName = rawTableName.replaceAll("[^a-z0-9]", "_");
        String comment = "source table " + rawTableName;

        if (recreateTable) {
            out.println(format(DROP_TABLE_COMMAND, schema, tableName));
        }
        out.println(format(CREATE_TABLE_COMMAND, ifNotExists, schema, tableName, fields, comment, rowFormat, uri));
        out.println();
    }

    /**
     * URL encodes the given string.
     */
    private String encode(String str) {
        if (str == null) {
            return null;
        }
        try {
            return URLEncoder.encode(str, Charsets.UTF_8.name());
        } catch (UnsupportedEncodingException e) {
            // Should never happen
            throw Throwables.propagate(e);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy