
org.apache.hadoop.hive.druid.DruidStorageHandler

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.druid;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.metamx.common.RetryUtils;
import com.metamx.common.lifecycle.Lifecycle;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.HttpClientConfig;
import com.metamx.http.client.HttpClientInit;
import com.metamx.http.client.Request;
import com.metamx.http.client.response.StatusResponseHandler;
import com.metamx.http.client.response.StatusResponseHolder;
import io.druid.data.input.impl.DimensionSchema;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.InputRowParser;
import io.druid.data.input.impl.JSONParseSpec;
import io.druid.data.input.impl.StringInputRowParser;
import io.druid.data.input.impl.TimestampSpec;
import io.druid.java.util.common.Pair;
import io.druid.metadata.MetadataStorageConnectorConfig;
import io.druid.metadata.MetadataStorageTablesConfig;
import io.druid.metadata.SQLMetadataConnector;
import io.druid.metadata.storage.derby.DerbyConnector;
import io.druid.metadata.storage.derby.DerbyMetadataStorage;
import io.druid.metadata.storage.mysql.MySQLConnector;
import io.druid.metadata.storage.mysql.MySQLConnectorConfig;
import io.druid.metadata.storage.postgresql.PostgreSQLConnector;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.IndexSpec;
import io.druid.segment.indexing.DataSchema;
import io.druid.segment.indexing.granularity.GranularitySpec;
import io.druid.segment.loading.DataSegmentPusher;
import io.druid.segment.loading.SegmentLoadingException;
import io.druid.storage.hdfs.HdfsDataSegmentPusher;
import io.druid.storage.hdfs.HdfsDataSegmentPusherConfig;
import io.druid.timeline.DataSegment;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.io.DruidOutputFormat;
import org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat;
import org.apache.hadoop.hive.druid.io.DruidRecordWriter;
import org.apache.hadoop.hive.druid.json.KafkaSupervisorIOConfig;
import org.apache.hadoop.hive.druid.json.KafkaSupervisorReport;
import org.apache.hadoop.hive.druid.json.KafkaSupervisorSpec;
import org.apache.hadoop.hive.druid.json.KafkaSupervisorTuningConfig;
import org.apache.hadoop.hive.druid.security.KerberosHttpClient;
import org.apache.hadoop.hive.druid.serde.DruidSerDe;
import org.apache.hadoop.hive.metastore.DefaultHiveMetaHook;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LockType;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.StorageHandlerInfo;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider;
import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.common.util.ShutdownHookManager;
import org.jboss.netty.handler.codec.http.HttpMethod;
import org.jboss.netty.handler.codec.http.HttpResponseStatus;
import org.joda.time.DateTime;
import org.joda.time.Period;
import org.skife.jdbi.v2.exceptions.CallbackFailedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import static org.apache.hadoop.hive.druid.DruidStorageHandlerUtils.JSON_MAPPER;
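
// This class implements both sides of the Hive-Druid integration: the HiveMetaHook
// DDL lifecycle (preCreateTable, commitCreateTable, commitDropTable, commitInsertTable)
// and the HiveStorageHandler I/O wiring (input/output formats, SerDe, job properties).
// For Kafka streaming tables it additionally manages a Druid Kafka supervisor.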
/**
 * DruidStorageHandler provides a HiveStorageHandler implementation for Druid.
 */
@SuppressWarnings({ "rawtypes" })
public class DruidStorageHandler extends DefaultHiveMetaHook implements HiveStorageHandler {

  protected static final Logger LOG = LoggerFactory.getLogger(DruidStorageHandler.class);

  protected static final SessionState.LogHelper console = new SessionState.LogHelper(LOG);

  public static final String SEGMENTS_DESCRIPTOR_DIR_NAME = "segmentsDescriptorDir";

  public static final String INTERMEDIATE_SEGMENT_DIR_NAME = "intermediateSegmentDir";

  private static final HttpClient HTTP_CLIENT;

  private static List<String> allowedAlterTypes = ImmutableList.of("ADDPROPS", "DROPPROPS", "ADDCOLS");

  static {
    final Lifecycle lifecycle = new Lifecycle();
    try {
      lifecycle.start();
    } catch (Exception e) {
      LOG.error("Issues with lifecycle start", e);
    }
    HTTP_CLIENT = makeHttpClient(lifecycle);
    ShutdownHookManager.addShutdownHook(() -> lifecycle.stop());
  }

  private SQLMetadataConnector connector;

  private MetadataStorageTablesConfig druidMetadataStorageTablesConfig = null;

  private String uniqueId = null;

  private String rootWorkingDir = null;

  private Configuration conf;

  public DruidStorageHandler() {
  }

  @VisibleForTesting
  public DruidStorageHandler(SQLMetadataConnector connector,
      MetadataStorageTablesConfig druidMetadataStorageTablesConfig) {
    this.connector = connector;
    this.druidMetadataStorageTablesConfig = druidMetadataStorageTablesConfig;
  }

  @Override
  public Class<? extends InputFormat> getInputFormatClass() {
    return DruidQueryBasedInputFormat.class;
  }

  @Override
  public Class<? extends OutputFormat> getOutputFormatClass() {
    return DruidOutputFormat.class;
  }

  @Override
  public Class<? extends AbstractSerDe> getSerDeClass() {
    return DruidSerDe.class;
  }

  @Override
  public HiveMetaHook getMetaHook() {
    return this;
  }

  @Override
  public HiveAuthorizationProvider getAuthorizationProvider() {
    return new DefaultHiveAuthorizationProvider();
  }

  @Override
  public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
  }

  @Override
  public void configureInputJobCredentials(TableDesc tableDesc, Map<String, String> jobSecrets) {
  }

  @Override
  public void preCreateTable(Table table) throws MetaException {
    // Do safety checks
    if (MetaStoreUtils.isExternalTable(table) && !StringUtils.isEmpty(table.getSd().getLocation())) {
      throw new MetaException("LOCATION may not be specified for Druid");
    }
    if (table.getPartitionKeysSize() != 0) {
      throw new MetaException("PARTITIONED BY may not be specified for Druid");
    }
    if (table.getSd().getBucketColsSize() != 0) {
      throw new MetaException("CLUSTERED BY may not be specified for Druid");
    }
    String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
    if (MetaStoreUtils.isExternalTable(table)) {
      if (dataSourceName == null) {
        throw new MetaException(
            String.format("Datasource name should be specified using [%s] for external tables "
                + "using Druid", Constants.DRUID_DATA_SOURCE));
      }
      // If it is an external table, we are done
      return;
    }
    // It is not an external table.
    // We need to check that the datasource was not specified by the user.
    if (dataSourceName != null) {
      throw new MetaException(
          String.format("Datasource name cannot be specified using [%s] for managed tables "
              + "using Druid", Constants.DRUID_DATA_SOURCE));
    }
    // We need to check the Druid metadata
    dataSourceName = Warehouse.getQualifiedName(table);
    try {
      getConnector().createSegmentTable();
    } catch (Exception e) {
      LOG.error("Exception while trying to create druid segments table", e);
      throw new MetaException(e.getMessage());
    }
    Collection<String> existingDataSources = DruidStorageHandlerUtils
        .getAllDataSourceNames(getConnector(), getDruidMetadataStorageTablesConfig());
    LOG.debug("pre-create data source with name {}", dataSourceName);
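    // A managed table maps to a new Druid data source named after the qualified
    // table name, so it must not collide with a data source that is already
    // registered in the Druid metadata store.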
    if (existingDataSources.contains(dataSourceName)) {
      throw new MetaException(String.format("Data source [%s] already existing", dataSourceName));
    }
    table.getParameters().put(Constants.DRUID_DATA_SOURCE, dataSourceName);
  }

  @Override
  public void rollbackCreateTable(Table table) {
    if (MetaStoreUtils.isExternalTable(table)) {
      return;
    }
    final Path segmentDescriptorDir = getSegmentDescriptorDir();
    try {
      List<DataSegment> dataSegmentList = DruidStorageHandlerUtils
          .getCreatedSegments(segmentDescriptorDir, getConf());
      for (DataSegment dataSegment : dataSegmentList) {
        try {
          deleteSegment(dataSegment);
        } catch (SegmentLoadingException e) {
          LOG.error(String.format("Error while trying to clean the segment [%s]", dataSegment), e);
        }
      }
    } catch (IOException e) {
      LOG.error("Exception while rollback", e);
      throw Throwables.propagate(e);
    } finally {
      cleanWorkingDir();
    }
  }

  @Override
  public void commitCreateTable(Table table) throws MetaException {
    if (MetaStoreUtils.isExternalTable(table)) {
      // For external tables, we do not need to do anything else
      return;
    }
    if (isKafkaStreamingTable(table)) {
      updateKafkaIngestion(table);
    }
    this.commitInsertTable(table, true);
  }

  private void updateKafkaIngestion(Table table) {
    final String overlordAddress = HiveConf
        .getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_OVERLORD_DEFAULT_ADDRESS);
    final String dataSourceName = Preconditions.checkNotNull(
        getTableProperty(table, Constants.DRUID_DATA_SOURCE), "Druid datasource name is null");
    final String kafkaTopic = Preconditions.checkNotNull(
        getTableProperty(table, Constants.KAFKA_TOPIC), "kafka topic is null");
    final String kafka_servers = Preconditions.checkNotNull(
        getTableProperty(table, Constants.KAFKA_BOOTSTRAP_SERVERS), "kafka connect string is null");
    Properties tableProperties = new Properties();
    tableProperties.putAll(table.getParameters());
    final GranularitySpec granularitySpec = DruidStorageHandlerUtils
        .getGranularitySpec(getConf(), tableProperties);
    List<FieldSchema> columns = table.getSd().getCols();
    List<String> columnNames = new ArrayList<>(columns.size());
    List<TypeInfo> columnTypes = new ArrayList<>(columns.size());
    for (FieldSchema schema : columns) {
      columnNames.add(schema.getName());
      columnTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(schema.getType()));
    }
    Pair<List<DimensionSchema>, AggregatorFactory[]> dimensionsAndAggregates = DruidStorageHandlerUtils
        .getDimensionsAndAggregates(getConf(), columnNames, columnTypes);
    if (!columnNames.contains(DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN)) {
      throw new IllegalStateException(
          "Timestamp column (' " + DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN
              + "') not specified in create table; list of columns is : " + columnNames);
    }
    final InputRowParser inputRowParser = new StringInputRowParser(
        new JSONParseSpec(
            new TimestampSpec(DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN, "auto", null),
            new DimensionsSpec(dimensionsAndAggregates.lhs, null, null),
            null,
            null
        ), "UTF-8");
    Map<String, Object> inputParser = JSON_MAPPER.convertValue(inputRowParser, Map.class);
    final DataSchema dataSchema = new DataSchema(
        dataSourceName,
        inputParser,
        dimensionsAndAggregates.rhs,
        granularitySpec,
        null,
        DruidStorageHandlerUtils.JSON_MAPPER
    );
    IndexSpec indexSpec = DruidStorageHandlerUtils.getIndexSpec(getConf());
    KafkaSupervisorSpec spec = createKafkaSupervisorSpec(table, kafkaTopic, kafka_servers,
        dataSchema, indexSpec);
    // Fetch existing Ingestion Spec from Druid, if any
    KafkaSupervisorSpec existingSpec = fetchKafkaIngestionSpec(table);
    String targetState = getTableProperty(table, Constants.DRUID_KAFKA_INGESTION);
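    // Reconcile the requested ingestion state with what is already running on the
    // overlord: START (re)submits the spec if it changed, STOP shuts the supervisor
    // down, and RESET resubmits the spec if needed and then resets consumer offsets.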
    if (targetState == null) {
      // Case when the user has not specified any ingestion state in the current command:
      // if a Kafka supervisor is already running, keep its last known state (START); otherwise STOP.
      targetState = existingSpec == null ? "STOP" : "START";
    }
    if (targetState.equalsIgnoreCase("STOP")) {
      if (existingSpec != null) {
        stopKafkaIngestion(overlordAddress, dataSourceName);
      }
    } else if (targetState.equalsIgnoreCase("START")) {
      if (existingSpec == null || !existingSpec.equals(spec)) {
        updateKafkaIngestionSpec(overlordAddress, spec);
      }
    } else if (targetState.equalsIgnoreCase("RESET")) {
      // Case when there are changes in multiple table properties.
      if (existingSpec != null && !existingSpec.equals(spec)) {
        updateKafkaIngestionSpec(overlordAddress, spec);
      }
      resetKafkaIngestion(overlordAddress, dataSourceName);
    } else {
      throw new IllegalArgumentException(String.format(
          "Invalid value for property [%s], Valid values are [START, STOP, RESET]",
          Constants.DRUID_KAFKA_INGESTION));
    }
    // We do not want to keep state in two separate places, so remove it from the hive table properties.
    table.getParameters().remove(Constants.DRUID_KAFKA_INGESTION);
  }

  private static KafkaSupervisorSpec createKafkaSupervisorSpec(Table table, String kafkaTopic,
      String kafka_servers, DataSchema dataSchema, IndexSpec indexSpec) {
    return new KafkaSupervisorSpec(dataSchema,
        new KafkaSupervisorTuningConfig(
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "maxRowsInMemory"),
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "maxRowsPerSegment"),
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "intermediatePersistPeriod"),
            null, // basePersistDirectory - use druid default, no need to be configured by user
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "maxPendingPersists"),
            indexSpec,
            null, // buildV9Directly - use druid default, no need to be configured by user
            getBooleanProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "reportParseExceptions"),
            getLongProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "handoffConditionTimeout"),
            getBooleanProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "resetOffsetAutomatically"),
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "workerThreads"),
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "chatThreads"),
            getLongProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "chatRetries"),
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "httpTimeout"),
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "shutdownTimeout"),
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "offsetFetchPeriod")),
        new KafkaSupervisorIOConfig(kafkaTopic, // Mandatory Property
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "replicas"),
            getIntegerProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "taskCount"),
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "taskDuration"),
            getKafkaConsumerProperties(table, kafka_servers), // Mandatory Property
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "startDelay"),
            getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "period"),
            getBooleanProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "useEarliestOffset"),
Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "completionTimeout"), getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "lateMessageRejectionPeriod"), getPeriodProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "earlyMessageRejectionPeriod"), getBooleanProperty(table, Constants.DRUID_KAFKA_INGESTION_PROPERTY_PREFIX + "skipOffsetGaps")), new HashMap() ); } private static Map getKafkaConsumerProperties(Table table, String kafka_servers) { ImmutableMap.Builder builder = ImmutableMap.builder(); builder.put(KafkaSupervisorIOConfig.BOOTSTRAP_SERVERS_KEY, kafka_servers); for (Map.Entry entry : table.getParameters().entrySet()) { if (entry.getKey().startsWith(Constants.DRUID_KAFKA_CONSUMER_PROPERTY_PREFIX)) { String propertyName = entry.getKey() .substring(Constants.DRUID_KAFKA_CONSUMER_PROPERTY_PREFIX.length()); builder.put(propertyName, entry.getValue()); } } return builder.build(); } private static void updateKafkaIngestionSpec(String overlordAddress, KafkaSupervisorSpec spec) { try { String task = JSON_MAPPER.writeValueAsString(spec); console.printInfo("submitting kafka Spec {}", task); LOG.info("submitting kafka Supervisor Spec {}", task); StatusResponseHolder response = getHttpClient().go(new Request(HttpMethod.POST, new URL(String.format("http://%s/druid/indexer/v1/supervisor", overlordAddress))) .setContent( "application/json", JSON_MAPPER.writeValueAsBytes(spec)), new StatusResponseHandler( Charset.forName("UTF-8"))).get(); if (response.getStatus().equals(HttpResponseStatus.OK)) { String msg = String.format("Kafka Supervisor for [%s] Submitted Successfully to druid.", spec.getDataSchema().getDataSource()); LOG.info(msg); console.printInfo(msg); } else { throw new IOException(String .format("Unable to update Kafka Ingestion for Druid status [%d] full response [%s]", response.getStatus().getCode(), response.getContent())); } } catch (Exception e) { throw new RuntimeException(e); } } private void resetKafkaIngestion(String overlordAddress, String dataSourceName) { try { StatusResponseHolder response = RetryUtils .retry(() -> getHttpClient().go(new Request(HttpMethod.POST, new URL(String .format("http://%s/druid/indexer/v1/supervisor/%s/reset", overlordAddress, dataSourceName))), new StatusResponseHandler( Charset.forName("UTF-8"))).get(), input -> input instanceof IOException, getMaxRetryCount()); if (response.getStatus().equals(HttpResponseStatus.OK)) { console.printInfo("Druid Kafka Ingestion Reset successful."); } else { throw new IOException(String .format("Unable to reset Kafka Ingestion Druid status [%d] full response [%s]", response.getStatus().getCode(), response.getContent())); } } catch (Exception e) { throw new RuntimeException(e); } } private void stopKafkaIngestion(String overlordAddress, String dataSourceName) { try { StatusResponseHolder response = RetryUtils.retry(() -> getHttpClient() .go(new Request(HttpMethod.POST, new URL(String .format("http://%s/druid/indexer/v1/supervisor/%s/shutdown", overlordAddress, dataSourceName))), new StatusResponseHandler( Charset.forName("UTF-8"))).get(), input -> input instanceof IOException, getMaxRetryCount()); if (response.getStatus().equals(HttpResponseStatus.OK)) { console.printInfo("Druid Kafka Ingestion shutdown successful."); } else { throw new IOException(String .format("Unable to stop Kafka Ingestion Druid status [%d] full response [%s]", response.getStatus().getCode(), response.getContent())); } } catch (Exception e) { throw new RuntimeException(e); } } private KafkaSupervisorSpec 
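
  // The ingestion helpers above and the two fetch methods below all go through the
  // overlord's supervisor REST API: POST /druid/indexer/v1/supervisor (submit),
  // POST .../supervisor/{dataSource}/shutdown and .../reset, and
  // GET .../supervisor/{dataSource} and .../supervisor/{dataSource}/status.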
  private KafkaSupervisorSpec fetchKafkaIngestionSpec(Table table) {
    final String overlordAddress = Preconditions.checkNotNull(
        HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_OVERLORD_DEFAULT_ADDRESS),
        "Druid Overlord Address is null");
    String dataSourceName = Preconditions.checkNotNull(
        getTableProperty(table, Constants.DRUID_DATA_SOURCE), "Druid Datasource name is null");
    try {
      StatusResponseHolder response = RetryUtils.retry(
          () -> getHttpClient().go(
              new Request(HttpMethod.GET,
                  new URL(String.format("http://%s/druid/indexer/v1/supervisor/%s",
                      overlordAddress, dataSourceName))),
              new StatusResponseHandler(Charset.forName("UTF-8"))).get(),
          input -> input instanceof IOException, getMaxRetryCount());
      if (response.getStatus().equals(HttpResponseStatus.OK)) {
        return JSON_MAPPER.readValue(response.getContent(), KafkaSupervisorSpec.class);
        // Druid returns 400 Bad Request when not found.
      } else if (response.getStatus().equals(HttpResponseStatus.NOT_FOUND)
          || response.getStatus().equals(HttpResponseStatus.BAD_REQUEST)) {
        LOG.debug("No Kafka Supervisor found for datasource[%s]", dataSourceName);
        return null;
      } else {
        throw new IOException(String.format(
            "Unable to fetch Kafka Ingestion Spec from Druid status [%d] full response [%s]",
            response.getStatus().getCode(), response.getContent()));
      }
    } catch (Exception e) {
      throw new RuntimeException("Exception while fetching kafka ingestion spec from druid", e);
    }
  }

  /**
   * Fetches the Kafka supervisor status report from the Druid overlord.
   *
   * @param table Hive table
   * @return Kafka supervisor report, or null when the Druid overlord is unreachable.
   */
  @Nullable
  private KafkaSupervisorReport fetchKafkaSupervisorReport(Table table) {
    final String overlordAddress = Preconditions.checkNotNull(
        HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_OVERLORD_DEFAULT_ADDRESS),
        "Druid Overlord Address is null");
    String dataSourceName = Preconditions.checkNotNull(
        getTableProperty(table, Constants.DRUID_DATA_SOURCE), "Druid Datasource name is null");
    try {
      StatusResponseHolder response = RetryUtils.retry(
          () -> getHttpClient().go(
              new Request(HttpMethod.GET,
                  new URL(String.format("http://%s/druid/indexer/v1/supervisor/%s/status",
                      overlordAddress, dataSourceName))),
              new StatusResponseHandler(Charset.forName("UTF-8"))).get(),
          input -> input instanceof IOException, getMaxRetryCount());
      if (response.getStatus().equals(HttpResponseStatus.OK)) {
        return DruidStorageHandlerUtils.JSON_MAPPER
            .readValue(response.getContent(), KafkaSupervisorReport.class);
        // Druid returns 400 Bad Request when not found.
      } else if (response.getStatus().equals(HttpResponseStatus.NOT_FOUND)
          || response.getStatus().equals(HttpResponseStatus.BAD_REQUEST)) {
        LOG.info("No Kafka Supervisor found for datasource[%s]", dataSourceName);
        return null;
      } else {
        LOG.error("Unable to fetch Kafka Supervisor status [%d] full response [%s]",
            response.getStatus().getCode(), response.getContent());
        return null;
      }
    } catch (Exception e) {
      LOG.error("Exception while fetching kafka ingestion spec from druid", e);
      return null;
    }
  }
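
  // Batch ingestion path: reducers write segment files plus descriptors under the
  // staging directory; the methods below move them to deep storage, commit the
  // segment metadata in a single transaction, and then poll the coordinator until
  // the segments are loaded (or the retry budget is exhausted).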
  /**
   * Moves segment files to deep storage, then commits the segments' metadata
   * to the Druid metadata store in one transaction.
   *
   * @param table Hive table
   * @param overwrite true if it is an insert overwrite table
   * @throws IOException if an error occurs while moving or committing the segments.
   */
  protected List<DataSegment> loadAndCommitDruidSegments(Table table, boolean overwrite,
      List<DataSegment> segmentsToLoad) throws IOException, CallbackFailedException {
    final String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
    final String segmentDirectory =
        table.getParameters().get(Constants.DRUID_SEGMENT_DIRECTORY) != null
            ? table.getParameters().get(Constants.DRUID_SEGMENT_DIRECTORY)
            : HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY);
    final HdfsDataSegmentPusherConfig hdfsSegmentPusherConfig = new HdfsDataSegmentPusherConfig();
    List<DataSegment> publishedDataSegmentList;
    LOG.info(String.format(
        "Moving [%s] Druid segments from staging directory [%s] to Deep storage [%s]",
        segmentsToLoad.size(), getStagingWorkingDir().toString(), segmentDirectory));
    hdfsSegmentPusherConfig.setStorageDirectory(segmentDirectory);
    DataSegmentPusher dataSegmentPusher = new HdfsDataSegmentPusher(hdfsSegmentPusherConfig,
        getConf(), JSON_MAPPER);
    publishedDataSegmentList = DruidStorageHandlerUtils.publishSegmentsAndCommit(
        getConnector(),
        getDruidMetadataStorageTablesConfig(),
        dataSourceName,
        segmentsToLoad,
        overwrite,
        getConf(),
        dataSegmentPusher
    );
    return publishedDataSegmentList;
  }

  /**
   * This function checks the load status of Druid segments by polling the Druid coordinator.
   *
   * @param segments List of Druid segments to check for
   * @return count of segments that are not yet loaded.
   */
  private int checkLoadStatus(List<DataSegment> segments) {
    final String coordinatorAddress = HiveConf
        .getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_COORDINATOR_DEFAULT_ADDRESS);
    int maxTries = getMaxRetryCount();
    if (maxTries == 0) {
      return segments.size();
    }
    LOG.debug("checking load status from coordinator {}", coordinatorAddress);
    String coordinatorResponse;
    try {
      coordinatorResponse = RetryUtils.retry(
          () -> DruidStorageHandlerUtils.getURL(getHttpClient(),
              new URL(String.format("http://%s/status", coordinatorAddress))),
          input -> input instanceof IOException, maxTries);
    } catch (Exception e) {
      console.printInfo("Will skip waiting for data loading, coordinator unavailable");
      return segments.size();
    }
    if (Strings.isNullOrEmpty(coordinatorResponse)) {
      console.printInfo("Will skip waiting for data loading, empty response from coordinator");
      return segments.size();
    }
    console.printInfo(String.format("Waiting for the loading of [%s] segments", segments.size()));
    long passiveWaitTimeMs = HiveConf
        .getLongVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_PASSIVE_WAIT_TIME);
    Set<URL> urlsOfUnloadedSegments = segments.stream().map(dataSegment -> {
      try {
        // Need to make sure that we are using the segment identifier
        return new URL(String.format("http://%s/druid/coordinator/v1/datasources/%s/segments/%s",
            coordinatorAddress, dataSegment.getDataSource(), dataSegment.getIdentifier()));
      } catch (MalformedURLException e) {
        Throwables.propagate(e);
      }
      return null;
    }).collect(Collectors.toSet());
    int numRetries = 0;
    while (numRetries++ < maxTries && !urlsOfUnloadedSegments.isEmpty()) {
      urlsOfUnloadedSegments = ImmutableSet.copyOf(Sets.filter(urlsOfUnloadedSegments, input -> {
        try {
          String result = DruidStorageHandlerUtils.getURL(getHttpClient(), input);
          LOG.debug("Checking segment [{}] response is [{}]", input, result);
          return Strings.isNullOrEmpty(result);
        } catch (IOException e) {
          LOG.error(String.format("Error while checking URL [%s]", input), e);
          return true;
        }
      }));
      try {
        if (!urlsOfUnloadedSegments.isEmpty()) {
          Thread.sleep(passiveWaitTimeMs);
        }
      } catch (InterruptedException e) {
        Thread.interrupted();
        Throwables.propagate(e);
      }
    }
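    // Whatever is left in the set was never confirmed loaded by the coordinator
    // within maxTries polls.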
    if (!urlsOfUnloadedSegments.isEmpty()) {
      // We are not throwing an exception since it might be a transient issue that is blocking loading
      console.printError(String.format(
          "Wait time exhausted and we have [%s] out of [%s] segments not loaded yet",
          urlsOfUnloadedSegments.size(), segments.size()));
    }
    return urlsOfUnloadedSegments.size();
  }

  @VisibleForTesting
  protected void deleteSegment(DataSegment segment) throws SegmentLoadingException {
    final Path path = DruidStorageHandlerUtils.getPath(segment);
    LOG.info("removing segment {}, located at path {}", segment.getIdentifier(), path);
    try {
      if (path.getName().endsWith(".zip")) {
        final FileSystem fs = path.getFileSystem(getConf());
        if (!fs.exists(path)) {
          LOG.warn("Segment Path {} does not exist. It appears to have been deleted already.", path);
          return;
        }
        // path format --> .../dataSource/interval/version/partitionNum/xxx.zip
        Path partitionNumDir = path.getParent();
        if (!fs.delete(partitionNumDir, true)) {
          throw new SegmentLoadingException(
              "Unable to kill segment, failed to delete dir [%s]", partitionNumDir.toString());
        }
        // try to delete other directories if possible
        Path versionDir = partitionNumDir.getParent();
        if (safeNonRecursiveDelete(fs, versionDir)) {
          Path intervalDir = versionDir.getParent();
          if (safeNonRecursiveDelete(fs, intervalDir)) {
            Path dataSourceDir = intervalDir.getParent();
            safeNonRecursiveDelete(fs, dataSourceDir);
          }
        }
      } else {
        throw new SegmentLoadingException("Unknown file type[%s]", path);
      }
    } catch (IOException e) {
      throw new SegmentLoadingException(e, "Unable to kill segment");
    }
  }

  private static boolean safeNonRecursiveDelete(FileSystem fs, Path path) {
    try {
      return fs.delete(path, false);
    } catch (Exception ex) {
      return false;
    }
  }

  @Override
  public void preDropTable(Table table) {
    // Nothing to do
  }

  @Override
  public void rollbackDropTable(Table table) {
    // Nothing to do
  }

  @Override
  public void commitDropTable(Table table, boolean deleteData) {
    if (MetaStoreUtils.isExternalTable(table)) {
      return;
    }
    if (isKafkaStreamingTable(table)) {
      // Stop Kafka Ingestion first
      final String overlordAddress = Preconditions.checkNotNull(
          HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_OVERLORD_DEFAULT_ADDRESS),
          "Druid Overlord Address is null");
      String dataSourceName = Preconditions.checkNotNull(
          getTableProperty(table, Constants.DRUID_DATA_SOURCE), "Druid Datasource name is null");
      stopKafkaIngestion(overlordAddress, dataSourceName);
    }
    String dataSourceName = Preconditions.checkNotNull(
        table.getParameters().get(Constants.DRUID_DATA_SOURCE), "DataSource name is null !");
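    // When deleteData is set (a purge-style drop), physically remove the segment
    // files before disabling the data source in the Druid metadata store.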
    if (deleteData) {
      LOG.info("Dropping with purge all the data for data source {}", dataSourceName);
      List<DataSegment> dataSegmentList = DruidStorageHandlerUtils
          .getDataSegmentList(getConnector(), getDruidMetadataStorageTablesConfig(), dataSourceName);
      if (dataSegmentList.isEmpty()) {
        LOG.info("Nothing to delete for data source {}", dataSourceName);
        return;
      }
      for (DataSegment dataSegment : dataSegmentList) {
        try {
          deleteSegment(dataSegment);
        } catch (SegmentLoadingException e) {
          LOG.error(String.format("Error while deleting segment [%s]", dataSegment.getIdentifier()), e);
        }
      }
    }
    if (DruidStorageHandlerUtils
        .disableDataSource(getConnector(), getDruidMetadataStorageTablesConfig(), dataSourceName)) {
      LOG.info("Successfully dropped druid data source {}", dataSourceName);
    }
  }

  @Override
  public void commitInsertTable(Table table, boolean overwrite) throws MetaException {
    LOG.debug("commit insert into table {} overwrite {}", table.getTableName(), overwrite);
    if (MetaStoreUtils.isExternalTable(table)) {
      throw new MetaException("Cannot insert data into external table backed by Druid");
    }
    try {
      // Check if there are segments to load
      final Path segmentDescriptorDir = getSegmentDescriptorDir();
      final List<DataSegment> segmentsToLoad = fetchSegmentsMetadata(segmentDescriptorDir);
      final String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
      // No segments to load, but we still need to honor overwrite
      if (segmentsToLoad.isEmpty() && overwrite) {
        // disable datasource
        // In case it is an insert overwrite, we have to disable the existing Druid DataSource
        DruidStorageHandlerUtils.disableDataSource(getConnector(),
            getDruidMetadataStorageTablesConfig(), dataSourceName);
        return;
      } else if (!segmentsToLoad.isEmpty()) {
        // At this point we have Druid segments from the reducers, but we need to atomically
        // rename and commit to metadata.
        // Moving Druid segments and committing to druid metadata as one transaction.
        checkLoadStatus(loadAndCommitDruidSegments(table, overwrite, segmentsToLoad));
      }
    } catch (IOException e) {
      throw new MetaException(e.getMessage());
    } catch (CallbackFailedException c) {
      throw new MetaException(c.getCause().getMessage());
    } finally {
      cleanWorkingDir();
    }
  }

  private List<DataSegment> fetchSegmentsMetadata(Path segmentDescriptorDir) throws IOException {
    if (!segmentDescriptorDir.getFileSystem(getConf()).exists(segmentDescriptorDir)) {
      LOG.info(
          "Directory {} does not exist, ignore this if it is create statement or inserts of 0 rows,"
              + " no Druid segments to move, cleaning working directory {}",
          segmentDescriptorDir.toString(), getStagingWorkingDir().toString());
      return Collections.EMPTY_LIST;
    }
    return DruidStorageHandlerUtils.getCreatedSegments(segmentDescriptorDir, getConf());
  }

  @Override
  public void preInsertTable(Table table, boolean overwrite) {
  }

  @Override
  public void rollbackInsertTable(Table table, boolean overwrite) {
    // do nothing
  }

  @Override
  public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
    jobProperties.put(Constants.DRUID_DATA_SOURCE, tableDesc.getTableName());
    jobProperties.put(Constants.DRUID_SEGMENT_VERSION, new DateTime().toString());
    jobProperties.put(Constants.DRUID_JOB_WORKING_DIRECTORY, getStagingWorkingDir().toString());
    // DruidOutputFormat will write segments in an intermediate directory
    jobProperties.put(Constants.DRUID_SEGMENT_INTERMEDIATE_DIRECTORY,
        getIntermediateSegmentDir().toString());
  }

  @Override
  public void configureTableJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
  }

  @Override
  public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    if (UserGroupInformation.isSecurityEnabled()) {
      // The AM can not do Kerberos Auth, so we will do the input split generation in the HS2
      LOG.debug("Setting {} to {} to enable split generation on HS2",
          HiveConf.ConfVars.HIVE_AM_SPLIT_GENERATION.toString(), Boolean.FALSE.toString());
      jobConf.set(HiveConf.ConfVars.HIVE_AM_SPLIT_GENERATION.toString(), Boolean.FALSE.toString());
    }
    try {
      DruidStorageHandlerUtils.addDependencyJars(jobConf, DruidRecordWriter.class);
    } catch (IOException e) {
      Throwables.propagate(e);
    }
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public LockType getLockType(WriteEntity writeEntity) {
    if (writeEntity.getWriteType().equals(WriteEntity.WriteType.INSERT)) {
      return LockType.SHARED_READ;
    }
    return LockType.SHARED_WRITE;
  }

  @Override
  public String toString() {
    return Constants.DRUID_HIVE_STORAGE_HANDLER_ID;
  }

  public String getUniqueId() {
    if (uniqueId == null) {
      uniqueId = Preconditions.checkNotNull(
          Strings.emptyToNull(HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVEQUERYID)),
          "Hive query id is null");
    }
    return uniqueId;
  }

  private Path getStagingWorkingDir() {
    return new Path(getRootWorkingDir(), makeStagingName());
  }

  private MetadataStorageTablesConfig getDruidMetadataStorageTablesConfig() {
    if (druidMetadataStorageTablesConfig != null) {
      return druidMetadataStorageTablesConfig;
    }
    final String base = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_METADATA_BASE);
    druidMetadataStorageTablesConfig = MetadataStorageTablesConfig.fromBase(base);
    return druidMetadataStorageTablesConfig;
  }

  private SQLMetadataConnector getConnector() {
    return Suppliers.memoize(this::buildConnector).get();
  }

  private SQLMetadataConnector buildConnector() {
    if (connector != null) {
      return connector;
    }
    final String dbType = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_METADATA_DB_TYPE);
    final String username = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_METADATA_DB_USERNAME);
    final String password = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_METADATA_DB_PASSWORD);
    final String uri = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_METADATA_DB_URI);
    LOG.debug("Supplying SQL Connector with DB type {}, URI {}, User {}", dbType, uri, username);
    final Supplier<MetadataStorageConnectorConfig> storageConnectorConfigSupplier =
        Suppliers.ofInstance(new MetadataStorageConnectorConfig() {
          @Override
          public String getConnectURI() {
            return uri;
          }

          @Override
          public String getUser() {
            return Strings.emptyToNull(username);
          }

          @Override
          public String getPassword() {
            return Strings.emptyToNull(password);
          }
        });
    if (dbType.equals("mysql")) {
      connector = new MySQLConnector(storageConnectorConfigSupplier,
          Suppliers.ofInstance(getDruidMetadataStorageTablesConfig()),
          new MySQLConnectorConfig());
    } else if (dbType.equals("postgresql")) {
      connector = new PostgreSQLConnector(storageConnectorConfigSupplier,
          Suppliers.ofInstance(getDruidMetadataStorageTablesConfig()));
    } else if (dbType.equals("derby")) {
      connector = new DerbyConnector(
          new DerbyMetadataStorage(storageConnectorConfigSupplier.get()),
          storageConnectorConfigSupplier,
          Suppliers.ofInstance(getDruidMetadataStorageTablesConfig()));
    } else {
      throw new IllegalStateException(String.format("Unknown metadata storage type [%s]", dbType));
    }
    return connector;
  }

  @VisibleForTesting
  protected String makeStagingName() {
    return ".staging-".concat(getUniqueId().replace(":", ""));
  }

  private Path getSegmentDescriptorDir() {
    return new Path(getStagingWorkingDir(), SEGMENTS_DESCRIPTOR_DIR_NAME);
  }

  private Path getIntermediateSegmentDir() {
    return new Path(getStagingWorkingDir(), INTERMEDIATE_SEGMENT_DIR_NAME);
  }

  private void cleanWorkingDir() {
    final FileSystem fileSystem;
    try {
      fileSystem = getStagingWorkingDir().getFileSystem(getConf());
      fileSystem.delete(getStagingWorkingDir(), true);
    } catch (IOException e) {
      LOG.error("Got Exception while cleaning working directory", e);
    }
  }

  private String getRootWorkingDir() {
    if (Strings.isNullOrEmpty(rootWorkingDir)) {
      rootWorkingDir = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_WORKING_DIR);
    }
    return rootWorkingDir;
  }

  private static HttpClient makeHttpClient(Lifecycle lifecycle) {
    final int numConnection = HiveConf.getIntVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
    final Period readTimeout = new Period(HiveConf.getVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
    LOG.info("Creating Druid HTTP client with {} max parallel connections and {}ms read timeout",
        numConnection, readTimeout.toStandardDuration().getMillis());
    final HttpClient httpClient = HttpClientInit.createClient(
        HttpClientConfig.builder().withNumConnections(numConnection)
            .withReadTimeout(new Period(readTimeout).toStandardDuration()).build(),
        lifecycle);
    if (UserGroupInformation.isSecurityEnabled()) {
      LOG.info("building Kerberos Http Client");
      return new KerberosHttpClient(httpClient);
    }
    return httpClient;
  }

  public static HttpClient getHttpClient() {
    return HTTP_CLIENT;
  }

  @Override
  public void preAlterTable(Table table, EnvironmentContext context) throws MetaException {
    // alterOpType is null in case of a stats update
    String alterOpType = context == null ? null
        : context.getProperties().get(ALTER_TABLE_OPERATION_TYPE);
    if (alterOpType != null && !allowedAlterTypes.contains(alterOpType)) {
      throw new MetaException(
          "ALTER TABLE can not be used for " + alterOpType + " to a non-native table ");
    }
    if (isKafkaStreamingTable(table)) {
      updateKafkaIngestion(table);
    }
  }

  private static Boolean getBooleanProperty(Table table, String propertyName) {
    String val = getTableProperty(table, propertyName);
    if (val == null) {
      return null;
    }
    return Boolean.parseBoolean(val);
  }

  private static Integer getIntegerProperty(Table table, String propertyName) {
    String val = getTableProperty(table, propertyName);
    if (val == null) {
      return null;
    }
    try {
      return Integer.parseInt(val);
    } catch (NumberFormatException e) {
      throw new NumberFormatException(String.format(
          "Exception while parsing property[%s] with Value [%s] as Integer", propertyName, val));
    }
  }

  private static Long getLongProperty(Table table, String propertyName) {
    String val = getTableProperty(table, propertyName);
    if (val == null) {
      return null;
    }
    try {
      return Long.parseLong(val);
    } catch (NumberFormatException e) {
      throw new NumberFormatException(String.format(
          "Exception while parsing property[%s] with Value [%s] as Long", propertyName, val));
    }
  }

  private static Period getPeriodProperty(Table table, String propertyName) {
    String val = getTableProperty(table, propertyName);
    if (val == null) {
      return null;
    }
    try {
      return Period.parse(val);
    } catch (IllegalArgumentException e) {
      throw new IllegalArgumentException(String.format(
          "Exception while parsing property[%s] with Value [%s] as Period", propertyName, val));
    }
  }

  private static String getTableProperty(Table table, String propertyName) {
    return table.getParameters().get(propertyName);
  }

  private static boolean isKafkaStreamingTable(Table table) {
    // For Kafka streaming tables it is mandatory to set a kafka topic.
    return getTableProperty(table, Constants.KAFKA_TOPIC) != null;
  }

  private int getMaxRetryCount() {
    return HiveConf.getIntVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_MAX_TRIES);
  }

  @Override
  public StorageHandlerInfo getStorageHandlerInfo(Table table) throws MetaException {
    if (isKafkaStreamingTable(table)) {
      KafkaSupervisorReport kafkaSupervisorReport = fetchKafkaSupervisorReport(table);
      if (kafkaSupervisorReport == null) {
        return DruidStorageHandlerInfo.UNREACHABLE;
      }
      return new DruidStorageHandlerInfo(kafkaSupervisorReport);
    } else {
      // TODO: Currently we do not expose any runtime info for non-streaming tables.
      // In the future, extend this to add more information regarding table status,
      // e.g. total size of segments in Druid, load status of the table on historical nodes, etc.
      return null;
    }
  }
}
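
A minimal usage sketch follows. It is illustrative only and not part of the class above: the wrapper class and main method are hypothetical, and it assumes that class loading can build the shared Druid HTTP client from a default Hive configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.DruidStorageHandler;

public class DruidStorageHandlerSketch {
  public static void main(String[] args) {
    // Hive normally instantiates the handler from the table's STORED BY clause;
    // here we do it by hand to show the Configurable contract.
    Configuration conf = new HiveConf();
    DruidStorageHandler handler = new DruidStorageHandler();
    handler.setConf(conf); // all Druid addresses and metadata settings are read from conf
    // The handler advertises the classes Hive should use for reads and writes:
    System.out.println(handler.getInputFormatClass());  // DruidQueryBasedInputFormat
    System.out.println(handler.getOutputFormatClass()); // DruidOutputFormat
    System.out.println(handler.getSerDeClass());        // DruidSerDe
  }
}

In normal operation Hive drives these calls itself: a table created with STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler' is routed through the metahook methods above for DDL, and through the advertised input format, output format, and SerDe for queries and inserts.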




