/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.druid;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Interner;
import com.google.common.collect.Interners;
import com.google.common.collect.Lists;
import com.google.common.io.CharStreams;
import com.metamx.common.MapUtils;
import com.metamx.emitter.EmittingLogger;
import com.metamx.emitter.core.NoopEmitter;
import com.metamx.emitter.service.ServiceEmitter;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.Request;
import com.metamx.http.client.response.InputStreamResponseHandler;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.metadata.MetadataStorageTablesConfig;
import io.druid.metadata.SQLMetadataConnector;
import io.druid.metadata.storage.mysql.MySQLConnector;
import io.druid.query.BaseQuery;
import io.druid.segment.IndexIO;
import io.druid.segment.IndexMergerV9;
import io.druid.segment.column.ColumnConfig;
import io.druid.timeline.DataSegment;
import io.druid.timeline.partition.LinearShardSpec;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.JavaUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryProxy;
import org.apache.hadoop.util.StringUtils;
import org.jboss.netty.handler.codec.http.HttpHeaders;
import org.jboss.netty.handler.codec.http.HttpMethod;
import org.skife.jdbi.v2.FoldController;
import org.skife.jdbi.v2.Folder3;
import org.skife.jdbi.v2.Handle;
import org.skife.jdbi.v2.StatementContext;
import org.skife.jdbi.v2.TransactionCallback;
import org.skife.jdbi.v2.TransactionStatus;
import org.skife.jdbi.v2.tweak.HandleCallback;
import org.skife.jdbi.v2.util.ByteArrayMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.InetAddress;
import java.net.URL;
import java.net.URLDecoder;
import java.net.UnknownHostException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import static org.apache.hadoop.hive.ql.exec.Utilities.jarFinderGetJar;
/**
 * Utility class for the Druid storage handler.
*/
public final class DruidStorageHandlerUtils {
private static final Logger LOG = LoggerFactory.getLogger(DruidStorageHandlerUtils.class);
private static final String SMILE_CONTENT_TYPE = "application/x-jackson-smile";
/**
* Mapper to use to serialize/deserialize Druid objects (JSON)
*/
public static final ObjectMapper JSON_MAPPER = new DefaultObjectMapper();
/**
* Mapper to use to serialize/deserialize Druid objects (SMILE)
*/
public static final ObjectMapper SMILE_MAPPER = new DefaultObjectMapper(new SmileFactory());
private static final int NUM_RETRIES = 8;
private static final int SECONDS_BETWEEN_RETRIES = 2;
private static final int DEFAULT_FS_BUFFER_SIZE = 1 << 18; // 256KB
private static final int DEFAULT_STREAMING_RESULT_SIZE = 100;
/**
* Used by druid to perform IO on indexes
*/
public static final IndexIO INDEX_IO = new IndexIO(JSON_MAPPER, new ColumnConfig() {
@Override
public int columnCacheSizeBytes() {
return 0;
}
});
/**
* Used by druid to merge indexes
*/
public static final IndexMergerV9 INDEX_MERGER_V9 = new IndexMergerV9(JSON_MAPPER,
DruidStorageHandlerUtils.INDEX_IO
);
/**
 * Interner used to deduplicate {@link DataSegment} objects read from the metadata storage
 */
public static final Interner<DataSegment> DATA_SEGMENT_INTERNER = Interners.newWeakInterner();
static {
// Register the shard sub type to be used by the mapper
JSON_MAPPER.registerSubtypes(new NamedType(LinearShardSpec.class, "linear"));
// Set the timezone of the object mapper.
// NOTE: this is not working; the workaround is to set it via java opts: -Duser.timezone="UTC"
JSON_MAPPER.setTimeZone(TimeZone.getTimeZone("UTC"));
try {
// A no-op emitter is registered because some internal Druid classes expect an emitter to be present.
EmittingLogger.registerEmitter(
new ServiceEmitter("druid-hive-indexer", InetAddress.getLocalHost().getHostName(),
new NoopEmitter()
));
} catch (UnknownHostException e) {
throw Throwables.propagate(e);
}
}
/**
 * Creates an HTTP POST request for a Druid query, serializing the query body using SMILE.
 *
 * @param address host:port address of the Druid node that will receive the query
 * @param query the Druid query to send
 *
 * @return the request carrying the SMILE-encoded query as its content
 *
 * @throws IOException if the query cannot be serialized
 */
public static Request createRequest(String address, BaseQuery<?> query)
throws IOException {
return new Request(HttpMethod.POST, new URL(String.format("%s/druid/v2/", "http://" + address)))
.setContent(SMILE_MAPPER.writeValueAsBytes(query))
.setHeader(HttpHeaders.Names.CONTENT_TYPE, SMILE_CONTENT_TYPE);
}
/**
 * Submits a request through the given HTTP client and returns the response stream.
 * The caller is responsible for closing the stream once it finishes consuming it.
 *
 * @param client HTTP client used to issue the request
 * @param request the request to submit
 *
 * @return the response body as an InputStream
 *
 * @throws IOException if the request fails or the thread is interrupted while waiting
 */
public static InputStream submitRequest(HttpClient client, Request request)
throws IOException {
InputStream response;
try {
response = client.go(request, new InputStreamResponseHandler()).get();
} catch (ExecutionException e) {
throw new IOException(e.getCause());
} catch (InterruptedException e) {
throw new IOException(e.getCause());
}
return response;
}
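/*
 * Illustrative usage (a sketch, not part of the original class): createRequest and submitRequest
 * are typically used together. The broker address, the HttpClient instance and the query object
 * below are hypothetical placeholders.
 *
 *   BaseQuery<?> query = ...; // e.g. a timeseries or select query built elsewhere
 *   Request request = DruidStorageHandlerUtils.createRequest("broker-host:8082", query);
 *   try (InputStream response = DruidStorageHandlerUtils.submitRequest(httpClient, request)) {
 *     // the response is SMILE-encoded; deserialize it with SMILE_MAPPER before the stream closes
 *   }
 */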
public static String getURL(HttpClient client, URL url) throws IOException {
try (Reader reader = new InputStreamReader(
DruidStorageHandlerUtils.submitRequest(client, new Request(HttpMethod.GET, url)))) {
return CharStreams.toString(reader);
}
}
/**
 * @param taskDir path to the directory containing the segment descriptor files;
 * the descriptor path will be .../workingPath/task_id/{@link DruidStorageHandler#SEGMENTS_DESCRIPTOR_DIR_NAME}/*.json
 * @param conf hadoop conf to get the file system
 *
 * @return List of DataSegments
 *
 * @throws IOException in case of IO errors, e.g. when the task did not produce any data
 */
public static List<DataSegment> getPublishedSegments(Path taskDir, Configuration conf)
throws IOException {
ImmutableList.Builder<DataSegment> publishedSegmentsBuilder = ImmutableList.builder();
FileSystem fs = taskDir.getFileSystem(conf);
for (FileStatus fileStatus : fs.listStatus(taskDir)) {
final DataSegment segment = JSON_MAPPER
.readValue(fs.open(fileStatus.getPath()), DataSegment.class);
publishedSegmentsBuilder.add(segment);
}
List<DataSegment> publishedSegments = publishedSegmentsBuilder.build();
return publishedSegments;
}
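/*
 * Illustrative usage (a sketch, not part of the original class): reading back the descriptors that
 * an indexing task wrote under its working directory. "workingPath", "taskId" and "conf" are
 * hypothetical placeholders assumed to be set up by the caller.
 *
 *   Path taskDir = new Path(workingPath, taskId + Path.SEPARATOR
 *       + DruidStorageHandler.SEGMENTS_DESCRIPTOR_DIR_NAME);
 *   List<DataSegment> segments = DruidStorageHandlerUtils.getPublishedSegments(taskDir, conf);
 *   for (DataSegment segment : segments) {
 *     LOG.info("Published segment {} for interval {}", segment.getIdentifier(), segment.getInterval());
 *   }
 */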
/**
 * Writes the serialized form of the segment descriptor to the filesystem;
 * if a file already exists at the descriptor path, it is replaced.
 *
 * @param outputFS filesystem to write to
 * @param segment DataSegment object to serialize
 * @param descriptorPath path of the descriptor file
 *
 * @throws IOException if the descriptor cannot be written after the configured retries
 */
public static void writeSegmentDescriptor(
final FileSystem outputFS,
final DataSegment segment,
final Path descriptorPath
)
throws IOException {
final DataPusher descriptorPusher = (DataPusher) RetryProxy.create(
DataPusher.class, new DataPusher() {
@Override
public long push() throws IOException {
try {
if (outputFS.exists(descriptorPath)) {
if (!outputFS.delete(descriptorPath, false)) {
throw new IOException(
String.format("Failed to delete descriptor at [%s]", descriptorPath));
}
}
try (final OutputStream descriptorOut = outputFS.create(
descriptorPath,
true,
DEFAULT_FS_BUFFER_SIZE
)) {
JSON_MAPPER.writeValue(descriptorOut, segment);
descriptorOut.flush();
}
} catch (RuntimeException | IOException ex) {
throw ex;
}
return -1;
}
},
RetryPolicies
.exponentialBackoffRetry(NUM_RETRIES, SECONDS_BETWEEN_RETRIES, TimeUnit.SECONDS)
);
descriptorPusher.push();
}
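/*
 * Illustrative usage (a sketch, not part of the original class): persisting a descriptor next to
 * the pushed segment data. The write is wrapped in a RetryProxy above, so transient filesystem
 * errors are retried with exponential backoff. "segment", "outputFS" and "segmentsDescriptorDir"
 * are hypothetical placeholders.
 *
 *   Path descriptorPath =
 *       DruidStorageHandlerUtils.makeSegmentDescriptorOutputPath(segment, segmentsDescriptorDir);
 *   DruidStorageHandlerUtils.writeSegmentDescriptor(outputFS, segment, descriptorPath);
 */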
/**
* @param connector SQL metadata connector to the metadata storage
* @param metadataStorageTablesConfig Table config
*
* @return all the active data sources in the metadata storage
*/
public static Collection<String> getAllDataSourceNames(SQLMetadataConnector connector,
final MetadataStorageTablesConfig metadataStorageTablesConfig
) {
return connector.getDBI().withHandle(
new HandleCallback<List<String>>() {
@Override
public List<String> withHandle(Handle handle) throws Exception {
return handle.createQuery(
String.format("SELECT DISTINCT(datasource) FROM %s WHERE used = true",
metadataStorageTablesConfig.getSegmentsTable()
))
.fold(Lists.<String>newArrayList(),
new Folder3<ArrayList<String>, Map<String, Object>>() {
@Override
public ArrayList<String> fold(ArrayList<String> druidDataSources,
Map<String, Object> stringObjectMap,
FoldController foldController,
StatementContext statementContext
) throws SQLException {
druidDataSources.add(
MapUtils.getString(stringObjectMap, "datasource")
);
return druidDataSources;
}
}
);
}
}
);
}
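/*
 * Illustrative usage (a sketch, not part of the original class): checking whether a data source is
 * currently active in the Druid metadata storage. "connector", "tablesConfig" and the data source
 * name are hypothetical placeholders assumed to be configured elsewhere.
 *
 *   Collection<String> dataSources =
 *       DruidStorageHandlerUtils.getAllDataSourceNames(connector, tablesConfig);
 *   boolean exists = dataSources.contains("wikipedia");
 */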
/**
* @param connector SQL connector to metadata
* @param metadataStorageTablesConfig Tables configuration
* @param dataSource Name of data source
*
* @return true if the data source was successfully disabled, false otherwise
*/
public static boolean disableDataSource(SQLMetadataConnector connector,
final MetadataStorageTablesConfig metadataStorageTablesConfig, final String dataSource
) {
try {
if (!getAllDataSourceNames(connector, metadataStorageTablesConfig).contains(dataSource)) {
DruidStorageHandler.LOG
.warn(String.format("Cannot delete data source [%s], does not exist", dataSource));
return false;
}
connector.getDBI().withHandle(
new HandleCallback<Void>() {
@Override
public Void withHandle(Handle handle) throws Exception {
handle.createStatement(
String.format("UPDATE %s SET used=false WHERE dataSource = :dataSource",
metadataStorageTablesConfig.getSegmentsTable()
)
)
.bind("dataSource", dataSource)
.execute();
return null;
}
}
);
} catch (Exception e) {
DruidStorageHandler.LOG.error(String.format("Error removing dataSource %s", dataSource), e);
return false;
}
return true;
}
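/*
 * Illustrative usage (a sketch, not part of the original class): disabling a data source marks all
 * of its segments as unused in the metadata storage, so the coordinator will drop them from the
 * cluster. "connector", "tablesConfig" and the data source name are hypothetical placeholders.
 *
 *   if (!DruidStorageHandlerUtils.disableDataSource(connector, tablesConfig, "wikipedia")) {
 *     LOG.warn("Data source wikipedia was not disabled (missing or metadata error)");
 *   }
 */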
/**
* @param connector SQL connector to metadata
* @param metadataStorageTablesConfig Tables configuration
* @param dataSource Name of data source
*
* @return list of all data segments that belong to the given data source
*/
public static List<DataSegment> getDataSegmentList(final SQLMetadataConnector connector,
final MetadataStorageTablesConfig metadataStorageTablesConfig, final String dataSource
) {
List<DataSegment> segmentList = connector.retryTransaction(
new TransactionCallback<List<DataSegment>>() {
@Override
public List<DataSegment> inTransaction(
Handle handle, TransactionStatus status
) throws Exception {
return handle
.createQuery(String.format(
"SELECT payload FROM %s WHERE dataSource = :dataSource",
metadataStorageTablesConfig.getSegmentsTable()
))
.setFetchSize(getStreamingFetchSize(connector))
.bind("dataSource", dataSource)
.map(ByteArrayMapper.FIRST)
.fold(
new ArrayList<DataSegment>(),
new Folder3<List<DataSegment>, byte[]>() {
@Override
public List<DataSegment> fold(List<DataSegment> accumulator,
byte[] payload, FoldController control,
StatementContext ctx
) throws SQLException {
try {
final DataSegment segment = DATA_SEGMENT_INTERNER.intern(
JSON_MAPPER.readValue(
payload,
DataSegment.class
));
accumulator.add(segment);
return accumulator;
} catch (Exception e) {
throw new SQLException(e.toString());
}
}
}
);
}
}
, 3, SQLMetadataConnector.DEFAULT_MAX_TRIES);
return segmentList;
}
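/*
 * Illustrative usage (a sketch, not part of the original class): enumerating the segments of a
 * data source, e.g. before removing their files from deep storage. Payloads are streamed from the
 * metadata storage and interned to keep memory usage low. "connector" and "tablesConfig" are
 * hypothetical placeholders.
 *
 *   List<DataSegment> segments =
 *       DruidStorageHandlerUtils.getDataSegmentList(connector, tablesConfig, "wikipedia");
 *   for (DataSegment segment : segments) {
 *     LOG.info("Segment {} stored at {}", segment.getIdentifier(), segment.getLoadSpec());
 *   }
 */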
/**
 * @param connector SQL metadata connector in use
 *
 * @return fetch size to use when streaming results; Integer.MIN_VALUE for MySQL (the Connector/J
 * signal to stream rows one by one), otherwise a small default batch size.
 */
private static int getStreamingFetchSize(SQLMetadataConnector connector) {
if (connector instanceof MySQLConnector) {
return Integer.MIN_VALUE;
}
return DEFAULT_STREAMING_RESULT_SIZE;
}
/**
 * @param pushedSegment segment that was pushed to deep storage
 * @param segmentsDescriptorDir directory that holds the descriptor files
 *
 * @return path of the descriptor file, with a sanitized name (colons removed from the segment identifier)
 */
public static Path makeSegmentDescriptorOutputPath(DataSegment pushedSegment,
Path segmentsDescriptorDir
) {
return new Path(
segmentsDescriptorDir,
String.format("%s.json", pushedSegment.getIdentifier().replace(":", ""))
);
}
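/*
 * Illustrative note (not part of the original class): Druid segment identifiers contain colons
 * from the ISO-8601 interval timestamps; they are stripped so the descriptor file name is valid on
 * filesystems such as HDFS that reject ':' in path names. "segment" is a hypothetical placeholder.
 *
 *   Path descriptor = DruidStorageHandlerUtils
 *       .makeSegmentDescriptorOutputPath(segment, new Path("/tmp/druid-descriptors"));
 */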
/**
* Simple interface for retry operations
*/
public interface DataPusher {
long push() throws IOException;
}
// Thanks, HBase Storage handler
public static void addDependencyJars(Configuration conf, Class<?>... classes) throws IOException {
FileSystem localFs = FileSystem.getLocal(conf);
Set<String> jars = new HashSet<String>();
jars.addAll(conf.getStringCollection("tmpjars"));
for (Class<?> clazz : classes) {
if (clazz == null) {
continue;
}
String path = Utilities.jarFinderGetJar(clazz);
if (path == null) {
throw new RuntimeException(
"Could not find jar for class " + clazz + " in order to ship it to the cluster.");
}
if (!localFs.exists(new Path(path))) {
throw new RuntimeException("Could not validate jar file " + path + " for class " + clazz);
}
jars.add(path.toString());
}
if (jars.isEmpty()) {
return;
}
conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
}
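/*
 * Illustrative usage (a sketch, not part of the original class): shipping the Druid and Jackson
 * jars to the cluster by adding them to "tmpjars" before the job is submitted, mirroring what the
 * HBase storage handler does. "jobConf" is a hypothetical placeholder.
 *
 *   DruidStorageHandlerUtils.addDependencyJars(jobConf,
 *       DataSegment.class,   // Druid classes
 *       ObjectMapper.class,  // Jackson databind
 *       SmileFactory.class); // Jackson SMILE dataformat
 */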
}