/*-
* Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle NoSQL
* Database made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle NoSQL Database for a copy of the license and
* additional information.
*/
package oracle.kv.hadoop.table;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import oracle.kv.Consistency;
import oracle.kv.Direction;
import oracle.kv.KVSecurityConstants;
import oracle.kv.KVStoreException;
import oracle.kv.ParamConstant;
import oracle.kv.PasswordCredentials;
import oracle.kv.impl.security.PasswordManager;
import oracle.kv.impl.security.PasswordStore;
import oracle.kv.impl.security.login.LoginManager;
import oracle.kv.impl.security.util.KVStoreLogin;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.RepGroupId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.topo.split.SplitBuilder;
import oracle.kv.impl.topo.split.TopoSplit;
import oracle.kv.impl.util.ExternalDataSourceUtils;
import oracle.kv.impl.util.TopologyLocator;
import oracle.kv.impl.util.registry.ClientSocketFactory;
import oracle.kv.impl.util.registry.RegistryUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
/**
* This is the base class for Oracle NoSQL Database InputFormat classes that
* can be used to run MapReduce against data stored via the Table API.
* Keys are of type PrimaryKey. Values are always of type Row.
*
* Parameters may be passed using either the static setters on this class or
* through the Hadoop JobContext configuration parameters. The following
* parameters are recognized:
*
*
 * oracle.kv.kvstore - the KV Store name for this InputFormat
* to operate on. This is equivalent to the {@link #setKVStoreName} method.
*
 * oracle.kv.hosts - one or more hostname:port
* pairs separated by commas naming hosts in the KV Store. This is equivalent
* to the {@link #setKVHelperHosts} method.
*
 * oracle.kv.hadoop.hosts - one or more hostname
* strings separated by commas naming the Hadoop data node hosts in the
* Hadoop cluster that will support MapReduce jobs and/or service Hive
* queries. This is equivalent to the {@link #setKVHadoopHosts} method.
* The value(s) specified by this property will be returned by the
 * getLocations method of TableInputSplit. If this
* property is not specified, or if the {@link #setKVHadoopHosts} method is
* not called, then the values specified via the oracle.kv.hosts
* property (or the {@link #setKVHelperHosts} method) will be used instead.
*
 * oracle.kv.tableName - the name of the table in the
* store from which data will be retrieved. This is equivalent
* to the {@link #setTableName} method.
*
 * oracle.kv.primaryKey - Property whose value consists of
* the components to use when constructing the key to employ when iterating
* the table. The format of this property's value must be a list of name:value
* pairs in JSON FORMAT like the following:
*
* -Doracle.kv.primaryKey="{\"name\":\"stringVal\",\"name\":floatVal}"
*
* where the list itself is enclosed in un-escaped double quotes and
* corresponding curly brace; and each field name component -- as well
* as each STRING type field value component -- is enclosed in ESCAPED
* double quotes.
*
* In addition to the JSON format requirement above, the values referenced by
* the various fieldValue components of this Property must satisfy the
* semantics of PrimaryKey for the given table; that is, they must represent
* a first-to-last subset of the table's primary key fields, and they must be
* specified in the same order as those primary key fields. If the components
* of this property do not satisfy these requirements, a full primary key
* wildcard will be used when iterating the table.
*
* This is equivalent to the {@link #setPrimaryKeyProperty} method.
*
*
 * oracle.kv.fieldRange - Property whose value consists of
* the components to use when constructing the field range to employ when
* iterating the table. The format of this property's value must be a list
* of name:value pairs in JSON FORMAT like the following:
*
* -Doracle.kv.fieldRange="{\"name\":\"fieldName\",
* \"start\":\"startVal\",[\"startInclusive\":true|false],
* \"end\"\"endVal\",[\"endInclusive\":true|false]}"
*
* where for the given field over which to range, the 'start', and 'end'
* components are required, and the 'startInclusive' and 'endInclusive'
* components are optional; defaulting to 'true' if not included. Note
* that the list itself is enclosed in un-escaped double quotes and
* corresponding curly brace; and each name component and string type
* value component is enclosed in ESCAPED double quotes.
*
* In addition to the JSON format requirement above, the values referenced
* by the components of this Property's value must also satisfy the semantics
* of FieldRange; that is,
*
* - the values associated with the target key must correspond to a
* valid primary key in the table
*
 * - the value associated with the fieldName must be the name of a valid
 * field of the primary key over which iteration will be performed
 *
 * - the values associated with the start and end of the range must
 * correspond to valid values of the given fieldName
 *
 * - the value associated with either of the inclusive components
 * must be either 'true' or 'false'
*
 * If the components of this property do not satisfy these requirements, then
 * table iteration will be performed over the full range of primary key
 * values rather than a sub-range.
*
* This is equivalent to the {@link #setFieldRangeProperty} method.
*
*
 * oracle.kv.consistency - Specifies the read consistency
* associated with the lookup of the child KV pairs. Version- and Time-based
* consistency may not be used. If null, the default consistency is used.
*
* This is equivalent to the {@link #setConsistency} method.
*
*
 * oracle.kv.timeout - Specifies an upper bound on the time
* interval for processing a particular KV retrieval. A best effort is made to
* not exceed the specified limit. If zero, the default request timeout is
* used. This value is always in milliseconds.
*
* This is equivalent to the {@link #setTimeout} and {@link #setTimeoutUnit}
* methods.
*
*
 * oracle.kv.maxRequests - Specifies the maximum number of
* client side threads to use when running an iteration; where a value of 1
* causes the iteration to be performed using only the current thread, and a
* value of 0 causes the client to base the number of threads to employ on
* the current store topology.
*
* This is equivalent to the {@link #setMaxRequests} method.
*
*
 * oracle.kv.batchSize - Specifies the suggested number of
* keys to fetch during each network round trip by the InputFormat. If 0, an
* internally determined default is used. This is equivalent to the {@link
* #setBatchSize} method.
*
 * oracle.kv.maxBatches - Specifies the maximum number of
* result batches that can be held in memory on the client side before
* processing on the server side pauses. This parameter can be used to prevent
* the client side memory from being exceeded if the client cannot consume
* results as fast as they are generated by the server side.
*
* This is equivalent to the {@link #setMaxBatches} method.
*
*
*
*
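 * As an illustration only (the store name, helper host, table name, and
 * job name below are hypothetical), a MapReduce job driver might supply
 * these parameters through the static setters and then register the
 * public TableInputFormat subclass with the Hadoop job:
 *
 *   Configuration conf = new Configuration();
 *   Job job = Job.getInstance(conf, "rowCount");
 *   TableInputFormat.setKVStoreName("kvstore");
 *   TableInputFormat.setKVHelperHosts(new String[] {"kvhost01:5000"});
 *   TableInputFormat.setTableName("exampleTable");
 *   job.setInputFormatClass(TableInputFormat.class);
 *
 * The same values could instead be supplied on the command line; for
 * example, -Doracle.kv.kvstore=kvstore -Doracle.kv.hosts=kvhost01:5000
 * -Doracle.kv.tableName=exampleTable.
 *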
* Internally, the TableInputFormatBase class utilizes the method
*
* oracle.kv.table.TableIterator<oracle.kv.table.Row>
* TableAPI.tableIterator
*
* to retrieve records. You should refer to the javadoc for that method
* for information about the various parameters.
*
*
 * TableInputFormatBase dynamically generates a number of
 * splits, each encapsulating a list of sets in which the elements of each
 * set are partition ids whose records can be retrieved in parallel, in
 * order to optimize retrieval performance. The "size" of each split that
 * is generated -- which will be the value returned by the getLength
 * method of TableInputSplit -- is the size of that
 * encapsulated list of partition id sets. If the consistency passed to
 * TableInputFormatBase is {@link Consistency#NONE_REQUIRED
* NONE_REQUIRED} (the default), then {@link InputSplit#getLocations
* InputSplit.getLocations()} will return an array of the names of the
* master and the replica(s) which contain the partition.
* Alternatively, if the consistency is {@link
* Consistency#NONE_REQUIRED_NO_MASTER NONE_REQUIRED_NO_MASTER}, then
* the array returned will contain only the names of the replica(s);
* not the master. Finally, if the consistency is {@link
* Consistency#ABSOLUTE ABSOLUTE}, then the array returned will
* contain only the name of the master. This means that if Hadoop job
* trackers are running on the nodes named in the returned
 * location array, Hadoop will generally attempt to run
* the subtasks for a particular partition on those nodes where the
* data is stored and replicated. Hadoop and Oracle NoSQL DB
* administrators should be careful about co-location of Oracle NoSQL
* DB and Hadoop processes since they may compete for resources.
*
*
* {@link InputSplit#getLength InputSplit.getLength()} always returns 1.
*
*
 * A simple example demonstrating use of the Oracle NoSQL DB Hadoop
 * oracle.kv.hadoop.table.TableInputFormat class to read data from
 * Oracle NoSQL Database in a MapReduce job and count the number of rows
 * in a given table in the store can be found in the
 * KVHOME/examples/hadoop/table directory. The javadoc
* for that program describes the simple MapReduce processing as well as
* how to invoke the program in Hadoop.
*
* @since 3.1
*/
abstract class TableInputFormatBase extends InputFormat {
private static final Log LOG = LogFactory.getLog(
"oracle.kv.hadoop.table.TableInputFormatBase");
private static final String FILE_SEP =
System.getProperty("file.separator");
private static final String USER_SECURITY_DIR =
System.getProperty("user.dir") + FILE_SEP +
"TABLE_INPUT_FORMAT_SECURITY_DIR";
/*
* Static fields are used to support the MapReduce programming model;
 * where the MapReduce job is initialized with this class, and the
* static setter methods of this class are used to initialize the
* fields below for use in the job.
*/
private static String kvStoreName = null;
private static String[] kvHelperHosts = null;
private static String[] kvHadoopHosts = null;
private static String tableName = null;
private static String primaryKeyProperty = null;
/* For MultiRowOptions */
private static String fieldRangeProperty = null;
/* For TableIteratorOptions */
private static Direction direction = Direction.UNORDERED;
private static Consistency consistency = null;
private static long timeout = 0;
private static TimeUnit timeoutUnit = TimeUnit.MILLISECONDS;
private static int maxRequests = 0;
private static int batchSize = 0;
private static int maxBatches = 0;
/* Used by getSplits to initialize the splits that are created. */
private static String loginFlnm = null;
private static PasswordCredentials passwordCredentials = null;
private static String trustFlnm = null;
/* Used by getSplits to contact secure store locally. */
private static String localLoginFile = null;
private static int queryBy =
TableInputSplit.QUERY_BY_PRIMARY_ALL_PARTITIONS;
private static String whereClause = null;
private static Integer shardKeyPartitionId = null;
/**
* @hidden
*/
protected TableInputFormatBase() { }
/**
* @hidden
* Logically split the set of input data for the job.
*
* @param context job configuration.
*
* @return an array of {@link InputSplit}s for the job.
*/
@Override
public List<InputSplit> getSplits(JobContext context)
throws IOException, InterruptedException {
if (context != null) {
final Configuration conf = context.getConfiguration();
initializeParameters(conf);
}
if (kvStoreName == null) {
throw new IllegalArgumentException
("No KV Store Name provided. Use either the " +
ParamConstant.KVSTORE_NAME.getName() +
" parameter or call " + TableInputFormatBase.class.getName() +
".setKVStoreName().");
}
if (kvHelperHosts == null) {
throw new IllegalArgumentException
("No KV Helper Hosts were provided. Use either the " +
ParamConstant.KVSTORE_NODES.getName() +
" parameter or call " + TableInputFormatBase.class.getName() +
".setKVHelperHosts().");
}
if (kvHadoopHosts == null) {
kvHadoopHosts = new String[kvHelperHosts.length];
for (int i = 0; i < kvHelperHosts.length; i++) {
/* Strip off the ':port' suffix */
final String[] hostPort = (kvHelperHosts[i]).trim().split(":");
kvHadoopHosts[i] = hostPort[0];
}
}
if (tableName == null) {
throw new IllegalArgumentException
("No Table Name provided. Use either the " +
ParamConstant.TABLE_NAME.getName() +
" parameter or call " + TableInputFormatBase.class.getName() +
".setTableName().");
}
final String userName = (passwordCredentials == null ? null :
passwordCredentials.getUsername());
final KVStoreLogin storeLogin = new KVStoreLogin(
userName, localLoginFile);
storeLogin.loadSecurityProperties();
storeLogin.prepareRegistryCSF();
LoginManager loginMgr = null;
if (storeLogin.foundTransportSettings()) {
loginMgr = KVStoreLogin.getRepNodeLoginMgr(
kvHelperHosts, passwordCredentials, kvStoreName);
}
/*
* Retrieve the topology of the store.
*
* Note that if the same Hive CLI session is used to run queries that
* must connect to different KVStores where one store is non-secure
* and the other is secure, then if the most recent call to this method
* invoked the code below to retrieve the topology from the secure
* store, then the security information is stored in the system
* properties and the state of the splits, and the client socket
* factory used when communicating with the RMI registry while
* retrieving the topology is configured for SSL communication. As
* a result, if the current call to this method invokes the code below
* to retrieve the topology of the non-secure store, and if the client
* socket factory is not reconfigured for non-SSL communication, then
* a KVServerException (wrapping a java.rmi.ConnectIOException) will
* be encountered. To address this, KVStoreException is caught, the
* client socket factory is reconfigured for non-SSL communication,
* and the attempt to retrieve the topology is retried with no
* security information.
*
* If both secure and non-secure attempts fail, then the stack trace
* is sent to both the DataNode's stderr log file and the Hive CLI
* display screen.
*/
Topology topology;
try {
topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr,
kvStoreName);
} catch (KVStoreException e) {
if (passwordCredentials != null) {
/* Retry with no security */
LOG.debug(
"Failure on topology retrieval: attempt to " +
"communicate with RMI registry over SSL unsuccessful. " +
"Changing from SSLClientSocketFactory to " +
"ClientSocketFactory and retrying ...");
ClientSocketFactory.setRMIPolicy(null, kvStoreName);
RegistryUtils.initRegistryCSF();
try {
topology = TopologyLocator.get(kvHelperHosts, 0, null,
kvStoreName);
} catch (KVStoreException e1) {
e1.printStackTrace(); /* Send to DataNode's stderr file. */
throw new IOException(e1); /* Send to Hive CLI. */
}
} else {
e.printStackTrace(); /* Send to DataNode's stderr file. */
throw new IOException(e); /* Send to Hive CLI. */
}
}
/* Create splits based on the store's partitions or its shards. */
final List<TopoSplitWrapper> splits =
getSplitInfo(topology, consistency, queryBy, shardKeyPartitionId);
final List<InputSplit> ret = new ArrayList<InputSplit>(splits.size());
for (TopoSplitWrapper ts : splits) {
final TableInputSplit split = new TableInputSplit();
split.setKVStoreName(kvStoreName);
split.setKVHelperHosts(kvHelperHosts);
split.setLocations(kvHadoopHosts);
split.setTableName(tableName);
split.setKVStoreSecurity(
loginFlnm, passwordCredentials, trustFlnm);
split.setPrimaryKeyProperty(primaryKeyProperty);
/* For MultiRowOptions */
split.setFieldRangeProperty(fieldRangeProperty);
/* For TableIteratorOptions */
split.setDirection(direction);
split.setConsistency(consistency);
split.setTimeout(timeout);
split.setTimeoutUnit(timeoutUnit);
split.setMaxRequests(maxRequests);
split.setBatchSize(batchSize);
split.setMaxBatches(maxBatches);
split.setPartitionSets(ts.getPartitionSets());
split.setQueryInfo(queryBy, whereClause);
split.setShardSet(ts.getShardSet());
ret.add(split);
}
return ret;
}
/**
* Set the KV Store name for this InputFormat to operate on. This is
 * equivalent to passing the oracle.kv.kvstore Hadoop
 * property.
*
* @param newStoreName the new KV Store name to set
*/
public static void setKVStoreName(String newStoreName) {
TableInputFormatBase.kvStoreName = newStoreName;
}
/**
* Set the KV Helper host:port pair(s) for this InputFormat to operate on.
 * This is equivalent to passing the oracle.kv.hosts Hadoop
 * property.
*
* @param newHelperHosts array of hostname:port strings of any hosts
* in the KV Store.
*/
public static void setKVHelperHosts(String[] newHelperHosts) {
TableInputFormatBase.kvHelperHosts = newHelperHosts;
}
/**
* Set the KV Hadoop data node host name(s) for this InputFormat
* to operate on. This is equivalent to passing the
 * oracle.kv.hadoop.hosts property.
*
* @param newHadoopHosts array of hostname strings corresponding to the
* names of the Hadoop data node hosts in the Hadoop cluster that this
* InputFormat will use to support MapReduce jobs and/or service Hive
* queries.
*/
public static void setKVHadoopHosts(String[] newHadoopHosts) {
TableInputFormatBase.kvHadoopHosts = newHadoopHosts;
}
/**
* Set the name of the table in the KV store that this InputFormat
* will operate on. This is equivalent to passing the
 * oracle.kv.tableName property.
*
* @param newTableName the new table name to set.
*/
public static void setTableName(String newTableName) {
TableInputFormatBase.tableName = newTableName;
}
/**
* Sets the String to use for the property value whose contents are used
* to construct the primary key to employ when iterating the table. The
* format of the String input to this method must be a comma-separated
* String of the form:
*
* fieldName,fieldValue,fieldType,fieldName,fieldValue,fieldType,..
*
* where the number of elements separated by commas must be a multiple
* of 3, and each fieldType must be 'STRING', 'INTEGER', 'LONG', 'FLOAT',
* 'DOUBLE', or 'BOOLEAN'. Additionally, the values referenced by the
* various fieldType and fieldValue components of this String must
* satisfy the semantics of PrimaryKey for the given table; that is,
* they must represent a first-to-last subset of the table's primary
* key fields, and they must be specified in the same order as those
* primary key fields. If the String referenced by this property
* does not satisfy these requirements, a full primary key wildcard
* will be used when iterating the table.
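 *
 * For example (the field names and values here are hypothetical), for a
 * table whose primary key consists of a STRING field named lastName
 * followed by an INTEGER field named id, a valid first-to-last subset of
 * that key could be specified as:
 *
 *   setPrimaryKeyProperty("lastName,Smith,STRING,id,5,INTEGER");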
*
* This is equivalent to passing the oracle.kv.primaryKey
* Hadoop property.
*
 * @param newProperty the new primary key property to set
*/
public static void setPrimaryKeyProperty(String newProperty) {
TableInputFormatBase.primaryKeyProperty = newProperty;
}
/* Methods related to MultiRowOptions */
/**
* Sets the String to use for the property value whose contents are used
* to construct the field range to employ when iterating the table. The
* format of this property's value must be a list of name:value pairs in
* JSON FORMAT like the following:
*
* -Doracle.kv.fieldRange="{\"name\":\"fieldName\",
* \"start\":\"startVal\",[\"startInclusive\":true|false],
* \"end\"\"endVal\",[\"endInclusive\":true|false]}"
*
* where for the given field over which to range, the 'start', and 'end'
* components are required, and the 'startInclusive' and 'endInclusive'
* components are optional; defaulting to 'true' if not included. Note
* that the list itself is enclosed in un-escaped double quotes and
* corresponding curly brace; and each name component and string type
* value component is enclosed in ESCAPED double quotes.
*
* In addition to the JSON format requirement above, the values referenced
* by the components of this Property's value must also satisfy the
* semantics of FieldRange; that is,
*
* - the values associated with the target key must correspond to a
* valid primary key in the table
*
 * - the value associated with the fieldName must be the name of a
 * valid field of the primary key over which iteration will be
 * performed
 *
 * - the values associated with the start and end of the range must
 * correspond to valid values of the given fieldName
 *
 * - the value associated with either of the inclusive components
 * must be either 'true' or 'false'
*
 * If the components of this property do not satisfy these requirements,
 * then table iteration will be performed over the full range of primary
 * key values rather than a sub-range.
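 *
 * As an illustration only (the field name and range bounds are
 * hypothetical), a range over a STRING primary key field named lastName
 * could be supplied from Java code as the following string literal, in
 * which the embedded double quotes are escaped just as in the -D form
 * shown above:
 *
 *   setFieldRangeProperty(
 *       "{\"name\":\"lastName\",\"start\":\"A\",\"end\":\"M\"," +
 *       "\"endInclusive\":false}");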
*
* This is equivalent to passing the oracle.kv.fieldRange
* Hadoop property.
*
* @param newProperty the new field range property to set
*/
public static void setFieldRangeProperty(String newProperty) {
TableInputFormatBase.fieldRangeProperty = newProperty;
}
/* Methods related to TableIteratorOptions */
/**
* Specifies the order in which records are returned by the InputFormat.
* Note that when doing PrimaryKey iteration, only Direction.UNORDERED
* is allowed.
*
* @param newDirection the direction to retrieve data
*/
public static void setDirection(Direction newDirection) {
TableInputFormatBase.direction = newDirection;
}
/**
* Specifies the read consistency associated with the lookup of the child
* KV pairs. Version- and Time-based consistency may not be used. If
* null, the default consistency is used. This is equivalent to passing
 * the oracle.kv.consistency Hadoop property.
*
* @param consistency the consistency
*/
@SuppressWarnings("deprecation")
public static void setConsistency(Consistency consistency) {
if (consistency == Consistency.ABSOLUTE ||
consistency == Consistency.NONE_REQUIRED_NO_MASTER ||
consistency == Consistency.NONE_REQUIRED ||
consistency == null) {
TableInputFormatBase.consistency = consistency;
} else {
throw new IllegalArgumentException
("Consistency may only be ABSOLUTE, " +
"NONE_REQUIRED_NO_MASTER, or NONE_REQUIRED");
}
}
/**
* Specifies an upper bound on the time interval for processing a
* particular KV retrieval. A best effort is made to not exceed the
* specified limit. If zero, the default request timeout is used. This is
 * equivalent to passing the oracle.kv.timeout Hadoop
 * property.
*
* @param timeout the timeout
*/
public static void setTimeout(long timeout) {
TableInputFormatBase.timeout = timeout;
}
/**
* Specifies the unit of the timeout parameter. It may be null only if
* timeout is zero. This is equivalent to passing the
 * oracle.kv.timeout Hadoop property.
*
* @param timeoutUnit the timeout unit
*/
public static void setTimeoutUnit(TimeUnit timeoutUnit) {
TableInputFormatBase.timeoutUnit = timeoutUnit;
}
/**
* Specifies the maximum number of client side threads to use when running
* an iteration; where a value of 1 causes the iteration to be performed
* using only the current thread, and a value of 0 causes the client to
* base the number of threads to employ on the current store topology.
*
* This is equivalent to passing the oracle.kv.maxRequests
* Hadoop property.
*
 * @param newMaxRequests the maximum number of client side threads to
 * employ when running an iteration.
*/
public static void setMaxRequests(int newMaxRequests) {
TableInputFormatBase.maxRequests = newMaxRequests;
}
/**
* Specifies the suggested number of keys to fetch during each network
* round trip by the InputFormat. If 0, an internally determined default
* is used. This is equivalent to passing the
 * oracle.kv.batchSize Hadoop property.
*
* @param batchSize the suggested number of keys to fetch during each
* network round trip.
*/
public static void setBatchSize(int batchSize) {
TableInputFormatBase.batchSize = batchSize;
}
/**
* Specifies the maximum number of result batches that can be held in
* memory on the client side before processing on the server side
* pauses. This parameter can be used to prevent the client side memory
* from being exceeded if the client cannot consume results as fast as
* they are generated by the server side.
*
* This is equivalent to passing the oracle.kv.maxBatches
* Hadoop property.
*
 * @param newMaxBatches the maximum number of result batches that can be
 * held in client side memory during an iteration.
*/
public static void setMaxBatches(int newMaxBatches) {
TableInputFormatBase.maxBatches = newMaxBatches;
}
/**
* Sets the login properties file and the public trust file (keys
* and/or certificates), as well as the PasswordCredentials
 * for authentication. The value of the loginFile and
 * trustFile parameters must be either a fully qualified
* path referencing a file located on the local file system, or the
* name of a file (no path) whose contents can be retrieved as a
* resource from the current VM's classpath.
*
 * Note that this class provides the getSplits method, which must be
 * able to contact a secure store, and so will need access to local
 * copies of the login properties and trust files. As a result, if the
 * values input for the loginFile and trustFile parameters are simple
 * file names rather
* than fully qualified paths, this method will retrieve the contents
* of each from the classpath and generate private, local copies of
* the associated file for availability to the getSplits
* method.
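 *
 * A minimal sketch (the user name, password, and file names below are
 * hypothetical) of configuring access to a secure store:
 *
 *   PasswordCredentials creds =
 *       new PasswordCredentials("exampleUser", "examplePwd".toCharArray());
 *   setKVSecurity("client.security", creds, "client.trust");
 *
 * Because "client.security" and "client.trust" are simple file names
 * rather than fully qualified paths, their contents must be retrievable
 * from the classpath, and private local copies will be generated for use
 * by the getSplits method, as described above.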
*/
public static void setKVSecurity(
final String loginFile,
final PasswordCredentials userPasswordCredentials,
final String trustFile)
throws IOException {
setLocalKVSecurity(loginFile, userPasswordCredentials, trustFile);
}
private void initializeParameters(Configuration conf) throws IOException {
/*
* Must always reinitialize all fields of this class; because there
* are use cases in which the field values of this class change over
* time. For example, if this class is employed as the InputFormat
* for a Hive session, then each Hive query can/may specify a
* different tableName and/or kvStoreName and/or kvHelperHosts.
* If these values are not reinitialized on each call to this
* method, then because the fields are static, the values from the
* previous query will be incorrectly used for the current query;
* which can result in errors or incorrect results.
*
* On the other hand, for a basic MapReduce job, the values of the
* static fields are set once (via either command line arguments or
 * a system property), when the job is initiated. In that case, the
* system property will not survive the serialization/deserialization
* process and thus will return null below. For the purposes of this
* method then, a null system property is taken to be an indication
* that this class is being employed in a MapReduce job initiated
 * from somewhere other than a hive query (e.g. the command line).
* And it is then assumed that the static fields processed by this
* method must have been set during job initiation and that each
* field's value survived its journey from the job's client side to
* the MapReduce server side.
*/
if (conf != null) {
final String kvStoreNameProp =
conf.get(ParamConstant.KVSTORE_NAME.getName());
if (kvStoreNameProp != null) {
kvStoreName = kvStoreNameProp;
}
final String helperHosts =
conf.get(ParamConstant.KVSTORE_NODES.getName());
if (helperHosts != null) {
kvHelperHosts = helperHosts.trim().split(",");
}
final String hadoopHosts =
conf.get(ParamConstant.KVHADOOP_NODES.getName());
if (hadoopHosts != null) {
kvHadoopHosts = hadoopHosts.trim().split(",");
} else {
if (kvHelperHosts != null) {
kvHadoopHosts = new String[kvHelperHosts.length];
for (int i = 0; i < kvHelperHosts.length; i++) {
/* Strip off the ':port' suffix */
final String[] hostPort =
(kvHelperHosts[i]).trim().split(":");
kvHadoopHosts[i] = hostPort[0];
}
}
}
final String tableNameProp =
conf.get(ParamConstant.TABLE_NAME.getName());
if (tableNameProp != null) {
tableName = tableNameProp;
}
final String primaryKeyProp =
conf.get(ParamConstant.PRIMARY_KEY.getName());
if (primaryKeyProp != null) {
primaryKeyProperty = primaryKeyProp;
}
/* For MultiRowOptions. */
final String fieldRangeProp =
conf.get(ParamConstant.FIELD_RANGE.getName());
if (fieldRangeProp != null) {
fieldRangeProperty = fieldRangeProp;
}
/*
* For TableIteratorOptions. Note that when doing PrimaryKey
* iteration, Direction must be UNORDERED.
*/
final String consistencyStr =
conf.get(ParamConstant.CONSISTENCY.getName());
if (consistencyStr != null) {
consistency = ExternalDataSourceUtils.
parseConsistency(consistencyStr);
}
final String timeoutParamName = ParamConstant.TIMEOUT.getName();
final String timeoutStr = conf.get(timeoutParamName);
if (timeoutStr != null) {
timeout = ExternalDataSourceUtils.parseTimeout(timeoutStr);
timeoutUnit = TimeUnit.MILLISECONDS;
}
final String maxRequestsStr =
conf.get(ParamConstant.MAX_REQUESTS.getName());
if (maxRequestsStr != null) {
try {
maxRequests = Integer.parseInt(maxRequestsStr);
} catch (NumberFormatException NFE) {
throw new IllegalArgumentException
("Invalid value for " +
ParamConstant.MAX_REQUESTS.getName() + ": " +
maxRequestsStr);
}
}
final String batchSizeStr =
conf.get(ParamConstant.BATCH_SIZE.getName());
if (batchSizeStr != null) {
try {
batchSize = Integer.parseInt(batchSizeStr);
} catch (NumberFormatException NFE) {
throw new IllegalArgumentException
("Invalid value for " +
ParamConstant.BATCH_SIZE.getName() + ": " +
batchSizeStr);
}
}
final String maxBatchesStr =
conf.get(ParamConstant.MAX_BATCHES.getName());
if (maxBatchesStr != null) {
try {
maxBatches = Integer.parseInt(maxBatchesStr);
} catch (NumberFormatException NFE) {
throw new IllegalArgumentException
("Invalid value for " +
ParamConstant.MAX_BATCHES.getName() + ": " +
maxBatchesStr);
}
}
/* Handle the properties related to security. */
final String loginFile =
conf.get(KVSecurityConstants.SECURITY_FILE_PROPERTY);
final String trustFile =
conf.get(KVSecurityConstants.SSL_TRUSTSTORE_FILE_PROPERTY);
final String username =
conf.get(KVSecurityConstants.AUTH_USERNAME_PROPERTY);
final String passwordStr =
conf.get(ParamConstant.AUTH_USER_PWD_PROPERTY.getName());
/* Create the PasswordCredentials needed to contact the store. */
PasswordCredentials passwordCreds = null;
if (username != null && passwordStr != null) {
final char[] userPassword = passwordStr.toCharArray();
passwordCreds =
new PasswordCredentials(username, userPassword);
}
if (passwordCreds == null) {
String passwordLoc = conf.get(
KVSecurityConstants.AUTH_WALLET_PROPERTY);
PasswordManager storeMgr = null;
if (passwordLoc != null) {
/* Retrieve the password from the given wallet. */
final File walletDirFd = new File(passwordLoc);
try {
storeMgr = PasswordManager.load(
PasswordManager.WALLET_MANAGER_CLASS);
} catch (Exception e) {
e.printStackTrace(); /* Send to DataNode stderr file */
throw new IOException(e); /* Send to Hive CLI. */
}
final PasswordStore fileStore =
storeMgr.getStoreHandle(walletDirFd);
fileStore.open(null);
final Collection<String> secretAliases =
fileStore.getSecretAliases();
final Iterator<String> aliasItr = secretAliases.iterator();
final char[] userPassword = (aliasItr.hasNext() ?
fileStore.getSecret(aliasItr.next()) : null);
if (username != null) {
passwordCreds =
new PasswordCredentials(username, userPassword);
}
fileStore.discard();
} else {
passwordLoc = conf.get(
KVSecurityConstants.AUTH_PWDFILE_PROPERTY);
if (passwordLoc != null) {
/* Retrieve password from the given password file. */
final File passwordFileFd = new File(passwordLoc);
try {
storeMgr = PasswordManager.load(
PasswordManager.FILE_STORE_MANAGER_CLASS);
} catch (Exception e) {
e.printStackTrace(); /* Send to DataNode stderr. */
throw new IOException(e); /* Send to Hive CLI. */
}
final PasswordStore fileStore =
storeMgr.getStoreHandle(passwordFileFd);
fileStore.open(null);
final Collection<String> secretAliases =
fileStore.getSecretAliases();
final Iterator<String> aliasItr =
secretAliases.iterator();
final char[] userPassword = (aliasItr.hasNext() ?
fileStore.getSecret(aliasItr.next()) : null);
if (username != null) {
passwordCreds = new PasswordCredentials(
username, userPassword);
}
fileStore.discard();
}
}
}
setLocalKVSecurity(loginFile, passwordCreds, trustFile);
}
}
/**
* Set/create the artifacts required to connect to and interact
* with a secure store; specifically, a login properties file,
* a trust file containing public keys and/or certificates, and
 * PasswordCredentials. If the value input for the
 * loginFile and trustFile parameter
* is a fully-qualified path, then this method initializes the
* corresponding static variables to those values so that the
 * getSplits method can contact a secure store, extracts
* the filenames from those paths, uses those values to initialize
* the corresponding static filename variables used to initialize
* the splits that are created, and returns.
*
 * If the value input for the loginFile and
 * trustFile parameter is not a fully-qualified path,
* then this method uses the given file names to retrieve the contents
* of the associated login file and trust file as resources from
* the classpath, and writes that information to corresponding files
* on the local file system (in a directory owned by the user under
* which the application is executed). After generating the local
* files, the fully-qualified paths to those files are used to
* initialize the corresponding static variables so that the
 * getSplits method can contact a secure store.
*/
private static void setLocalKVSecurity(
final String loginFile,
final PasswordCredentials userPasswordCredentials,
final String trustFile)
throws IOException {
if (loginFile == null) {
return;
}
if (userPasswordCredentials == null) {
return;
}
if (trustFile == null) {
return;
}
final File loginFd = new File(loginFile);
boolean loginIsAbsolute = false;
if (loginFd.isAbsolute()) {
loginIsAbsolute = true;
TableInputFormatBase.localLoginFile = loginFile;
TableInputFormatBase.loginFlnm = loginFd.getName();
} else {
TableInputFormatBase.loginFlnm = loginFile;
}
TableInputFormatBase.passwordCredentials = userPasswordCredentials;
final File trustFd = new File(trustFile);
boolean trustIsAbsolute = false;
if (trustFd.isAbsolute()) {
trustIsAbsolute = true;
TableInputFormatBase.trustFlnm = trustFd.getName();
} else {
TableInputFormatBase.trustFlnm = trustFile;
}
if (loginIsAbsolute && trustIsAbsolute) {
return;
}
/*
* If loginFile and/or trustFile is a filename and not an absolute
* path, then generate local versions of the file.
*/
final File userSecurityDirFd = new File(USER_SECURITY_DIR);
if (!userSecurityDirFd.exists()) {
if (!userSecurityDirFd.mkdirs()) {
throw new IOException("failed to create " + userSecurityDirFd);
}
}
final ClassLoader cl = TableInputFormatBase.class.getClassLoader();
if (!loginIsAbsolute) {
InputStream loginStream = null;
if (cl != null) {
loginStream = cl.getResourceAsStream(loginFlnm);
} else {
loginStream = ClassLoader.getSystemResourceAsStream(loginFlnm);
}
/*
* Retrieve the login configuration as a resource from the
* classpath, and write that information to the user's local
* file system. But exclude any properties related to user
* authorization; that is, exclude all properties of the form
* 'oracle.kv.auth.*", to prevent those property values from
* being sent and cached on the DataNodes.
*/
final Properties loginProps = new Properties();
if (loginStream != null) {
loginProps.load(loginStream);
}
/* Exclude 'oracle.kv.auth.*" properties. */
loginProps.remove(KVSecurityConstants.AUTH_USERNAME_PROPERTY);
loginProps.remove(KVSecurityConstants.AUTH_WALLET_PROPERTY);
loginProps.remove(KVSecurityConstants.AUTH_PWDFILE_PROPERTY);
/* Strip off the path of the trust file. */
final String trustProp =
loginProps.getProperty(
KVSecurityConstants.SSL_TRUSTSTORE_FILE_PROPERTY);
if (trustProp != null) {
final File trustPropFd = new File(trustProp);
if (!trustPropFd.exists()) {
loginProps.setProperty(
KVSecurityConstants.SSL_TRUSTSTORE_FILE_PROPERTY,
trustPropFd.getName());
}
}
final File absoluteLoginFd =
new File(USER_SECURITY_DIR + FILE_SEP + loginFlnm);
final FileOutputStream loginFos =
new FileOutputStream(absoluteLoginFd);
try {
loginProps.store(loginFos, null);
} finally {
loginFos.close();
}
TableInputFormatBase.localLoginFile = absoluteLoginFd.toString();
}
if (!trustIsAbsolute) {
InputStream trustStream = null;
if (cl != null) {
trustStream = cl.getResourceAsStream(trustFlnm);
} else {
trustStream = ClassLoader.getSystemResourceAsStream(trustFlnm);
}
/*
* Retrieve the trust credentials as a resource from the classpath,
* and write that information to the user's local file system.
*/
final File absoluteTrustFd =
new File(USER_SECURITY_DIR + FILE_SEP + trustFlnm);
final FileOutputStream trustFlnmFos =
new FileOutputStream(absoluteTrustFd);
try {
int nextByte = trustStream.read();
while (nextByte != -1) {
trustFlnmFos.write(nextByte);
nextByte = trustStream.read();
}
} finally {
trustFlnmFos.close();
}
}
}
public void setQueryInfo(final int newQueryBy,
final String newWhereClause,
final Integer newPartitionId) {
queryBy = newQueryBy;
whereClause = newWhereClause;
shardKeyPartitionId = newPartitionId;
}
/**
* Convenience method that returns a list whose elements encapsulate
* the information needed to create the necessary splits; based on
* whether a TableScan (partition based) or an IndexScan (shard based)
* will be used to satisfy the read request (query).
*/
private List<TopoSplitWrapper> getSplitInfo(
final Topology topology,
final Consistency readConsistency,
final int whereQueryBy,
final Integer singlePartitionId) {
final List<TopoSplitWrapper> retList =
new ArrayList<TopoSplitWrapper>();
if (topology == null) {
return retList;
}
/* Determine how the splits should be generated. */
boolean buildSplits = false;
boolean singleSplit = false;
int singleId = 1;
switch (whereQueryBy) {
case TableInputSplit.QUERY_BY_PRIMARY_ALL_PARTITIONS:
case TableInputSplit.QUERY_BY_ONQL_ALL_PARTITIONS:
buildSplits = true;
break;
case TableInputSplit.QUERY_BY_PRIMARY_SINGLE_PARTITION:
case TableInputSplit.QUERY_BY_ONQL_SINGLE_PARTITION:
if (singlePartitionId == null) {
buildSplits = true;
} else {
singleSplit = true;
singleId = singlePartitionId.intValue();
}
break;
default:
/* Skip partition based splits for shard based splits. */
break;
}
/*
* If a table scan (ALL_PARTITIONS) will be used when pushing the
* predicate, then the SplitBuilder is first used to compute disjoint
* subsets of the store's partitions, wrapped in a TopoSplit class.
* Then a TopoSplitWrapper is used to map each of those partition
* subsets to the shards corresponding to the partitions of the
* subset; so that one split per partition subset is created.
*/
if (buildSplits) {
final SplitBuilder sb = new SplitBuilder(topology);
final List<TopoSplit> topoSplits =
sb.createShardSplits(readConsistency);
for (TopoSplit topoSplit : topoSplits) {
final Set<RepGroupId> shardSet = new HashSet<RepGroupId>();
final List<Set<Integer>> partitionSets =
topoSplit.getPartitionSets();
for (Set<Integer> partitionIds : partitionSets) {
for (Integer pId : partitionIds) {
final PartitionId partitionId = new PartitionId(pId);
final RepGroupId repGroupId =
topology.getRepGroupId(partitionId);
shardSet.add(repGroupId);
}
}
retList.add(new TopoSplitWrapper(topoSplit, shardSet));
}
return retList;
}
/* For executing the query against a SINGLE_PARTITION. */
if (singleSplit) {
final Set<Integer> partitionSet = new HashSet<Integer>();
partitionSet.add(singlePartitionId);
final Set<RepGroupId> shardSet = new HashSet<RepGroupId>();
shardSet.add(topology.getRepGroupId(new PartitionId(singleId)));
retList.add(
new TopoSplitWrapper(
new TopoSplit(0, partitionSet), shardSet));
return retList;
}
/*
* Either QUERY_BY_PRIMARY_ALL_SHARDS or QUERY_BY_ONQL_ALL_SHARDS
* remains. For either case, since an index is involved, simply
* return the store's shards; so that one split per shard is
* created.
*/
final Set<RepGroupId> shardIds = topology.getRepGroupIds();
for (RepGroupId shardId : shardIds) {
final Set<RepGroupId> shardSet = new HashSet<RepGroupId>();
shardSet.add(shardId);
retList.add(new TopoSplitWrapper(null, shardSet));
}
return retList;
}
/**
* Convenience class that wraps a TopoSplit containing a subset of the
* store's partitions and a set containing one of the store's shards.
*/
private static class TopoSplitWrapper {
private final TopoSplit topoSplit;
private final Set<RepGroupId> shardSet;
TopoSplitWrapper(TopoSplit topoSplit, Set<RepGroupId> shardSet) {
this.topoSplit = topoSplit;
this.shardSet = shardSet;
}
List<Set<Integer>> getPartitionSets() {
if (topoSplit != null) {
return topoSplit.getPartitionSets();
}
/* Avoid NPE in write method during split serialization. */
return Collections.emptyList();
}
Set<RepGroupId> getShardSet() {
if (shardSet != null) {
return shardSet;
}
/* Avoid NPE in write method during split serialization. */
return Collections.emptySet();
}
}
}