All downloads are free. Search and download functionality uses the official Maven repository.

oracle.kv.hadoop.KVInputFormatBase Maven / Gradle / Ivy

Go to download

NoSQL Database Server - supplies build and runtime support for the server (store) side of the Oracle NoSQL Database.

The newest version!
/*-
 * Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle NoSQL
 * Database made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle NoSQL Database for a copy of the license and
 * additional information.
 */

package oracle.kv.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import oracle.kv.Consistency;
import oracle.kv.Depth;
import oracle.kv.Direction;
import oracle.kv.KVStoreException;
import oracle.kv.Key;
import oracle.kv.KeyRange;
import oracle.kv.KeyValueVersion;
import oracle.kv.ParamConstant;
import oracle.kv.impl.security.login.LoginManager;
import oracle.kv.impl.security.util.KVStoreLogin;
import oracle.kv.impl.topo.StorageNode;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.topo.split.SplitBuilder;
import oracle.kv.impl.topo.split.TopoSplit;
import oracle.kv.impl.util.TopologyLocator;
import oracle.kv.impl.util.ExternalDataSourceUtils;
import oracle.kv.impl.util.registry.RegistryUtils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

/**
 * This is the base class for Oracle NoSQL Database InputFormat classes.
 * Keys and Value types are determined by the specific subclass.
 * 

* * Parameters may be passed using either the static setters on this class or * through the Hadoop JobContext configuration parameters. The following * parameters are recognized: *

    * *
  • oracle.kv.kvstore - the KV Store name for this InputFormat * to operate on. This is equivalent to the {@link #setKVStoreName} method. * *
  • oracle.kv.hosts - one or more hostname:port * pairs separated by commas naming hosts in the KV Store. This is equivalent * to the {@link #setKVHelperHosts} method. * *
  • oracle.kv.batchSize - Specifies the suggested number of * keys to fetch during each network round trip by the InputFormat. If 0, an * internally determined default is used. This is equivalent to the {@link * #setBatchSize} method. * *
  • oracle.kv.parentKey - Specifies the parent key whose * "child" KV pairs are to be returned by the InputFormat. null will result * in fetching all keys in the store. If non-null, the major key path must be a * partial path and the minor key path must be empty. This is equivalent to the * {@link #setParentKey} method. * *
  • oracle.kv.subRange - Specifies a sub range to further * restrict the range under the parentKey to the major path components in this * sub range. It may be null. This is equivalent to the {@link #setSubRange} * method. * *
  • oracle.kv.depth - Specifies whether the parent and only * children or all descendents are returned. If null, * Depth.PARENT_AND_DESCENDENTS is implied. This is equivalent to the * {@link #setDepth} method. * *
  • oracle.kv.consistency - Specifies the read consistency * associated with the lookup of the child KV pairs. Version- and Time-based * consistency may not be used. If null, the default consistency is used. * This is equivalent to the {@link #setConsistency} method. * *
  • oracle.kv.timeout - Specifies an upper bound on the time * interval for processing a particular KV retrieval. A best effort is made to * not exceed the specified limit. If zero, the default request timeout is * used. This value is always in milliseconds. This is equivalent to the * {@link #setTimeout} and {@link #setTimeoutUnit} methods. * *
  • oracle.kv.formatterClass - Specifies the name of a class * that implements {@link AvroFormatter} to (optionally) format {@link * KeyValueVersion} instances into Avro IndexedRecords. This is only meaningful * when {@link oracle.kv.hadoop.KVAvroInputFormat} is used. *

    * One case where specifying a value for this parameter is useful is when you * are using Oracle Loader for Hadoop (OLH) to read Avro records from Oracle * NoSQL Database. Since the Avro records (the NoSQL Database record values) * are passed directly to OLH, the NoSQL Database keys are not available for * mapping into the target Oracle Database table. However, the formatter class * is passed both the NoSQL Database key and value so a new Avro record * containing both the value and key can be created and returned to be passed * to OLH. *

    * This is equivalent to the {@link #setFormatterClassName} method. * *

* *

* Internally, the KVInputFormatBase class utilizes {@link * oracle.kv.KVStore#storeIterator(Direction, int, Key, KeyRange, * Depth, Consistency, long, TimeUnit) KVStore.storeIterator} to * retrieve records. You should refer to the javadoc for that method * for information about the various parameters. *

* * KVInputFormatBase creates one split per Oracle NoSQL * DB partition. The location value for each split is an * array of hosts holding the partition. If the consistency passed to * KVInputFormatBase is {@link Consistency#NONE_REQUIRED * NONE_REQUIRED} (the default), then {@link InputSplit#getLocations * InputSplit.getLocations()} will return an array of the names of the * master and the replica(s) which contain the partition. * Alternatively, if the consistency is {@link * Consistency#NONE_REQUIRED_NO_MASTER NONE_REQUIRED_NO_MASTER}, then * the array returned will contain only the names of the replica(s); * not the master. Finally, if the consistency is {@link * Consistency#ABSOLUTE ABSOLUTE}, then the array returned will * contain only the name of the master. This means that if Hadoop job * trackers are running on the nodes named in the returned * location array, Hadoop will generally attempt to run * the subtasks for a particular partition on those nodes where the * data is stored and replicated. Hadoop and Oracle NoSQL DB * administrators should be careful about co-location of Oracle NoSQL * DB and Hadoop processes since they may compete for resources. * *

* Partitions in Oracle NoSQL DB are considered to be roughly equal in size; * therefore {@link InputSplit#getLength InputSplit.getLength()} always returns * 1. *

* * A simple example demonstrating the Oracle NoSQL DB Hadoop * oracle.kv.hadoop.InputFormat class reading data from Hadoop in a Map/Reduce * job and counting the number of records for each major key in the store can * be found in the KVHOME/examples/hadoop directory. The javadoc * for that program describes the simple Map/Reduce processing as well as how * to invoke the program in Hadoop. * * @since 2.0 */ @SuppressWarnings("javadoc") public abstract class KVInputFormatBase extends InputFormat { private static String kvStoreName = null; private static String[] kvHelperHosts = null; private static Direction direction = Direction.UNORDERED; private static int batchSize = 0; private static Key parentKey = null; private static KeyRange subRange = null; private static Depth depth = Depth.PARENT_AND_DESCENDANTS; private static Consistency consistency = null; private static long timeout = 0; private static TimeUnit timeoutUnit = null; private static String formatterClassName = null; private static String kvStoreSecurityFile = null; /** * @hidden */ protected KVInputFormatBase() { } /** * @hidden * Logically split the set of input data for the job. * * @param context job configuration. * * @return an array of {@link InputSplit}s for the job. */ @Override public List getSplits(JobContext context) throws IOException, InterruptedException { if (context != null) { final Configuration conf = context.getConfiguration(); initializeParameters(conf); } if (kvStoreName == null) { throw new IllegalArgumentException ("No KV Store Name provided. Use either the " + ParamConstant.KVSTORE_NAME.getName() + " parameter or call " + KVInputFormatBase.class.getName() + ".setKVStoreName()."); } if (kvHelperHosts == null) { throw new IllegalArgumentException ("No KV Helper Hosts were provided. 
Use either the " + ParamConstant.KVSTORE_NODES.getName() + " parameter or call " + KVInputFormatBase.class.getName() + ".setKVHelperHosts()."); } final KVStoreLogin storeLogin = new KVStoreLogin(null, kvStoreSecurityFile); storeLogin.loadSecurityProperties(); storeLogin.prepareRegistryCSF(); LoginManager loginMgr = null; if (storeLogin.foundTransportSettings()) { loginMgr = KVStoreLogin.getRepNodeLoginMgr( kvHelperHosts, storeLogin.getLoginCredentials(), kvStoreName); } Topology topology = null; try { topology = TopologyLocator.get(kvHelperHosts, 0, loginMgr, kvStoreName); } catch (KVStoreException KVSE) { KVSE.printStackTrace(); return null; } /* Create a set of splits based on shards and consistency */ final SplitBuilder sb = new SplitBuilder(topology); final List splits = sb.createShardSplits(consistency); final List ret = new ArrayList(splits.size()); final RegistryUtils regUtils = new RegistryUtils(topology, loginMgr); for (TopoSplit ts : splits) { if (ts.isEmpty()) { /* Split is empty, skip */ continue; } final List repNodeNames = new ArrayList(); final List repNodeNamesAndPorts = new ArrayList(); for (StorageNode sn : ts.getSns(consistency, topology, regUtils)) { repNodeNames.add(sn.getHostname()); repNodeNamesAndPorts.add(sn.getHostname() + ":" + sn.getRegistryPort()); } ret.add(new KVInputSplit(). setKVHelperHosts (repNodeNamesAndPorts.toArray(new String[0])). setKVStoreName(kvStoreName). setKVStoreSecurityFile(storeLogin.getSecurityFilePath()). setLocations(repNodeNames.toArray(new String[0])). setDirection(direction). setBatchSize(batchSize). setParentKey(parentKey). setSubRange(subRange). setDepth(depth). setConsistency(consistency). setTimeout(timeout). setTimeoutUnit(timeoutUnit). setFormatterClassName(formatterClassName). setPartitionSets(ts.getPartitionSets())); } return ret; } /** * Set the KV Store name for this InputFormat to operate on. This is * equivalent to passing the oracle.kv.kvstore Hadoop * property. 
* * @param kvStoreName the KV Store name */ public static void setKVStoreName(String kvStoreName) { KVInputFormatBase.kvStoreName = kvStoreName; } /** * Set the KV Helper host:port pair(s) for this InputFormat to operate on. * This is equivalent to passing the oracle.kv.hosts Hadoop * property. * * @param kvHelperHosts array of hostname:port strings of any hosts in the * KV Store. */ public static void setKVHelperHosts(String[] kvHelperHosts) { KVInputFormatBase.kvHelperHosts = kvHelperHosts; } /** * Specifies the order in which records are returned by the InputFormat. * Only Direction.UNORDERED is allowed. * * @param direction the direction to retrieve data * @hidden */ @Deprecated public static void setDirection(Direction direction) { if (!direction.equals(Direction.UNORDERED)) { throw new IllegalArgumentException("Direction " + direction + " is not supported"); } KVInputFormatBase.direction = direction; } /** * Specifies the suggested number of keys to fetch during each network * round trip by the InputFormat. If 0, an internally determined default * is used. This is equivalent to passing the * oracle.kv.batchSize Hadoop property. * * @param batchSize the suggested number of keys to fetch during each * network round trip. */ public static void setBatchSize(int batchSize) { KVInputFormatBase.batchSize = batchSize; } /** * Specifies the parent key whose "child" KV pairs are to be returned by * the InputFormat. null will result in fetching all keys in the store. * If non-null, the major key path must be a partial path and the minor key * path must be empty. This is equivalent to passing the * oracle.kv.parentKey Hadoop property. * * @param parentKey the parentKey */ public static void setParentKey(Key parentKey) { KVInputFormatBase.parentKey = parentKey; } /** * Specifies a sub range to further restrict the range under the parentKey * to the major path components in this sub range. It may be null. 
This * is equivalent to passing the oracle.kv.subRange Hadoop * property. * * @param subRange the sub range. */ public static void setSubRange(KeyRange subRange) { KVInputFormatBase.subRange = subRange; } /** * Specifies whether the parent and only children or all descendents are * returned. If null, Depth.PARENT_AND_DESCENDENTS is implied. * This is equivalent to passing the oracle.kv.depth Hadoop * property. * * @param depth the depth. */ public static void setDepth(Depth depth) { KVInputFormatBase.depth = depth; } /** * Specifies the read consistency associated with the lookup of the child * KV pairs. Version- and Time-based consistency may not be used. If * null, the default consistency is used. This is equivalent to passing * the oracle.kv.consistency Hadoop property. * * @param consistency the consistency */ @SuppressWarnings("deprecation") public static void setConsistency(Consistency consistency) { if (consistency == Consistency.ABSOLUTE || consistency == Consistency.NONE_REQUIRED_NO_MASTER || consistency == Consistency.NONE_REQUIRED || consistency == null) { KVInputFormatBase.consistency = consistency; } else { throw new IllegalArgumentException ("Consistency may only be ABSOLUTE, " + "NONE_REQUIRED_NO_MASTER, or NONE_REQUIRED"); } } /** * Specifies an upper bound on the time interval for processing a * particular KV retrieval. A best effort is made to not exceed the * specified limit. If zero, the default request timeout is used. This is * equivalent to passing the oracle.kv.timeout Hadoop * property. * * @param timeout the timeout */ public static void setTimeout(long timeout) { KVInputFormatBase.timeout = timeout; } /** * Specifies the unit of the timeout parameter. It may be null only if * timeout is zero. This is equivalent to passing the * oracle.kv.timeout Hadoop property. 
* * @param timeoutUnit the timeout unit */ public static void setTimeoutUnit(TimeUnit timeoutUnit) { KVInputFormatBase.timeoutUnit = timeoutUnit; } /** * Specifies the name of a class that implements {@link AvroFormatter} * to (optionally) format {@link KeyValueVersion} instances into * Avro IndexedRecords. * * @param formatterClassName the name of the class implementing * AvroFormatter. */ public static void setFormatterClassName(String formatterClassName) { KVInputFormatBase.formatterClassName = formatterClassName; } /** * Allows KVStore security to be set. The kvStoreSecurity file is a property * file utilizing the format supported by the CLI tools. This security file * and any wallet or password store needed to support it must be * distributed on the hadoop cluster. * * @since 3.0 */ public static void setKVSecurity(String kvStoreSecurity) { KVInputFormatBase.kvStoreSecurityFile = kvStoreSecurity; } private void initializeParameters(Configuration conf) { if (conf != null) { if (kvStoreName == null) { kvStoreName = conf.get(ParamConstant.KVSTORE_NAME.getName()); } if (kvHelperHosts == null) { final String helperHosts = conf.get(ParamConstant.KVSTORE_NODES.getName()); if (helperHosts != null) { kvHelperHosts = helperHosts.trim().split(","); } } final String batchSizeStr = conf.get(ParamConstant.BATCH_SIZE.getName()); if (batchSizeStr != null) { try { batchSize = Integer.parseInt(batchSizeStr); } catch (NumberFormatException NFE) { throw new IllegalArgumentException ("Invalid value for " + ParamConstant.BATCH_SIZE.getName() + ": " + batchSizeStr); } } final String parentKeyStr = conf.get(ParamConstant.PARENT_KEY.getName()); if (parentKeyStr != null) { parentKey = Key.fromString(parentKeyStr); } final String subRangeStr = conf.get(ParamConstant.SUB_RANGE.getName()); if (subRangeStr != null) { subRange = KeyRange.fromString(subRangeStr); } final String depthStr = conf.get(ParamConstant.DEPTH.getName()); if (depthStr != null) { depth = Depth.valueOf(depthStr); } 
final String consistencyStr = conf.get(ParamConstant.CONSISTENCY.getName()); if (consistencyStr != null) { consistency = ExternalDataSourceUtils. parseConsistency(consistencyStr); } final String timeoutParamName = ParamConstant.TIMEOUT.getName(); final String timeoutStr = conf.get(timeoutParamName); if (timeoutStr != null) { timeout = ExternalDataSourceUtils.parseTimeout(timeoutStr); timeoutUnit = TimeUnit.MILLISECONDS; } final String formatterClassNameStr = conf.get(ParamConstant.FORMATTER_CLASS.getName()); if (formatterClassNameStr != null) { formatterClassName = formatterClassNameStr; } final String kvStoreSecurityStr = conf.get(ParamConstant.KVSTORE_SECURITY.getName()); if (kvStoreSecurityStr != null && kvStoreSecurityFile == null) { kvStoreSecurityFile = kvStoreSecurityStr; } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy