org.apache.cassandra.hadoop.pig.AbstractCassandraStorage Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.hadoop.pig;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.net.URLDecoder;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.*;
import com.google.common.collect.Iterables;
import org.apache.cassandra.db.Cell;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.exceptions.SyntaxException;
import org.apache.cassandra.auth.IAuthenticator;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.db.marshal.AbstractCompositeType.CompositeComponent;
import org.apache.cassandra.serializers.CollectionSerializer;
import org.apache.cassandra.hadoop.*;
import org.apache.cassandra.thrift.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Hex;
import org.apache.cassandra.utils.UUIDGen;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.*;
import org.apache.pig.*;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.*;
import org.apache.pig.impl.util.UDFContext;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A LoadStoreFunc for retrieving data from and storing data to Cassandra
*/
public abstract class AbstractCassandraStorage extends LoadFunc implements StoreFuncInterface, LoadMetadata
{
/** Keys of the map built by getDefaultMarshallers(): one entry per kind of column family marshaller. */
protected enum MarshallerType { COMPARATOR, DEFAULT_VALIDATOR, KEY_VALIDATOR, SUBCOMPARATOR };
// system environment variables that can be set to configure connection info:
// alternatively, Hadoop JobConf variables can be set using keys from ConfigHelper
// (each constant's value is the name of the environment variable itself)
// direction-specific settings: setConnectionInformation() applies these after the
// shared PIG_RPC_PORT / PIG_INITIAL_ADDRESS / PIG_PARTITIONER values, so they win
public final static String PIG_INPUT_RPC_PORT = "PIG_INPUT_RPC_PORT";
public final static String PIG_INPUT_INITIAL_ADDRESS = "PIG_INPUT_INITIAL_ADDRESS";
public final static String PIG_INPUT_PARTITIONER = "PIG_INPUT_PARTITIONER";
public final static String PIG_OUTPUT_RPC_PORT = "PIG_OUTPUT_RPC_PORT";
public final static String PIG_OUTPUT_INITIAL_ADDRESS = "PIG_OUTPUT_INITIAL_ADDRESS";
public final static String PIG_OUTPUT_PARTITIONER = "PIG_OUTPUT_PARTITIONER";
// shared settings: setConnectionInformation() applies each to both input and output
public final static String PIG_RPC_PORT = "PIG_RPC_PORT";
public final static String PIG_INITIAL_ADDRESS = "PIG_INITIAL_ADDRESS";
public final static String PIG_PARTITIONER = "PIG_PARTITIONER";
// format class names; expanded by getFullyQualifiedClassName() in setConnectionInformation()
public final static String PIG_INPUT_FORMAT = "PIG_INPUT_FORMAT";
public final static String PIG_OUTPUT_FORMAT = "PIG_OUTPUT_FORMAT";
// NOTE(review): not read in this part of the file; presumably consumed by subclasses (confirm)
public final static String PIG_INPUT_SPLIT_SIZE = "PIG_INPUT_SPLIT_SIZE";
// fallbacks used by setConnectionInformation() when PIG_INPUT_FORMAT / PIG_OUTPUT_FORMAT are unset;
// not assigned in this part of the file, presumably set by subclasses (TODO confirm)
protected String DEFAULT_INPUT_FORMAT;
protected String DEFAULT_OUTPUT_FORMAT;
public final static String PARTITION_FILTER_SIGNATURE = "cassandra.partition.filter";
private static final Logger logger = LoggerFactory.getLogger(AbstractCassandraStorage.class);
protected String username;
protected String password;
protected String keyspace;
protected String column_family;
// UDF context signatures recorded by setUDFContextSignature() / setStoreFuncUDFContextSignature()
protected String loadSignature;
protected String storeSignature;
// configuration object that setConnectionInformation() passes to ConfigHelper
protected Configuration conf;
// class names chosen by setConnectionInformation(); instantiated by getInputFormat() / getOutputFormat()
protected String inputFormatClass;
protected String outputFormatClass;
// NOTE(review): the unit of this default is not visible here; confirm against the input format in use
protected int splitSize = 64 * 1024;
protected String partitionerClass;
protected boolean usePartitionFilter = false;
protected String initHostAddress;
protected String rpcPort;
protected int nativeProtocolVersion = 1;
/** Creates the storage function; does nothing beyond invoking the superclass constructor. */
public AbstractCassandraStorage()
{
super();
}
/** Deconstructs a composite type to a Tuple. */
protected Tuple composeComposite(AbstractCompositeType comparator, ByteBuffer name) throws IOException
{
List result = comparator.deconstruct(name);
Tuple t = TupleFactory.getInstance().newTuple(result.size());
for (int i=0; i validators = getValidatorMap(cfDef);
if (cfInfo.cql3Table && !cfInfo.compactCqlTable)
{
ByteBuffer[] names = ((AbstractCompositeType) parseType(cfDef.comparator_type)).split(colName);
colName = names[names.length-1];
}
if (validators.get(colName) == null)
{
Map marshallers = getDefaultMarshallers(cfDef);
setTupleValue(pair, 1, cassandraToObj(marshallers.get(MarshallerType.DEFAULT_VALIDATOR), col.value()));
}
else
setTupleValue(pair, 1, cassandraToObj(validators.get(colName), col.value()));
return pair;
}
/**
 * Stores {@code value} at index {@code position} of {@code pair}, converting
 * a few Java types first:
 * <ul>
 * <li>{@code BigInteger} becomes an int via {@code intValue()}</li>
 * <li>{@code ByteBuffer} becomes a {@code DataByteArray} of its bytes</li>
 * <li>{@code UUID} becomes a {@code DataByteArray} of its decomposed bytes</li>
 * <li>{@code Date} becomes the long read from its {@code TimestampType} serialization</li>
 * </ul>
 * Every other value (including {@code null}) is stored as is.
 *
 * @param pair     tuple to write into
 * @param position index within the tuple
 * @param value    value to convert and store
 * @throws ExecException if the tuple rejects the write
 */
protected void setTupleValue(Tuple pair, int position, Object value) throws ExecException
{
    Object converted;
    if (value instanceof BigInteger)
    {
        converted = ((BigInteger) value).intValue();
    }
    else if (value instanceof ByteBuffer)
    {
        byte[] bytes = ByteBufferUtil.getArray((ByteBuffer) value);
        converted = new DataByteArray(bytes);
    }
    else if (value instanceof UUID)
    {
        byte[] bytes = UUIDGen.decompose((java.util.UUID) value);
        converted = new DataByteArray(bytes);
    }
    else if (value instanceof Date)
    {
        converted = TimestampType.instance.decompose((Date) value).getLong();
    }
    else
    {
        converted = value;
    }
    pair.set(position, converted);
}
/**
 * Rebuilds the column family information stored in the UDF context under
 * {@code signature}.
 * <p>
 * The stored property is read as: character 0 is the compact-CQL-table flag,
 * character 1 is the CQL3-table flag (each {@code '1'} for true), and the
 * remainder is the serialized column family definition handed to
 * {@code cfdefFromString}.
 *
 * @param signature key of the property in the UDF context
 * @return the decoded column family information
 * @throws IOException if no property is stored for {@code signature}, if it is
 *                     too short to hold the two flag characters, or if
 *                     decoding the definition fails
 */
protected CfInfo getCfInfo(String signature) throws IOException
{
    UDFContext context = UDFContext.getUDFContext();
    Properties property = context.getUDFProperties(AbstractCassandraStorage.class);
    String prop = property.getProperty(signature);
    // getProperty returns null for an unknown key; fail with a clear message
    // instead of a NullPointerException / StringIndexOutOfBoundsException below
    if (prop == null || prop.length() < 2)
        throw new IOException("No column family definition found in the UDF context for signature: " + signature);
    CfInfo cfInfo = new CfInfo();
    cfInfo.cfDef = cfdefFromString(prop.substring(2));
    cfInfo.compactCqlTable = prop.charAt(0) == '1';
    cfInfo.cql3Table = prop.charAt(1) == '1';
    return cfInfo;
}
/**
 * Builds a map from each {@code MarshallerType} to the Cassandra data type
 * parsed from the matching attribute of {@code cfDef}: comparator,
 * subcomparator, default validation class and key validation class.
 *
 * @param cfDef column family definition to read the type names from
 * @return an {@code EnumMap} with one entry per {@code MarshallerType}
 * @throws IOException if {@code parseType} fails for any of the type names
 */
protected Map getDefaultMarshallers(CfDef cfDef) throws IOException
{
    Map byKind = new EnumMap(MarshallerType.class);
    // types are parsed in this order: comparator, subcomparator, default validator, key validator
    byKind.put(MarshallerType.COMPARATOR, parseType(cfDef.getComparator_type()));
    byKind.put(MarshallerType.SUBCOMPARATOR, parseType(cfDef.getSubcomparator_type()));
    byKind.put(MarshallerType.DEFAULT_VALIDATOR, parseType(cfDef.getDefault_validation_class()));
    byKind.put(MarshallerType.KEY_VALIDATOR, parseType(cfDef.getKey_validation_class()));
    return byKind;
}
/**
 * Collects the validator of every column in {@code cfDef} that declares a
 * non-empty validation class, keyed by the column's {@code name}.
 * A validator that parses to {@code CounterColumnType} is replaced by
 * {@code LongType.instance}.
 *
 * @param cfDef column family definition whose column metadata is scanned
 * @return map from column name to its validator
 * @throws IOException wrapping the ConfigurationException or SyntaxException
 *                     raised while parsing a validation class
 */
protected Map getValidatorMap(CfDef cfDef) throws IOException
{
    Map byColumnName = new HashMap();
    for (ColumnDef columnDef : cfDef.getColumn_metadata())
    {
        String validationClass = columnDef.getValidation_class();
        // columns without a declared validation class get no entry
        if (validationClass == null || validationClass.isEmpty())
            continue;

        AbstractType parsed;
        try
        {
            parsed = TypeParser.parse(validationClass);
        }
        catch (ConfigurationException e)
        {
            throw new IOException(e);
        }
        catch (SyntaxException e)
        {
            throw new IOException(e);
        }
        byColumnName.put(columnDef.name, parsed instanceof CounterColumnType ? LongType.instance : parsed);
    }
    return byColumnName;
}
/**
 * Turns a type name into a Cassandra data type using {@code TypeParser}.
 * The fully qualified {@code CounterColumnType} class name is special-cased
 * and yields {@code LongType.instance}.
 *
 * @param type type name to parse; may be {@code null}, in which case it is
 *             passed to {@code TypeParser.parse} unchanged
 * @return the parsed type
 * @throws IOException wrapping the ConfigurationException or SyntaxException
 *                     raised by the parser
 */
protected AbstractType parseType(String type) throws IOException
{
    // always treat counters like longs, specifically CCT.compose is not what we need
    if ("org.apache.cassandra.db.marshal.CounterColumnType".equals(type))
        return LongType.instance;

    try
    {
        return TypeParser.parse(type);
    }
    catch (ConfigurationException e)
    {
        throw new IOException(e);
    }
    catch (SyntaxException e)
    {
        throw new IOException(e);
    }
}
/**
 * Instantiates the class named by {@code inputFormatClass} through
 * {@code FBUtilities.construct}.
 *
 * @return the newly constructed input format
 * @throws IOException wrapping the ConfigurationException raised when the
 *                     class cannot be constructed
 */
@Override
public InputFormat getInputFormat() throws IOException
{
    InputFormat format;
    try
    {
        format = FBUtilities.construct(inputFormatClass, "inputformat");
    }
    catch (ConfigurationException e)
    {
        throw new IOException(e);
    }
    return format;
}
/**
 * Splits a query string of the form {@code key1=value1&key2=value2} into a
 * map from key to URL-decoded (UTF-8) value. Keys are stored as written and
 * are not decoded.
 * <ul>
 * <li>Only the first {@code '='} of a parameter separates key from value, so
 *     a value may itself contain {@code '='}.</li>
 * <li>A parameter without a value ({@code "key"} or {@code "key="}) maps to
 *     the empty string.</li>
 * <li>Empty parameters (an empty query, a leading {@code '&'}, or
 *     {@code "&&"}) are skipped.</li>
 * <li>When a key occurs more than once, the last occurrence wins.</li>
 * </ul>
 *
 * @param query the query string; must not be {@code null}
 * @return a mutable map of the parameters found in {@code query}
 * @throws UnsupportedEncodingException if UTF-8 is not supported
 */
public static Map getQueryMap(String query) throws UnsupportedEncodingException
{
    Map<String, String> map = new HashMap<String, String>();
    for (String param : query.split("&"))
    {
        // nothing to record for an empty segment
        if (param.isEmpty())
            continue;
        // limit 2: split on the first '=' only, and keep a trailing empty
        // value ("key=") that a plain split("=") would silently drop
        String[] keyValue = param.split("=", 2);
        String rawValue = keyValue.length > 1 ? keyValue[1] : "";
        map.put(keyValue[0], URLDecoder.decode(rawValue, "UTF-8"));
    }
    return map;
}
/**
 * Copies connection settings from environment variables into {@code conf}
 * and chooses the input/output format class names.
 * <p>
 * For RPC port, initial address and partitioner, the shared variable
 * ({@code PIG_RPC_PORT}, {@code PIG_INITIAL_ADDRESS}, {@code PIG_PARTITIONER})
 * is applied to both input and output first; the {@code PIG_INPUT_*} and
 * {@code PIG_OUTPUT_*} variables are applied afterwards and therefore
 * override it. Variables that are not set leave {@code conf} untouched.
 * <p>
 * {@code inputFormatClass} / {@code outputFormatClass} are taken from
 * {@code PIG_INPUT_FORMAT} / {@code PIG_OUTPUT_FORMAT} (expanded with
 * {@code getFullyQualifiedClassName}) or, when unset, from
 * {@code DEFAULT_INPUT_FORMAT} / {@code DEFAULT_OUTPUT_FORMAT}.
 *
 * @throws IOException declared by the signature; not thrown directly by this body
 */
protected void setConnectionInformation() throws IOException
{
    // --- RPC port ---
    String sharedPort = System.getenv(PIG_RPC_PORT);
    if (sharedPort != null)
    {
        ConfigHelper.setInputRpcPort(conf, sharedPort);
        ConfigHelper.setOutputRpcPort(conf, sharedPort);
    }
    String inputPort = System.getenv(PIG_INPUT_RPC_PORT);
    if (inputPort != null)
        ConfigHelper.setInputRpcPort(conf, inputPort);
    String outputPort = System.getenv(PIG_OUTPUT_RPC_PORT);
    if (outputPort != null)
        ConfigHelper.setOutputRpcPort(conf, outputPort);

    // --- initial address ---
    String sharedAddress = System.getenv(PIG_INITIAL_ADDRESS);
    if (sharedAddress != null)
    {
        ConfigHelper.setInputInitialAddress(conf, sharedAddress);
        ConfigHelper.setOutputInitialAddress(conf, sharedAddress);
    }
    String inputAddress = System.getenv(PIG_INPUT_INITIAL_ADDRESS);
    if (inputAddress != null)
        ConfigHelper.setInputInitialAddress(conf, inputAddress);
    String outputAddress = System.getenv(PIG_OUTPUT_INITIAL_ADDRESS);
    if (outputAddress != null)
        ConfigHelper.setOutputInitialAddress(conf, outputAddress);

    // --- partitioner ---
    String sharedPartitioner = System.getenv(PIG_PARTITIONER);
    if (sharedPartitioner != null)
    {
        ConfigHelper.setInputPartitioner(conf, sharedPartitioner);
        ConfigHelper.setOutputPartitioner(conf, sharedPartitioner);
    }
    String inputPartitioner = System.getenv(PIG_INPUT_PARTITIONER);
    if (inputPartitioner != null)
        ConfigHelper.setInputPartitioner(conf, inputPartitioner);
    String outputPartitioner = System.getenv(PIG_OUTPUT_PARTITIONER);
    if (outputPartitioner != null)
        ConfigHelper.setOutputPartitioner(conf, outputPartitioner);

    // --- format classes ---
    String inputFormat = System.getenv(PIG_INPUT_FORMAT);
    inputFormatClass = inputFormat != null ? getFullyQualifiedClassName(inputFormat) : DEFAULT_INPUT_FORMAT;
    String outputFormat = System.getenv(PIG_OUTPUT_FORMAT);
    outputFormatClass = outputFormat != null ? getFullyQualifiedClassName(outputFormat) : DEFAULT_OUTPUT_FORMAT;
}
/**
 * Expands a short class name to a fully qualified one.
 *
 * @param classname class name; one that contains a {@code '.'} is treated as
 *                  already qualified
 * @return {@code classname} itself if it contains a dot, otherwise
 *         {@code classname} prefixed with {@code "org.apache.cassandra.hadoop."}
 */
protected String getFullyQualifiedClassName(String classname)
{
    if (classname.indexOf('.') >= 0)
        return classname;
    return "org.apache.cassandra.hadoop." + classname;
}
/**
 * Maps a Cassandra data type to the Pig {@code DataType} code used for it.
 * The checks run top to bottom and the first match wins; any type not
 * listed is reported as {@code DataType.BYTEARRAY}.
 *
 * @param type Cassandra type to classify
 * @return the Pig type code
 */
protected byte getPigType(AbstractType type)
{
    // DateType is bad and it should feel bad
    if (type instanceof LongType || type instanceof DateType || type instanceof TimestampType)
    {
        return DataType.LONG;
    }
    // IntegerType will overflow at 2**31, but is kept for compatibility until pig has a BigInteger
    if (type instanceof IntegerType || type instanceof Int32Type)
    {
        return DataType.INTEGER;
    }
    if (type instanceof AsciiType || type instanceof UTF8Type || type instanceof DecimalType || type instanceof InetAddressType)
    {
        return DataType.CHARARRAY;
    }
    if (type instanceof FloatType)
    {
        return DataType.FLOAT;
    }
    if (type instanceof DoubleType)
    {
        return DataType.DOUBLE;
    }
    if (type instanceof AbstractCompositeType || type instanceof CollectionType)
    {
        return DataType.TUPLE;
    }
    return DataType.BYTEARRAY;
}
/** Always returns {@code null}; {@code location} and {@code job} are not used. */
public ResourceStatistics getStatistics(String location, Job job)
{
return null;
}
/** Returns {@code location} unchanged; {@code curDir} is not used. */
@Override
public String relativeToAbsolutePath(String location, Path curDir) throws IOException
{
return location;
}
/** Records {@code signature} in {@code loadSignature}. */
@Override
public void setUDFContextSignature(String signature)
{
this.loadSignature = signature;
}
// StoreFunc methods
/** Records {@code signature} in {@code storeSignature}. */
public void setStoreFuncUDFContextSignature(String signature)
{
this.storeSignature = signature;
}
/** Delegates to {@code relativeToAbsolutePath(location, curDir)} and returns its result. */
public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException
{
return relativeToAbsolutePath(location, curDir);
}
/**
 * Instantiates the class named by {@code outputFormatClass} through
 * {@code FBUtilities.construct}.
 *
 * @return the newly constructed output format
 * @throws IOException wrapping the ConfigurationException raised when the
 *                     class cannot be constructed
 */
public OutputFormat getOutputFormat() throws IOException
{
    OutputFormat format;
    try
    {
        format = FBUtilities.construct(outputFormatClass, "outputformat");
    }
    catch (ConfigurationException e)
    {
        throw new IOException(e);
    }
    return format;
}
/** Does nothing: {@code schema} is accepted without any validation. */
public void checkSchema(ResourceSchema schema) throws IOException
{
// we don't care about types, they all get casted to ByteBuffers
}
/** Supplies the ByteBuffer that {@code objToBB} returns when it is given a {@code null} object. */
protected abstract ByteBuffer nullToBB();
/** convert object to ByteBuffer */
protected ByteBuffer objToBB(Object o)
{
if (o == null)
return nullToBB();
if (o instanceof java.lang.String)
return ByteBuffer.wrap(new DataByteArray((String)o).get());
if (o instanceof Integer)
return Int32Type.instance.decompose((Integer)o);
if (o instanceof Long)
return LongType.instance.decompose((Long)o);
if (o instanceof Float)
return FloatType.instance.decompose((Float)o);
if (o instanceof Double)
return DoubleType.instance.decompose((Double)o);
if (o instanceof UUID)
return ByteBuffer.wrap(UUIDGen.decompose((UUID) o));
if(o instanceof Tuple) {
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy