// Downloading a project requires significant server resources. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.data;
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.classification.InterfaceAudience;
import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.utils.SedesHelper;
import org.apache.pig.impl.util.ObjectSerializer;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
/**
* A class to handle reading and writing of intermediate results of data types. The serialization format used by this
* class is more efficient than what was used in DataReaderWriter. The format used by the functions in this class is
* subject to change, so it should be used ONLY to store intermediate results within a pig query.
*/
@InterfaceAudience.Private
@InterfaceStability.Stable
public class BinInterSedes implements InterSedes {
// Milliseconds per minute. readDatum multiplies the short read for a DATETIME by this value
// before handing it to DateTimeZone.forOffsetMillis, i.e. the zone offset is stored in minutes.
private static final int ONE_MINUTE = 60000;

// ---------------------------------------------------------------------------------------------
// One-byte type markers. readDatum(DataInput) reads a single byte from the stream and dispatches
// on it; these are BinInterSedes markers, not DataType type codes. The numeric values are part
// of the serialized format, so they must not be changed or reused.
// ---------------------------------------------------------------------------------------------

// Booleans carry their value in the marker itself; no payload is read for them.
public static final byte BOOLEAN_TRUE = 0;
public static final byte BOOLEAN_FALSE = 1;
// A single byte payload (read with readByte).
public static final byte BYTE = 2;
// A full 4-byte int payload (read with readInt).
public static final byte INTEGER = 3;
// since boolean is not supported yet(v0.7) as external type, lot of people use int instead and some data with old
// schema is likely stay for some time. so optimizing for that case as well
// INTEGER_0 / INTEGER_1 decode to the ints 0 and 1 without reading any payload.
public static final byte INTEGER_0 = 4;
public static final byte INTEGER_1 = 5;
// Integer whose payload is read with readShort / readByte respectively.
public static final byte INTEGER_INSHORT = 6;
public static final byte INTEGER_INBYTE = 7;
// A full 8-byte long payload (read with readLong).
public static final byte LONG = 8;
public static final byte FLOAT = 9;
public static final byte DOUBLE = 10;
// Byte-array variants; all three are decoded by SedesHelper.readBytes(in, type).
public static final byte BYTEARRAY = 11;
public static final byte SMALLBYTEARRAY = 12;
public static final byte TINYBYTEARRAY = 13;
// Character-array variants; both are decoded by SedesHelper.readChararray(in, type).
public static final byte CHARARRAY = 14;
public static final byte SMALLCHARARRAY = 15;
// Map variants: readMap reads the entry count with readInt (MAP), readUnsignedShort (SMALLMAP)
// or readUnsignedByte (TINYMAP).
public static final byte MAP = 16;
public static final byte SMALLMAP = 17;
public static final byte TINYMAP = 18;
// Tuple variants: getTupleSize reads the field count with readInt (TUPLE), readUnsignedShort
// (SMALLTUPLE) or readUnsignedByte (TINYTUPLE).
public static final byte TUPLE = 19;
public static final byte SMALLTUPLE = 20;
public static final byte TINYTUPLE = 21;
// Bag variants: readBag reads the tuple count with readLong (BAG), readUnsignedShort (SMALLBAG)
// or readUnsignedByte (TINYBAG).
public static final byte BAG = 22;
public static final byte SMALLBAG = 23;
public static final byte TINYBAG = 24;
// A WritableComparable: readWritable reads the class name as a datum, instantiates the class
// and lets it read its own fields.
public static final byte GENERIC_WRITABLECOMPARABLE = 25;
// An InternalMap: readInternalMap reads an int entry count followed by key/value datums.
public static final byte INTERNALMAP = 26;
// Decodes to a Java null; no payload is read.
public static final byte NULL = 27;
// Schema tuples: readSchemaTuple reads the schema-tuple id with readUnsignedByte
// (SCHEMA_TUPLE_BYTE_INDEX), readUnsignedShort (SCHEMA_TUPLE_SHORT_INDEX) or readInt (SCHEMA_TUPLE).
public static final byte SCHEMA_TUPLE_BYTE_INDEX = 28;
public static final byte SCHEMA_TUPLE_SHORT_INDEX = 29;
public static final byte SCHEMA_TUPLE = 30;
// Long whose payload is read with readByte / readShort / readInt respectively.
public static final byte LONG_INBYTE = 31;
public static final byte LONG_INSHORT = 32;
public static final byte LONG_ININT = 33;
// LONG_0 / LONG_1 decode to the longs 0 and 1 without reading any payload.
public static final byte LONG_0 = 34;
public static final byte LONG_1 = 35;
// Tuples with a fixed field count of 0..9; getTupleSize returns that count without reading a size.
public static final byte TUPLE_0 = 36;
public static final byte TUPLE_1 = 37;
public static final byte TUPLE_2 = 38;
public static final byte TUPLE_3 = 39;
public static final byte TUPLE_4 = 40;
public static final byte TUPLE_5 = 41;
public static final byte TUPLE_6 = 42;
public static final byte TUPLE_7 = 43;
public static final byte TUPLE_8 = 44;
public static final byte TUPLE_9 = 45;
// BigInteger / BigDecimal are written via toString() and read back through the String constructor.
public static final byte BIGINTEGER = 46;
public static final byte BIGDECIMAL = 47;
// DateTime: a long (passed to the DateTime constructor as the instant) followed by a short
// zone offset (see ONE_MINUTE).
public static final byte DATETIME = 48;
// Shared factories. mBagFactory is used by readBag; mTupleFactory is not referenced in the
// portion of the class visible here.
private static TupleFactory mTupleFactory = TupleFactory.getInstance();
private static BagFactory mBagFactory = BagFactory.getInstance();
// Largest values representable in an unsigned short / unsigned byte.
// NOTE(review): presumably the thresholds the write path uses to choose between the TINY/SMALL/
// full-size encodings; the write methods are outside this view, so confirm there.
public static final int UNSIGNED_SHORT_MAX = 65535;
public static final int UNSIGNED_BYTE_MAX = 255;
// Name of the UTF-8 charset.
public static final String UTF8 = "UTF-8";
/**
 * Reads a tuple whose one-byte type marker has already been consumed by the caller.
 *
 * @param in   stream positioned just after the type marker
 * @param type the BinInterSedes tuple marker that was read
 * @return the tuple decoded by {@code SedesHelper.readGenericTuple} for generic tuple markers,
 *         or by {@code readSchemaTuple} for schema-tuple markers
 * @throws IOException   if reading from {@code in} fails
 * @throws ExecException if {@code type} is not a tuple marker
 */
public Tuple readTuple(DataInput in, byte type) throws IOException {
    // TUPLE_0 .. TUPLE_9 are declared with consecutive values, so a range test covers all ten.
    final boolean fixedSizeTuple = type >= TUPLE_0 && type <= TUPLE_9;
    if (fixedSizeTuple || type == TUPLE || type == TINYTUPLE || type == SMALLTUPLE) {
        return SedesHelper.readGenericTuple(in, type);
    }

    final boolean schemaTuple = type == SCHEMA_TUPLE_BYTE_INDEX
            || type == SCHEMA_TUPLE_SHORT_INDEX
            || type == SCHEMA_TUPLE;
    if (schemaTuple) {
        return readSchemaTuple(in, type);
    }

    throw new ExecException("Unknown Tuple type found in stream: " + type);
}
/**
 * Reads a schema tuple: first the schema-tuple id (whose width depends on the marker), then
 * the tuple's own fields.
 *
 * @param in   stream positioned just after the type marker
 * @param type one of SCHEMA_TUPLE_BYTE_INDEX, SCHEMA_TUPLE_SHORT_INDEX or SCHEMA_TUPLE
 * @return a tuple obtained from the SchemaTupleFactory for the id, populated via readFields
 * @throws IOException      if reading from {@code in} fails
 * @throws RuntimeException if {@code type} is not a schema-tuple marker
 */
private Tuple readSchemaTuple(DataInput in, byte type) throws IOException {
    final int schemaId;
    if (type == SCHEMA_TUPLE_BYTE_INDEX) {
        schemaId = in.readUnsignedByte();
    } else if (type == SCHEMA_TUPLE_SHORT_INDEX) {
        schemaId = in.readUnsignedShort();
    } else if (type == SCHEMA_TUPLE) {
        schemaId = in.readInt();
    } else {
        throw new RuntimeException("Invalid type given to readSchemaTuple: " + type);
    }

    // The factory hands back an empty tuple for this id; the tuple deserializes itself.
    Tuple schemaTuple = SchemaTupleFactory.getInstance(schemaId).newTuple();
    schemaTuple.readFields(in);
    return schemaTuple;
}
/**
 * Determines the number of fields of a generic tuple from its type marker, reading a size
 * from the stream when the marker does not already imply it.
 *
 * @param in   stream positioned just after the type marker
 * @param type the BinInterSedes tuple marker that was read
 * @return 0..9 for TUPLE_0..TUPLE_9 (nothing is read); otherwise the size read as an unsigned
 *         byte (TINYTUPLE), unsigned short (SMALLTUPLE) or int (TUPLE)
 * @throws ExecException if {@code type} is not a generic tuple marker (error code 2112)
 * @throws IOException   if reading fails or the size read from the stream is negative
 */
public int getTupleSize(DataInput in, byte type) throws IOException {
    final int sz;
    switch (type) {
    case TUPLE_0:
        return 0;
    case TUPLE_1:
        return 1;
    case TUPLE_2:
        return 2;
    case TUPLE_3:
        return 3;
    case TUPLE_4:
        return 4;
    case TUPLE_5:
        return 5;
    case TUPLE_6:
        return 6;
    case TUPLE_7:
        return 7;
    case TUPLE_8:
        return 8;
    case TUPLE_9:
        return 9;
    case TINYTUPLE:
        sz = in.readUnsignedByte();
        break;
    case SMALLTUPLE:
        sz = in.readUnsignedShort();
        break;
    case TUPLE:
        sz = in.readInt();
        break;
    default: {
        int errCode = 2112;
        // The original concatenation lacked a space and produced "tuplefrom binary file."
        String msg = "Unexpected datatype " + type + " while reading tuple from binary file.";
        throw new ExecException(msg, errCode, PigException.BUG);
    }
    }
    // sz == 0 is legal (an empty tuple). Only the int-sized encoding can yield a negative
    // value, which indicates a corrupt stream.
    if (sz < 0) {
        throw new IOException("Invalid size " + sz + " for a tuple");
    }
    return sz;
}
/**
 * Reads a bag: a tuple count (whose width depends on the marker) followed by that many datums,
 * each of which is cast to a Tuple and added to a new default bag.
 *
 * @param in   stream positioned just after the type marker
 * @param type one of TINYBAG, SMALLBAG or BAG
 * @return the populated bag
 * @throws ExecException if {@code type} is not a bag marker (error code 2219)
 * @throws IOException   if reading from {@code in} fails
 */
private DataBag readBag(DataInput in, byte type) throws IOException {
    DataBag result = mBagFactory.newDefaultBag();

    // Work out how many tuples follow.
    final long tupleCount;
    if (type == TINYBAG) {
        tupleCount = in.readUnsignedByte();
    } else if (type == SMALLBAG) {
        tupleCount = in.readUnsignedShort();
    } else if (type == BAG) {
        tupleCount = in.readLong();
    } else {
        final int errCode = 2219;
        throw new ExecException("Unexpected data while reading bag from binary file.", errCode, PigException.BUG);
    }

    // Exceptions from readDatum propagate unchanged to the caller.
    for (long remaining = tupleCount; remaining > 0; remaining--) {
        Tuple element = (Tuple) readDatum(in);
        result.add(element);
    }
    return result;
}
/**
 * Reads a map: an entry count (whose width depends on the marker) followed by that many
 * key/value pairs. Each key is read as a datum and cast to String; each value is any datum.
 *
 * @param in   stream positioned just after the type marker
 * @param type one of TINYMAP, SMALLMAP or MAP
 * @return a HashMap holding the decoded entries
 * @throws ExecException if {@code type} is not a map marker (error code 2220)
 * @throws IOException   if reading fails or the entry count read from the stream is negative
 */
private Map readMap(DataInput in, byte type) throws IOException {
    final int size;
    switch (type) {
    case TINYMAP:
        size = in.readUnsignedByte();
        break;
    case SMALLMAP:
        size = in.readUnsignedShort();
        break;
    case MAP:
        size = in.readInt();
        break;
    default: {
        int errCode = 2220;
        // The original concatenation lacked a space and produced "mapfrom binary file."
        String msg = "Unexpected data while reading map from binary file.";
        throw new ExecException(msg, errCode, PigException.BUG);
    }
    }
    // Only the int-sized encoding can yield a negative count. Reject it here, as getTupleSize
    // does for tuples, instead of letting HashMap's constructor throw an unchecked
    // IllegalArgumentException for what is really a corrupt stream.
    if (size < 0) {
        throw new IOException("Invalid size " + size + " for a map");
    }
    Map<String, Object> m = new HashMap<String, Object>(size);
    for (int i = 0; i < size; i++) {
        // Key first, then value: this is the order in which they appear in the stream.
        String key = (String) readDatum(in);
        m.put(key, readDatum(in));
    }
    return m;
}
/**
 * Reads an InternalMap: an int entry count followed by that many key/value datum pairs.
 *
 * @param in stream positioned just after the INTERNALMAP marker
 * @return the populated InternalMap
 * @throws IOException if reading from {@code in} fails
 */
private InternalMap readInternalMap(DataInput in) throws IOException {
    final int entryCount = in.readInt();
    final InternalMap result = new InternalMap(entryCount);
    int entriesRead = 0;
    while (entriesRead < entryCount) {
        // Stream order is key, then value.
        Object entryKey = readDatum(in);
        Object entryValue = readDatum(in);
        result.put(entryKey, entryValue);
        entriesRead++;
    }
    return result;
}
/**
 * Reads a generic WritableComparable: the fully qualified class name is read as a datum, the
 * class is loaded and instantiated through its no-argument constructor, and the new instance
 * then reads its own fields from the stream.
 *
 * @param in stream positioned just after the GENERIC_WRITABLECOMPARABLE marker
 * @return the deserialized writable
 * @throws IOException if the class cannot be found or instantiated, or if reading fails
 */
private WritableComparable readWritable(DataInput in) throws IOException {
    String className = (String) readDatum(in);
    // NOTE(review): the class name comes straight from the stream and is loaded reflectively.
    // That is fine for intermediate data produced by Pig itself; confirm this path is never fed
    // data from an untrusted source.
    // create the writable class. It needs to have a default constructor
    Class<?> objClass = null;
    try {
        objClass = Class.forName(className);
    } catch (ClassNotFoundException e) {
        throw new IOException("Could not find class " + className + ", while attempting to de-serialize it ", e);
    }
    WritableComparable writable = null;
    try {
        writable = (WritableComparable) objClass.newInstance();
    } catch (Exception e) {
        // The original message read "Could create instance", inverting its meaning.
        String msg = "Could not create instance of class " + className
                + ", while attempting to de-serialize it. (no default constructor ?)";
        throw new IOException(msg, e);
    }
    // read the fields of the object from DataInput
    writable.readFields(in);
    return writable;
}
/*
 * (non-Javadoc)
 *
 * Reads the one-byte type marker from the stream and delegates to readDatum(DataInput, byte).
 *
 * @see org.apache.pig.data.InterSedes#readDatum(java.io.DataInput)
 */
@Override
public Object readDatum(DataInput in) throws IOException, ExecException {
    // The marker byte is consumed here; the two-argument overload decodes the payload.
    return readDatum(in, in.readByte());
}
/**
 * Reads exactly {@code size} bytes from the stream and wraps them in a DataByteArray.
 *
 * @param in   stream to read from
 * @param size number of bytes to read
 * @return a DataByteArray backed by the freshly read bytes
 * @throws IOException if the stream ends early or reading fails
 */
private static Object readBytes(DataInput in, int size) throws IOException {
    final byte[] buffer = new byte[size];
    in.readFully(buffer);
    DataByteArray wrapped = new DataByteArray(buffer);
    return wrapped;
}
/**
 * Decodes one datum whose type marker has already been read.
 * Expects binInterSedes data types (NOT DataType types!)
 *
 * @param in   stream positioned just after the type marker
 * @param type the BinInterSedes marker that was read
 * @return the decoded value; {@code null} for the NULL marker
 * @throws IOException      if reading from {@code in} fails
 * @throws RuntimeException if {@code type} is not a known marker
 * @see org.apache.pig.data.InterSedes#readDatum(java.io.DataInput, byte)
 */
@Override
public Object readDatum(DataInput in, byte type) throws IOException, ExecException {
    switch (type) {
    // ---- markers that carry no payload ----
    case NULL:
        return null;
    case BOOLEAN_TRUE:
        return Boolean.TRUE;
    case BOOLEAN_FALSE:
        return Boolean.FALSE;
    case INTEGER_0:
        return Integer.valueOf(0);
    case INTEGER_1:
        return Integer.valueOf(1);
    case LONG_0:
        return Long.valueOf(0L);
    case LONG_1:
        return Long.valueOf(1L);

    // ---- fixed-width numeric payloads ----
    case BYTE: {
        byte raw = in.readByte();
        return Byte.valueOf(raw);
    }
    case INTEGER_INBYTE: {
        int widened = in.readByte();
        return Integer.valueOf(widened);
    }
    case INTEGER_INSHORT: {
        int widened = in.readShort();
        return Integer.valueOf(widened);
    }
    case INTEGER: {
        int raw = in.readInt();
        return Integer.valueOf(raw);
    }
    case LONG_INBYTE: {
        long widened = in.readByte();
        return Long.valueOf(widened);
    }
    case LONG_INSHORT: {
        long widened = in.readShort();
        return Long.valueOf(widened);
    }
    case LONG_ININT: {
        long widened = in.readInt();
        return Long.valueOf(widened);
    }
    case LONG: {
        long raw = in.readLong();
        return Long.valueOf(raw);
    }
    case FLOAT: {
        float raw = in.readFloat();
        return Float.valueOf(raw);
    }
    case DOUBLE: {
        double raw = in.readDouble();
        return Double.valueOf(raw);
    }
    case DATETIME: {
        // Stream order: the instant (long) first, then the zone offset (short, in minutes).
        long instant = in.readLong();
        int offsetMinutes = in.readShort();
        DateTimeZone zone = DateTimeZone.forOffsetMillis(offsetMinutes * ONE_MINUTE);
        return new DateTime(instant, zone);
    }

    // ---- values serialized through a nested datum ----
    case BIGINTEGER:
        return readBigInteger(in);
    case BIGDECIMAL:
        return readBigDecimal(in);
    case GENERIC_WRITABLECOMPARABLE:
        return readWritable(in);

    // ---- byte and character arrays ----
    case BYTEARRAY:
    case SMALLBYTEARRAY:
    case TINYBYTEARRAY: {
        byte[] bytes = SedesHelper.readBytes(in, type);
        return new DataByteArray(bytes);
    }
    case CHARARRAY:
    case SMALLCHARARRAY:
        return SedesHelper.readChararray(in, type);

    // ---- complex types ----
    case TUPLE:
    case SMALLTUPLE:
    case TINYTUPLE:
    case TUPLE_0:
    case TUPLE_1:
    case TUPLE_2:
    case TUPLE_3:
    case TUPLE_4:
    case TUPLE_5:
    case TUPLE_6:
    case TUPLE_7:
    case TUPLE_8:
    case TUPLE_9:
        return SedesHelper.readGenericTuple(in, type);
    case SCHEMA_TUPLE:
    case SCHEMA_TUPLE_SHORT_INDEX:
    case SCHEMA_TUPLE_BYTE_INDEX:
        return readSchemaTuple(in, type);
    case BAG:
    case SMALLBAG:
    case TINYBAG:
        return readBag(in, type);
    case MAP:
    case SMALLMAP:
    case TINYMAP:
        return readMap(in, type);
    case INTERNALMAP:
        return readInternalMap(in);

    default:
        throw new RuntimeException("Unexpected data type " + type + " found in stream.");
    }
}
/**
 * Reads a BigDecimal that was serialized as a String datum.
 *
 * @param in stream positioned just after the BIGDECIMAL marker
 * @return the BigDecimal parsed from the string read
 * @throws IOException if reading from {@code in} fails
 */
private Object readBigDecimal(DataInput in) throws IOException {
    final String text = (String) readDatum(in);
    return new BigDecimal(text);
}
/**
 * Reads a BigInteger that was serialized as a String datum.
 *
 * @param in stream positioned just after the BIGINTEGER marker
 * @return the BigInteger parsed from the string read
 * @throws IOException if reading from {@code in} fails
 */
private Object readBigInteger(DataInput in) throws IOException {
    final String text = (String) readDatum(in);
    return new BigInteger(text);
}
/**
 * Writes a BigInteger by serializing its {@code toString()} form as a datum.
 *
 * @param out stream to write to
 * @param bi  value to write
 * @throws IOException if writing to {@code out} fails
 */
private void writeBigInteger(DataOutput out, BigInteger bi) throws IOException {
    final String text = bi.toString();
    writeDatum(out, text);
}
/**
 * Writes a BigDecimal by serializing its {@code toString()} form as a datum.
 *
 * @param out stream to write to
 * @param bd  value to write
 * @throws IOException if writing to {@code out} fails
 */
private void writeBigDecimal(DataOutput out, BigDecimal bd) throws IOException {
    final String text = bd.toString();
    writeDatum(out, text);
}
/*
 * (non-Javadoc)
 *
 * Looks up the DataType code of the value and delegates to writeDatum(DataOutput, Object, byte).
 *
 * @see org.apache.pig.data.InterSedes#writeDatum(java.io.DataOutput, java.lang.Object)
 */
@Override
public void writeDatum(DataOutput out, Object val) throws IOException {
    // Determine the type of the value, then hand off to the typed overload.
    writeDatum(out, val, DataType.findType(val));
}
@Override
@SuppressWarnings("unchecked")
public void writeDatum(DataOutput out, Object val, byte type) throws IOException {
switch (type) {
case DataType.TUPLE:
writeTuple(out, (Tuple) val);
break;
case DataType.BAG:
writeBag(out, (DataBag) val);
break;
case DataType.MAP: {
writeMap(out, (Map) val);
break;
}
case DataType.INTERNALMAP: {
out.writeByte(INTERNALMAP);
Map