com.splout.db.hadoop.NullableTuple Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of splout-hadoop Show documentation
Splout is a read only, horizontally scalable SQL database that plays well with Hadoop.
There is a newer version: 0.3.0
package com.splout.db.hadoop;

/*
 * #%L
 * Splout SQL Hadoop library
 * %%
 * Copyright (C) 2012 Datasalt Systems S.L.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.PrintStream;
import java.nio.ByteBuffer;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.io.Tuple;

/**
 * A wrapper over a Pangool (http://pangool.net) Tuple for being able to serialize null values. Because Pangool defines
 * an intermediate Hadoop serialization, it doesn't support serializing nulls. We add a Tuple on top of Pangool's Tuple
 * that supports serializing null values and only handles primitive types (not OBJECT or ENUM). It can't be used as an
 * intermediate Tuple in Pangool but it can otherwise be serialized as a Tuple Field inside another Tuple, or persisted
 * using a normal TupleRecordWriter and a corresponding {@link NullableSchema}. NullableTuple also adds deep-copy
 * semantics to the Pangool Tuple by restricting its usage to primitive types. Deep-copy can be enabled by a flag in
 * appropriate methods.
 * 
 * A NullableTuple adds one more Field to the wrapped Tuple (see {@link NullableSchema}). This new field indicates which
 * fields are null. The fields that are null contain a non-null value (0 for Integers, for instance) so that they can be
 * serialized. The inherited get() method returns those values so that Pangool can serialize the Tuple; for a
 * null-aware getter, one must use {@link #getNullable(int)} or {@link #getNullable(String)} instead.
 */
@SuppressWarnings("serial")
public class NullableTuple extends Tuple {

	/**
	 * Number of fields of the wrapped schema. Because the null-flags field is appended last, this is also the position
	 * of that field inside this Tuple.
	 */
	private final int nFields;

	/**
	 * Wraps the given Tuple without deep-copying its values.
	 * 
	 * @param tuple
	 *          the Tuple to wrap; its Schema is used as the wrapped Schema
	 */
	public NullableTuple(ITuple tuple) {
		this(tuple, false);
	}

	/**
	 * Wraps the given Tuple, optionally deep-copying its values.
	 * 
	 * @param tuple
	 *          the Tuple to wrap; its Schema is used as the wrapped Schema
	 * @param deepCopy
	 *          whether values must be deep-copied, see {@link #setWrappedTuple(ITuple, boolean)}
	 */
	public NullableTuple(ITuple tuple, boolean deepCopy) {
		this(tuple.getSchema());
		// Forward the flag: it used to be dropped here, so deep-copy was never applied through this constructor.
		setWrappedTuple(tuple, deepCopy);
	}

	/**
	 * For instance reusing, instantiate the Tuple once with the wrapped Schema and use {@link #setWrappedTuple(ITuple)}
	 * afterwards.
	 */
	public NullableTuple(Schema schema) {
		super(new NullableSchema(schema));
		nFields = getSchema().getFields().size() - 1;
		initNulls();
	}

	/**
	 * Replaces the contents of this Tuple with the values of the given one, without deep-copying them.
	 * 
	 * @param wrappedTuple
	 *          the Tuple whose values are copied into this one
	 */
	public void setWrappedTuple(ITuple wrappedTuple) {
		setWrappedTuple(wrappedTuple, false);
	}

	/**
	 * Clears this Tuple and then copies every field of the given Tuple into it, by field name. Null values are recorded
	 * in the null flags (see {@link #set(int, Object)}).
	 * 
	 * @param wrappedTuple
	 *          the Tuple whose values are copied into this one
	 * @param deepCopy
	 *          if true, STRING values are converted with toString() before being stored
	 */
	public void setWrappedTuple(ITuple wrappedTuple, boolean deepCopy) {
		clear();
		for(Field field : wrappedTuple.getSchema().getFields()) {
			String fieldName = field.getName();
			Object value = wrappedTuple.get(fieldName);
			if(deepCopy && value != null && field.getType().equals(Type.STRING)) {
				// for deep-copying primitive types we have to take
				// into account that Strings are often wrapped into UTF8 objects in Pangool.
				// Other than that, it is safe to shallow-copy primitive types as a deep copy.
				// Usually you don't want to deep-copy unless you are keeping Tuples in an in-memory array.
				// A null String is left as null so that set() flags it instead of failing on toString().
				value = value.toString();
			}
			set(fieldName, value);
		}
	}

	/**
	 * Sets every wrapped field to null and resets all the null flags to 0. The flags array is zeroed in place when it
	 * exists and is created again when the flags field holds no value.
	 */
	@Override
	public void clear() {
		for(int i = 0; i < nFields; i++) {
			super.set(i, null);
		}
		if(super.get(nFields) == null) {
			// getNulls() throws when there is no flags value, so the missing case must be detected before calling it.
			initNulls();
			return;
		}
		byte[] nulls = getNulls();
		for(int i = 0; i < nulls.length; i++) {
			nulls[i] = 0;
		}
	}

	/**
	 * Null-aware getter by position.
	 * 
	 * @param pos
	 *          the position of the field
	 * @return null if the field is flagged as null, otherwise the stored value
	 */
	public Object getNullable(int pos) {
		if(isNull(pos)) {
			return null;
		}
		return super.get(pos);
	}

	/**
	 * Null-aware getter by field name.
	 * 
	 * @param field
	 *          the name of the field, resolved to a position through the Schema
	 * @return null if the field is flagged as null, otherwise the stored value
	 */
	public Object getNullable(String field) {
		int pos = getSchema().getFieldPos(field);
		return getNullable(pos);
	}

	/**
	 * Sets the value at the given position and keeps the null flags in sync. A non-null value clears the flag of the
	 * field. A null value sets the flag and stores a type-dependent placeholder instead of null. Setting the position
	 * of the null-flags field itself stores the given value as the new flags without touching any flag.
	 * 
	 * @param pos
	 *          the position of the field
	 * @param object
	 *          the value to store, null allowed for every type except ENUM and OBJECT
	 * @throws IllegalArgumentException
	 *           if the null-flags field is set to null
	 * @throws RuntimeException
	 *           if a null is set into an ENUM or OBJECT field
	 */
	@Override
	public void set(int pos, Object object) {
		if(pos == nFields) { // _nulls
			if(object == null) {
				// Reject it before storing anything: a null here would make every later flag access fail.
				throw new IllegalArgumentException("The field that contains null info can't be set to null");
			}
			super.set(nFields, object);
			// The flags field has no flag of its own, so it must not go through the data field logic below.
			return;
		}
		if(object != null) {
			setNoNull(pos);
			super.set(pos, object);
			return;
		}
		// Resolve the placeholder first: unsupported types throw here, before the flag is modified.
		Object placeholder = placeholderFor(getSchema().getField(pos).getType());
		setNull(pos);
		super.set(pos, placeholder);
	}

	/**
	 * Returns the value that is stored in place of a null for the given type.
	 */
	private static Object placeholderFor(Type type) {
		switch(type) {
		case INT:
			return 0;
		case BOOLEAN:
			return false;
		case BYTES:
			return new byte[0];
		case DOUBLE:
			return 0d;
		case FLOAT:
			return 0f;
		case LONG:
			return 0L;
		case STRING:
			return "";
		case ENUM:
			throw new RuntimeException("Unsupported operation: setting a null Enum");
		case OBJECT:
			throw new RuntimeException("Unsupported operation: setting a null Object");
		default:
			// Types not listed above get no placeholder, so null itself is stored.
			return null;
		}
	}

	/**
	 * Sets the value of the field with the given name, see {@link #set(int, Object)}.
	 * 
	 * @param field
	 *          the name of the field, resolved to a position through the Schema
	 * @param object
	 *          the value to store
	 */
	@Override
	public void set(String field, Object object) {
		int pos = getSchema().getFieldPos(field);
		set(pos, object);
	}

	// --- Helper methods for dealing with the byte[] bit set --- //

	/**
	 * Stores a new flags array with every flag set to 0. The array has (nFields / 8) + 1 bytes, one bit per field.
	 */
	public void initNulls() {
		// A freshly allocated byte array is already filled with zeros.
		super.set(nFields, new byte[(nFields / 8) + 1]);
	}

	/**
	 * Returns the null flags as a byte array. If the flags are currently held as a ByteBuffer, they are copied into a
	 * new byte array that replaces the ByteBuffer inside this Tuple.
	 * 
	 * @return the byte array that holds the null flags
	 * @throws RuntimeException
	 *           if the flags field is neither a byte[] nor a ByteBuffer
	 */
	public byte[] getNulls() {
		Object b = super.get(nFields);
		if(b instanceof byte[]) {
			return (byte[]) b;
		} else if(b instanceof ByteBuffer) {
			byte[] nullBytes = new byte[(nFields / 8) + 1];
			// Read through a duplicate so that the position of the original buffer is left untouched. The destination
			// offset must be 0: it indexes the target array, not the buffer.
			((ByteBuffer) b).duplicate().get(nullBytes, 0, nullBytes.length);
			// Replace deserialized ByteBuffer by the native byte array
			// This eases the possible modifications on the byte array even though it is less efficient than keeping the
			// bytebuffer
			super.set(nFields, nullBytes);
			return nullBytes;
		} else {
			throw new RuntimeException("Field that contains null info is not byte[] neither ByteBuffer!");
		}
	}

	/**
	 * Tells whether the flag of a field is set in the given flags array.
	 * 
	 * @param pos
	 *          the position of the field
	 * @param nulls
	 *          the array that contains the flags
	 * @param offset
	 *          the index inside the array where the flags start
	 * @return true if the bit (pos % 8) of the byte at offset + (pos / 8) is set
	 */
	public static boolean isNull(int pos, byte[] nulls, int offset) {
		return (nulls[offset + (pos / 8)] & (1 << (pos % 8))) > 0;
	}

	/**
	 * Tells whether the field at the given position is flagged as null.
	 * 
	 * @param pos
	 *          the position of the field
	 * @return true if the flag of the field is set
	 */
	public boolean isNull(int pos) {
		return isNull(pos, getNulls(), 0);
	}

	/**
	 * Sets the null flag of the field at the given position. The stored value is not modified.
	 * 
	 * @param pos
	 *          the position of the field
	 */
	public void setNull(int pos) {
		byte[] nulls = getNulls();
		nulls[pos / 8] |= (1 << (pos % 8));
	}

	/**
	 * Clears the null flag of the field at the given position. The stored value is not modified.
	 * 
	 * @param pos
	 *          the position of the field
	 */
	public void setNoNull(int pos) {
		byte[] nulls = getNulls();
		nulls[pos / 8] &= ~(1 << (pos % 8));
	}

	/**
	 * Prints the null flags, one line per byte, see {@link #printUnsignedByte(byte, PrintStream)}.
	 * 
	 * @param printStream
	 *          the stream to print to
	 */
	public void printNulls(PrintStream printStream) {
		byte[] nulls = getNulls();
		for(byte b : nulls) {
			printUnsignedByte(b, printStream);
		}
	}

	/**
	 * Prints the 8 bits of a byte as "0" and "1" characters, lowest bit first, followed by a line break.
	 * 
	 * @param b
	 *          the byte to print
	 * @param printStream
	 *          the stream to print to
	 */
	public static void printUnsignedByte(byte b, PrintStream printStream) {
		for(int i = 0; i < 8; i++) {
			printStream.print(((b & (1 << i)) > 0) ? "1" : "0");
		}
		printStream.println();
	}
}