// org.apache.pig.builtin.Utf8StorageConverter Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.builtin;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.LoadStoreCaster;
import org.apache.pig.PigWarning;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultBagFactory;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.LogUtils;
import org.joda.time.DateTime;
/**
* This class provides standard conversions between UTF-8 encoded data
* and Pig data types. It is intended to be extended by load and store
* functions (such as {@link PigStorage}).
*/
public class Utf8StorageConverter implements LoadStoreCaster {
/** Bag factory exposed to subclasses; the parsing code in this class does not reference it. */
protected BagFactory mBagFactory = BagFactory.getInstance();
/** Tuple factory exposed to subclasses; the parsing code in this class does not reference it. */
protected TupleFactory mTupleFactory = TupleFactory.getInstance();
/** Logger keyed on the runtime class (so a subclass logs under its own name); handed to LogUtils.warn. */
protected final Log mLog = LogFactory.getLog(getClass());
// Boxed range limits; bytesToInteger and bytesToLong convert them to double for their overflow checks.
private static final Integer mMaxInt = Integer.valueOf(Integer.MAX_VALUE);
private static final Integer mMinInt = Integer.valueOf(Integer.MIN_VALUE);
private static final Long mMaxLong = Long.valueOf(Long.MAX_VALUE);
private static final Long mMinLong = Long.valueOf(Long.MIN_VALUE);
/** Initial capacity, in bytes, of the scratch buffers used while scanning tuple fields and map entries. */
private static final int BUFFER_SIZE = 1024;
/** Creates a converter; nothing is set up beyond the field initializers above. */
public Utf8StorageConverter() {
}
/**
 * Maps a closing bracket to the opening bracket that pairs with it.
 *
 * @param start the closing character: ')', ']' or '}'
 * @return '(', '[' or '{' respectively
 * @throws IOException if {@code start} is not one of the three closing brackets
 */
private char findStartChar(char start) throws IOException{
    if (start == ')') {
        return '(';
    }
    if (start == ']') {
        return '[';
    }
    if (start == '}') {
        return '{';
    }
    throw new IOException("Unknown start character");
}
/**
 * Parses a bag literal, "{" tuple {"," tuple} "}", from the stream.
 * Bytes in front of the opening '{' are skipped. Each element is parsed by
 * {@link #consumeTuple}; a null result from it (no tuple before the closing
 * brace) is not added to the bag.
 *
 * @param in stream positioned at or before the bag's opening '{'
 * @param fieldSchema schema of the bag; its inner schema must hold exactly one field,
 *        which is used as the schema of every tuple in the bag
 * @return the parsed bag, possibly empty
 * @throws IOException if the schema or its inner schema is null, the inner schema does
 *         not hold exactly one field, or the input ends before the bag is closed
 */
private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
    if (fieldSchema==null) {
        throw new IOException("Schema is null");
    }
    // consumeTuple and consumeMap both accept a field schema whose inner schema is
    // null, so it can be null here as well. Report it as a parse failure (which
    // bytesToBag turns into a warning) instead of letting an NPE escape.
    if (fieldSchema.getSchema()==null) {
        throw new IOException("Schema is null");
    }
    ResourceFieldSchema[] fss=fieldSchema.getSchema().getFields();
    int buf;
    // skip everything up to and including the opening brace
    while ((buf=in.read())!='{') {
        if (buf==-1) {
            throw new IOException("Unexpect end of bag");
        }
    }
    if (fss.length!=1) {
        throw new IOException("Only tuple is allowed inside bag schema");
    }
    ResourceFieldSchema fs = fss[0];
    DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
    while (true) {
        Tuple t = consumeTuple(in, fs);
        if (t!=null) {
            db.add(t);
        }
        // advance to the separator after the tuple: ',' means another tuple
        // follows, '}' closes the bag
        while ((buf=in.read())!='}'&&buf!=',') {
            if (buf==-1) {
                throw new IOException("Unexpect end of bag");
            }
        }
        if (buf=='}') {
            break;
        }
    }
    return db;
}
private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
if (fieldSchema==null) {
throw new IOException("Schema is null");
}
int buf;
ByteArrayOutputStream mOut;
while ((buf=in.read())!='('||buf=='}') {
if (buf==-1) {
throw new IOException("Unexpect end of tuple");
}
if (buf=='}') {
in.unread(buf);
return null;
}
}
Tuple t = TupleFactory.getInstance().newTuple();
if (fieldSchema.getSchema()!=null && fieldSchema.getSchema().getFields().length!=0) {
ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
// Interpret item inside tuple one by one based on the inner schema
for (int i=0;i level = new LinkedList(); // keep track of nested tuple/bag/map. We do not interpret, save them as bytearray
mOut = new ByteArrayOutputStream(BUFFER_SIZE);
while (true) {
buf=in.read();
if (buf==-1) {
throw new IOException("Unexpect end of tuple");
}
if (buf=='['||buf=='{'||buf=='(') {
level.push((char)buf);
mOut.write(buf);
}
else if (buf==')' && level.isEmpty()) // End of tuple
{
DataByteArray value = new DataByteArray(mOut.toByteArray());
t.append(value);
break;
}
else if (buf==',' && level.isEmpty())
{
DataByteArray value = new DataByteArray(mOut.toByteArray());
t.append(value);
mOut.reset();
}
else if (buf==']' ||buf=='}'||buf==')')
{
if (level.peek()==findStartChar((char)buf))
level.pop();
else
throw new IOException("Malformed tuple");
mOut.write(buf);
}
else
mOut.write(buf);
}
}
return t;
}
/**
 * Parses a map literal, "[" key "#" value {"," key "#" value} "]", from the stream.
 * <p>
 * Bytes in front of the opening '[' are skipped. Keys are decoded as UTF-8
 * strings and are assumed not to contain any of the special characters
 * '#', '(', '[', '{', '}', ']', ')'. Each value is collected as raw bytes up to
 * the next top-level ',' or ']' and then converted with the first field of the
 * inner schema when one is present, or wrapped in a {@link DataByteArray}
 * otherwise. An empty value is stored as null.
 *
 * @param in stream positioned at or before the map's opening '['
 * @param fieldSchema schema of the map; may be null or lack an inner schema (untyped map)
 * @return the parsed map, possibly empty
 * @throws IOException if the input ends early, a key is empty, or a closing bracket
 *         appears inside a value with no bracket open
 */
private Map<String, Object> consumeMap(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
    int buf;
    boolean emptyMap = true;
    // skip to the opening bracket
    while ((buf=in.read())!='[') {
        if (buf==-1) {
            throw new IOException("Unexpect end of map");
        }
    }
    HashMap<String, Object> m = new HashMap<String, Object>();
    ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
    while (true) {
        // Read key (assume key can not contains special character such as #, (, [, {, }, ], )
        while ((buf=in.read())!='#') {
            // ']' before any key byte has been read: the map is "[]"
            if (emptyMap && buf==']') {
                return m;
            }
            if (buf==-1) {
                throw new IOException("Unexpect end of map");
            }
            emptyMap = false;
            mOut.write(buf);
        }
        String key = bytesToCharArray(mOut.toByteArray());
        if (key.length()==0) {
            throw new IOException("Map key can not be null");
        }
        // Read value
        mOut.reset();
        // Stack of currently open brackets inside the value. It must be typed as
        // Deque<Character>: with a raw Deque, peek() returns Object and the
        // comparison against a char below does not compile.
        Deque<Character> level = new LinkedList<Character>();
        while (true) {
            buf=in.read();
            if (buf==-1) {
                throw new IOException("Unexpect end of map");
            }
            if (buf=='['||buf=='{'||buf=='(') {
                level.push((char)buf);
            }
            else if (buf==']' && level.isEmpty()) { // End of map
                break;
            }
            else if (buf==']' ||buf=='}'||buf==')')
            {
                if (level.isEmpty()) {
                    throw new IOException("Malformed map");
                }
                // a closer that does not match the innermost opener is kept as
                // plain value content and leaves the stack untouched
                if (level.peek()==findStartChar((char)buf)) {
                    level.pop();
                }
            } else if (buf==','&&level.isEmpty()) { // Current map item complete
                break;
            }
            mOut.write(buf);
        }
        Object value = null;
        if (fieldSchema!=null && fieldSchema.getSchema()!=null && mOut.size()>0) {
            value = bytesToObject(mOut.toByteArray(), fieldSchema.getSchema().getFields()[0]);
        } else if (mOut.size()>0) { // untyped map
            value = new DataByteArray(mOut.toByteArray());
        }
        m.put(key, value);
        mOut.reset();
        if (buf==']') {
            break;
        }
    }
    return m;
}
/**
 * Converts raw bytes to the Pig value described by a field schema.
 * Complex types are parsed from a stream over the bytes via
 * {@link #consumeComplexType}; all other types go through {@link #parseSimpleType}.
 *
 * @param b the raw bytes of the value
 * @param fs the schema that decides how the bytes are interpreted
 * @return the converted value
 * @throws IOException if the delegated conversion fails
 */
private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
    if (!DataType.isComplex(fs.getType())) {
        return parseSimpleType(b, fs);
    }
    PushbackInputStream stream = new PushbackInputStream(new ByteArrayInputStream(b));
    return consumeComplexType(stream, fs);
}
/**
 * Dispatches to the bag, tuple or map parser according to the schema's type.
 *
 * @param in stream holding the textual form of the value
 * @param complexFieldSchema schema whose type selects the parser and which is passed on to it
 * @return the parsed DataBag, Tuple or Map
 * @throws IOException if the type is not BAG, TUPLE or MAP, or if parsing fails
 */
private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema) throws IOException {
    switch (complexFieldSchema.getType()) {
    case DataType.BAG:
        return consumeBag(in, complexFieldSchema);
    case DataType.TUPLE:
        return consumeTuple(in, complexFieldSchema);
    case DataType.MAP:
        return consumeMap(in, complexFieldSchema);
    default:
        throw new IOException("Unknown complex data type");
    }
}
/**
 * Converts raw bytes to a scalar Pig value by calling the bytesToXxx method
 * that matches the schema's type. BYTEARRAY values are wrapped unchanged in a
 * {@link DataByteArray}.
 *
 * @param b the raw bytes of the value
 * @param simpleFieldSchema schema whose type selects the conversion
 * @return the converted value, which may be null when the chosen converter returns null
 * @throws IOException if the type is not one of the handled simple types
 */
private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema) throws IOException {
    switch (simpleFieldSchema.getType()) {
    case DataType.INTEGER:
        return bytesToInteger(b);
    case DataType.LONG:
        return bytesToLong(b);
    case DataType.FLOAT:
        return bytesToFloat(b);
    case DataType.DOUBLE:
        return bytesToDouble(b);
    case DataType.CHARARRAY:
        return bytesToCharArray(b);
    case DataType.BYTEARRAY:
        return new DataByteArray(b);
    case DataType.BOOLEAN:
        return bytesToBoolean(b);
    case DataType.BIGINTEGER:
        return bytesToBigInteger(b);
    case DataType.BIGDECIMAL:
        return bytesToBigDecimal(b);
    case DataType.DATETIME:
        return bytesToDateTime(b);
    default:
        throw new IOException("Unknown simple data type");
    }
}
/**
 * Converts the textual form of a bag into a DataBag.
 *
 * @param b bytes holding the bag text; null yields null
 * @param schema schema of the bag, passed to the bag parser
 * @return the parsed bag, or null if the input is null or cannot be parsed
 *         (a parse failure is reported as a warning, not thrown)
 */
@Override
public DataBag bytesToBag(byte[] b, ResourceFieldSchema schema) throws IOException {
    if (b == null) {
        return null;
    }
    try {
        PushbackInputStream stream = new PushbackInputStream(new ByteArrayInputStream(b));
        return consumeBag(stream, schema);
    } catch (IOException e) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to type bag, caught ParseException <" +
                e.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Decodes bytes as a UTF-8 string.
 *
 * @param b the bytes to decode; null yields null
 * @return the decoded string, or null for null input
 */
@Override
public String bytesToCharArray(byte[] b) throws IOException {
    return (b == null) ? null : new String(b, "UTF-8");
}
/**
 * Parses the textual form of a double.
 *
 * @param b bytes holding the number text
 * @return the parsed value, or null if the input is null, empty or not a valid
 *         double (the last case is reported as a warning)
 */
@Override
public Double bytesToDouble(byte[] b) {
    if (b == null || b.length == 0) {
        return null;
    }
    final String text = new String(b);
    try {
        return Double.valueOf(text);
    } catch (NumberFormatException nfe) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to double, caught NumberFormatException <" +
                nfe.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Parses the textual form of a float. A single trailing 'f' or 'F' is
 * dropped before parsing.
 *
 * @param b bytes holding the number text
 * @return the parsed value, or null if the input is null, empty or not a valid
 *         float (the last case is reported as a warning)
 */
@Override
public Float bytesToFloat(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
        return null;
    }
    // b is known to be non-empty here, so the last index is valid
    final int last = b.length - 1;
    final boolean hasSuffix = b[last] == 'F' || b[last] == 'f';
    final String text = hasSuffix ? new String(b, 0, last) : new String(b);
    try {
        return Float.valueOf(text);
    } catch (NumberFormatException nfe) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to float, caught NumberFormatException <" +
                nfe.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Parses the textual form of a boolean, ignoring case.
 *
 * @param b bytes holding the text
 * @return TRUE for "true", FALSE for "false", and null for null input or any
 *         other text (no warning is issued)
 */
@Override
public Boolean bytesToBoolean(byte[] b) throws IOException {
    if (b == null) {
        return null;
    }
    final String text = new String(b);
    if ("true".equalsIgnoreCase(text)) {
        return Boolean.TRUE;
    }
    if ("false".equalsIgnoreCase(text)) {
        return Boolean.FALSE;
    }
    return null;
}
/**
 * Cheap pre-check for integer or long text, used to avoid relying on an
 * exception to reject non-integer input.
 * @param number the text to check
 * @return true if every character is a decimal digit, optionally preceded by a
 *         single '-' in the first position; an empty string also yields true
 */
private static boolean sanityCheckIntegerLong(String number){
    final int length = number.length();
    for (int pos = 0; pos < length; pos++) {
        final char c = number.charAt(pos);
        final boolean digit = c >= '0' && c <= '9';
        final boolean leadingMinus = pos == 0 && c == '-';
        if (!digit && !leadingMinus) {
            // any other character rules out a plain integer or long
            return false;
        }
    }
    return true;
}
/**
 * Parses the textual form of an integer. Surrounding whitespace is trimmed.
 * Text that is not a plain integer is parsed as a double and truncated,
 * provided it lies within the int range.
 *
 * @param b bytes holding the number text
 * @return the parsed value, or null if the input is null, empty, out of the
 *         int range or not numeric (the last two cases are reported as warnings)
 */
@Override
public Integer bytesToInteger(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
        return null;
    }
    final String text = new String(b).trim();
    // See PIG-2835. Using exception handling to check if it's a double is very
    // expensive, so plain-integer text is recognised with a cheap scan first.
    if (sanityCheckIntegerLong(text)) {
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException ignored) {
            // fall through to the double-based path below
        }
    }
    // Integer.valueOf cannot read text such as "1.0" or "1e3", so parse it as a
    // double and narrow the result to an int.
    try {
        final Double parsed = Double.valueOf(text);
        final double v = parsed.doubleValue();
        // Need to check for an overflow error
        final boolean aboveRange = Double.compare(v, mMaxInt.doubleValue() + 1) >= 0;
        final boolean belowRange = Double.compare(v, mMinInt.doubleValue() - 1) <= 0;
        if (aboveRange || belowRange) {
            LogUtils.warn(this, "Value " + parsed + " too large for integer",
                    PigWarning.TOO_LARGE_FOR_INT, mLog);
            return null;
        }
        return Integer.valueOf(parsed.intValue());
    } catch (NumberFormatException nfe2) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to int, caught NumberFormatException <" +
                nfe2.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Parses the textual form of a long. Surrounding whitespace is trimmed and a
 * single trailing 'l' or 'L' is dropped. Text that is not a plain integer is
 * parsed as a double and truncated, provided it lies within the long range.
 *
 * @param b bytes holding the number text
 * @return the parsed value, or null if the input is null, empty, out of the
 *         long range or not numeric (the last two cases are reported as warnings)
 */
@Override
public Long bytesToLong(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
        return null;
    }
    String s = new String(b).trim();
    if(s.endsWith("l") || s.endsWith("L")) {
        s = s.substring(0, s.length()-1);
    }
    // See PIG-2835. Using exception handling to check if it's a double is very expensive.
    // So we write our sanity check.
    if (sanityCheckIntegerLong(s)) {
        try {
            return Long.valueOf(s);
        } catch (NumberFormatException ignored) {
            // Empty, "-" or too many digits: fall through so the double-based
            // path issues the appropriate warning.
        }
    }
    // It's possible that this field can be interpreted as a double.
    // Unfortunately Java doesn't handle this in Long.valueOf. So
    // we need to try to convert it to a double and if that works then
    // go to a long.
    try {
        Double d = Double.valueOf(s);
        // Need to check for an overflow error.
        // (double) Long.MAX_VALUE rounds up to 2^63 and the "+ 1" is lost to
        // rounding, so the upper limit below is exactly 2^63, which is already
        // outside the long range. The comparison therefore has to be inclusive;
        // otherwise longValue() would silently clamp such input to
        // Long.MAX_VALUE. The lower limit evaluates to -2^63 == Long.MIN_VALUE,
        // a legal value, so that comparison stays exclusive.
        if (Double.compare(d.doubleValue(), mMaxLong.doubleValue() + 1) >= 0 ||
                Double.compare(d.doubleValue(), mMinLong.doubleValue() - 1) < 0) {
            LogUtils.warn(this, "Value " + d + " too large for long",
                    PigWarning.TOO_LARGE_FOR_INT, mLog);
            return null;
        }
        return Long.valueOf(d.longValue());
    } catch (NumberFormatException nfe2) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to long, caught NumberFormatException <" +
                nfe2.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Parses the textual form of a datetime via {@code ToDate.extractDateTime}.
 *
 * @param b bytes holding the datetime text; null yields null
 * @return the parsed DateTime, or null if the input is null or the parser
 *         rejects it with an IllegalArgumentException (reported as a warning)
 */
@Override
public DateTime bytesToDateTime(byte[] b) throws IOException {
    if (b == null) {
        return null;
    }
    DateTime parsed;
    try {
        parsed = ToDate.extractDateTime(new String(b));
    } catch (IllegalArgumentException e) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to datetime, caught IllegalArgumentException <" +
                e.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        parsed = null;
    }
    return parsed;
}
/**
 * Converts the textual form of a map into a Map.
 *
 * @param b bytes holding the map text; null yields null
 * @param fieldSchema schema of the map, passed to the map parser
 * @return the parsed map, or null if the input is null or cannot be parsed
 *         (a parse failure is reported as a warning, not thrown)
 */
@Override
public Map bytesToMap(byte[] b, ResourceFieldSchema fieldSchema) throws IOException {
    if (b == null) {
        return null;
    }
    try {
        PushbackInputStream stream = new PushbackInputStream(new ByteArrayInputStream(b));
        return consumeMap(stream, fieldSchema);
    } catch (IOException e) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to type map, caught ParseException <" +
                e.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Converts the textual form of a tuple into a Tuple.
 *
 * @param b bytes holding the tuple text; null yields null
 * @param fieldSchema schema of the tuple, passed to the tuple parser
 * @return the parsed tuple, or null if the input is null or cannot be parsed
 *         (a parse failure is reported as a warning, not thrown)
 */
@Override
public Tuple bytesToTuple(byte[] b, ResourceFieldSchema fieldSchema) throws IOException {
    if (b == null) {
        return null;
    }
    try {
        PushbackInputStream stream = new PushbackInputStream(new ByteArrayInputStream(b));
        return consumeTuple(stream, fieldSchema);
    } catch (IOException e) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to type tuple, caught ParseException <" +
                e.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Parses the textual form of a BigInteger.
 * <p>
 * Malformed text is handled like in the other numeric converters of this
 * class: a warning is logged and null is returned, rather than letting the
 * unchecked NumberFormatException reach the caller.
 *
 * @param b bytes holding the number text
 * @return the parsed value, or null if the input is null, empty or not a
 *         valid integer
 */
@Override
public BigInteger bytesToBigInteger(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
        return null;
    }
    try {
        return new BigInteger(new String(b));
    } catch (NumberFormatException nfe) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to biginteger, caught NumberFormatException <" +
                nfe.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Parses the textual form of a BigDecimal.
 * <p>
 * Malformed text is handled like in the other numeric converters of this
 * class: a warning is logged and null is returned, rather than letting the
 * unchecked NumberFormatException reach the caller.
 *
 * @param b bytes holding the number text
 * @return the parsed value, or null if the input is null, empty or not a
 *         valid decimal number
 */
@Override
public BigDecimal bytesToBigDecimal(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
        return null;
    }
    try {
        return new BigDecimal(new String(b));
    } catch (NumberFormatException nfe) {
        LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                "converted to bigdecimal, caught NumberFormatException <" +
                nfe.getMessage() + "> field discarded",
                PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
        return null;
    }
}
/**
 * Serializes a bag as the UTF-8 encoding of its {@code toString()} form.
 * The charset is named explicitly so the bytes do not depend on the platform
 * default and match the UTF-8 decoding done by {@link #bytesToCharArray}.
 *
 * @param bag the bag to serialize; must not be null
 * @return the UTF-8 bytes of the bag's text
 */
@Override
public byte[] toBytes(DataBag bag) throws IOException {
    return bag.toString().getBytes("UTF-8");
}
/**
 * Encodes a string as UTF-8.
 * The charset is named explicitly: the no-argument {@code getBytes()} uses the
 * platform default, which would break the round trip with
 * {@link #bytesToCharArray} (always UTF-8) for non-ASCII text on platforms
 * whose default is not UTF-8.
 *
 * @param s the string to encode; must not be null
 * @return the UTF-8 bytes of the string
 */
@Override
public byte[] toBytes(String s) throws IOException {
    return s.getBytes("UTF-8");
}
/**
 * Serializes a Double as the bytes of its {@code toString()} form.
 *
 * @param d the value to serialize; must not be null
 * @return the bytes of the decimal text, in the platform default charset
 */
@Override
public byte[] toBytes(Double d) throws IOException {
    final String text = d.toString();
    return text.getBytes();
}
/**
 * Serializes a Float as the bytes of its {@code toString()} form.
 *
 * @param f the value to serialize; must not be null
 * @return the bytes of the decimal text, in the platform default charset
 */
@Override
public byte[] toBytes(Float f) throws IOException {
    final String text = f.toString();
    return text.getBytes();
}
/**
 * Serializes an Integer as the bytes of its {@code toString()} form.
 *
 * @param i the value to serialize; must not be null
 * @return the bytes of the decimal text, in the platform default charset
 */
@Override
public byte[] toBytes(Integer i) throws IOException {
    final String text = i.toString();
    return text.getBytes();
}
/**
 * Serializes a Long as the bytes of its {@code toString()} form.
 *
 * @param l the value to serialize; must not be null
 * @return the bytes of the decimal text, in the platform default charset
 */
@Override
public byte[] toBytes(Long l) throws IOException {
    final String text = l.toString();
    return text.getBytes();
}
/**
 * Serializes a Boolean as the bytes of its {@code toString()} form
 * ("true" or "false").
 *
 * @param b the value to serialize; must not be null
 * @return the bytes of the text, in the platform default charset
 */
@Override
public byte[] toBytes(Boolean b) throws IOException {
    final String text = b.toString();
    return text.getBytes();
}
/**
 * Serializes a DateTime as the bytes of its {@code toString()} form.
 *
 * @param dt the value to serialize; must not be null
 * @return the bytes of the text, in the platform default charset
 */
@Override
public byte[] toBytes(DateTime dt) throws IOException {
    final String text = dt.toString();
    return text.getBytes();
}
/**
 * Serializes a map as the UTF-8 encoding of the text produced by
 * {@code DataType.mapToString}.
 * The charset is named explicitly so the bytes do not depend on the platform
 * default and match the UTF-8 decoding of map keys done when parsing.
 *
 * @param m the map to serialize
 * @return the UTF-8 bytes of the map's text
 */
@Override
public byte[] toBytes(Map m) throws IOException {
    return DataType.mapToString(m).getBytes("UTF-8");
}
/**
 * Serializes a tuple as the UTF-8 encoding of its {@code toString()} form.
 * The charset is named explicitly so the bytes do not depend on the platform
 * default and match the UTF-8 decoding done by {@link #bytesToCharArray}.
 *
 * @param t the tuple to serialize; must not be null
 * @return the UTF-8 bytes of the tuple's text
 */
@Override
public byte[] toBytes(Tuple t) throws IOException {
    return t.toString().getBytes("UTF-8");
}
/**
 * Returns the result of {@code a.get()} without any conversion.
 *
 * @param a the byte array wrapper; must not be null
 * @return whatever {@code a.get()} returns
 */
@Override
public byte[] toBytes(DataByteArray a) throws IOException {
    final byte[] raw = a.get();
    return raw;
}
/**
 * Serializes a BigInteger as the bytes of its {@code toString()} form.
 *
 * @param bi the value to serialize; must not be null
 * @return the bytes of the decimal text, in the platform default charset
 */
@Override
public byte[] toBytes(BigInteger bi) throws IOException {
    final String text = bi.toString();
    return text.getBytes();
}
/**
 * Serializes a BigDecimal as the bytes of its {@code toString()} form.
 *
 * @param bd the value to serialize; must not be null
 * @return the bytes of the text, in the platform default charset
 */
@Override
public byte[] toBytes(BigDecimal bd) throws IOException {
    final String text = bd.toString();
    return text.getBytes();
}
}