All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pig.builtin.Utf8StorageConverter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.builtin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.LoadStoreCaster;
import org.apache.pig.PigWarning;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.DefaultBagFactory;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.LogUtils;
import org.joda.time.DateTime;

/**
 * This abstract class provides standard conversions between utf8 encoded data
 * and pig data types.  It is intended to be extended by load and store
 * functions (such as {@link PigStorage}).
 */
public class Utf8StorageConverter implements LoadStoreCaster {

    protected BagFactory mBagFactory = BagFactory.getInstance();
    protected TupleFactory mTupleFactory = TupleFactory.getInstance();
    protected final Log mLog = LogFactory.getLog(getClass());

    private static final Integer mMaxInt = Integer.valueOf(Integer.MAX_VALUE);
    private static final Integer mMinInt = Integer.valueOf(Integer.MIN_VALUE);
    private static final Long mMaxLong = Long.valueOf(Long.MAX_VALUE);
    private static final Long mMinLong = Long.valueOf(Long.MIN_VALUE);
    private static final int BUFFER_SIZE = 1024;

    public Utf8StorageConverter() {
    }

    private char findStartChar(char start) throws IOException{
        switch (start) {
        case ')': return '(';
        case ']': return '[';
        case '}': return '{';
        default: throw new IOException("Unknown start character");
        }
    }

    private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
        if (fieldSchema==null) {
            throw new IOException("Schema is null");
        }
        ResourceFieldSchema[] fss=fieldSchema.getSchema().getFields();
        Tuple t;
        int buf;
        while ((buf=in.read())!='{') {
            if (buf==-1) {
                throw new IOException("Unexpect end of bag");
            }
        }
        if (fss.length!=1)
            throw new IOException("Only tuple is allowed inside bag schema");
        ResourceFieldSchema fs = fss[0];
        DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
        while (true) {
            t = consumeTuple(in, fs);
            if (t!=null)
                db.add(t);
            while ((buf=in.read())!='}'&&buf!=',') {
                if (buf==-1) {
                    throw new IOException("Unexpect end of bag");
                }
            }
            if (buf=='}')
                break;
        }
        return db;
    }

    private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
        if (fieldSchema==null) {
            throw new IOException("Schema is null");
        }
        int buf;
        ByteArrayOutputStream mOut;

        while ((buf=in.read())!='('||buf=='}') {
            if (buf==-1) {
                throw new IOException("Unexpect end of tuple");
            }
            if (buf=='}') {
                in.unread(buf);
                return null;
            }
        }
        Tuple t = TupleFactory.getInstance().newTuple();
        if (fieldSchema.getSchema()!=null && fieldSchema.getSchema().getFields().length!=0) {
            ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
            // Interpret item inside tuple one by one based on the inner schema
            for (int i=0;i level = new LinkedList();  // keep track of nested tuple/bag/map. We do not interpret, save them as bytearray
            mOut = new ByteArrayOutputStream(BUFFER_SIZE);
            while (true) {
                buf=in.read();
                if (buf==-1) {
                    throw new IOException("Unexpect end of tuple");
                }
                if (buf=='['||buf=='{'||buf=='(') {
                    level.push((char)buf);
                    mOut.write(buf);
                }
                else if (buf==')' && level.isEmpty()) // End of tuple
                {
                    DataByteArray value = new DataByteArray(mOut.toByteArray());
                    t.append(value);
                    break;
                }
                else if (buf==',' && level.isEmpty())
                {
                    DataByteArray value = new DataByteArray(mOut.toByteArray());
                    t.append(value);
                    mOut.reset();
                }
                else if (buf==']' ||buf=='}'||buf==')')
                {
                    if (level.peek()==findStartChar((char)buf))
                        level.pop();
                    else
                        throw new IOException("Malformed tuple");
                    mOut.write(buf);
                }
                else
                    mOut.write(buf);
            }
        }
        return t;
    }

    private Map consumeMap(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
        int buf;
        boolean emptyMap = true;

        while ((buf=in.read())!='[') {
            if (buf==-1) {
                throw new IOException("Unexpect end of map");
            }
        }
        HashMap m = new HashMap();
        ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
        while (true) {
            // Read key (assume key can not contains special character such as #, (, [, {, }, ], )
            while ((buf=in.read())!='#') {
                // end of map
                if (emptyMap && buf==']') {
                    return m;
                }
                if (buf==-1) {
                    throw new IOException("Unexpect end of map");
                }
                emptyMap = false;
                mOut.write(buf);
            }
            String key = bytesToCharArray(mOut.toByteArray());
            if (key.length()==0)
                throw new IOException("Map key can not be null");

            // Read value
            mOut.reset();
            Deque level = new LinkedList(); // keep track of nested tuple/bag/map. We do not interpret, save them as bytearray
            while (true) {
                buf=in.read();
                if (buf==-1) {
                    throw new IOException("Unexpect end of map");
                }
                if (buf=='['||buf=='{'||buf=='(') {
                    level.push((char)buf);
                }
                else if (buf==']' && level.isEmpty()) // End of map
                    break;
                else if (buf==']' ||buf=='}'||buf==')')
                {
                	if (level.isEmpty())
                		throw new IOException("Malformed map");

                    if (level.peek()==findStartChar((char)buf))
                        level.pop();
                } else if (buf==','&&level.isEmpty()) { // Current map item complete
                    break;
                }
                mOut.write(buf);
            }
            Object value = null;
            if (fieldSchema!=null && fieldSchema.getSchema()!=null && mOut.size()>0) {
                value = bytesToObject(mOut.toByteArray(), fieldSchema.getSchema().getFields()[0]);
            } else if (mOut.size()>0) { // untyped map
                value = new DataByteArray(mOut.toByteArray());
            }
            m.put(key, value);
            mOut.reset();
            if (buf==']')
                break;
        }
        return m;
    }

    private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
        Object field;
        if (DataType.isComplex(fs.getType())) {
            ByteArrayInputStream bis = new ByteArrayInputStream(b);
            PushbackInputStream in = new PushbackInputStream(bis);
            field = consumeComplexType(in, fs);
        }
        else {
            field = parseSimpleType(b, fs);
        }
        return field;
    }

    private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema) throws IOException {
        Object field;
        switch (complexFieldSchema.getType()) {
        case DataType.BAG:
            field = consumeBag(in, complexFieldSchema);
            break;
        case DataType.TUPLE:
            field = consumeTuple(in, complexFieldSchema);
            break;
        case DataType.MAP:
            field = consumeMap(in, complexFieldSchema);
            break;
        default:
            throw new IOException("Unknown complex data type");
        }
        return field;
    }

    private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema) throws IOException {
        Object field;
        switch (simpleFieldSchema.getType()) {
        case DataType.INTEGER:
            field = bytesToInteger(b);
            break;
        case DataType.LONG:
            field = bytesToLong(b);
            break;
        case DataType.FLOAT:
            field = bytesToFloat(b);
            break;
        case DataType.DOUBLE:
            field = bytesToDouble(b);
            break;
        case DataType.CHARARRAY:
            field = bytesToCharArray(b);
            break;
        case DataType.BYTEARRAY:
            field = new DataByteArray(b);
            break;
        case DataType.BOOLEAN:
            field = bytesToBoolean(b);
            break;
        case DataType.BIGINTEGER:
            field = bytesToBigInteger(b);
            break;
        case DataType.BIGDECIMAL:
            field = bytesToBigDecimal(b);
            break;
        case DataType.DATETIME:
            field = bytesToDateTime(b);
            break;
        default:
            throw new IOException("Unknown simple data type");
        }
        return field;
    }

    @Override
    public DataBag bytesToBag(byte[] b, ResourceFieldSchema schema) throws IOException {
        if(b == null)
            return null;
        DataBag db;
        try {
            ByteArrayInputStream bis = new ByteArrayInputStream(b);
            PushbackInputStream in = new PushbackInputStream(bis);
            db = consumeBag(in, schema);
        } catch (IOException e) {
            LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                    "converted to type bag, caught ParseException <" +
                    e.getMessage() + "> field discarded",
                    PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
            return null;
        }
        return db;
    }

    @Override
    public String bytesToCharArray(byte[] b) throws IOException {
        if(b == null)
            return null;
        return new String(b, "UTF-8");
    }

    @Override
    public Double bytesToDouble(byte[] b) {
        if(b == null || b.length == 0) {
            return null;
        }
        
        try {
            return Double.valueOf(new String(b));
        } catch (NumberFormatException nfe) {
            LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                    "converted to double, caught NumberFormatException <" +
                    nfe.getMessage() + "> field discarded",
                    PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
            return null;
        }
    }

    @Override
    public Float bytesToFloat(byte[] b) throws IOException {
        if(b == null || b.length == 0) {
            return null;
        }
        
        String s;
        if (b.length > 0 && (b[b.length - 1] == 'F' || b[b.length - 1] == 'f')) {
            s = new String(b, 0, b.length - 1);
        } else {
            s = new String(b);
        }

        try {
            return Float.valueOf(s);
        } catch (NumberFormatException nfe) {
            LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                    "converted to float, caught NumberFormatException <" +
                    nfe.getMessage() + "> field discarded",
                    PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
            return null;
        }
    }


    @Override
    public Boolean bytesToBoolean(byte[] b) throws IOException {
        if(b == null)
            return null;
        String s = new String(b);
        if (s.equalsIgnoreCase("true")) {
            return Boolean.TRUE;
        } else if (s.equalsIgnoreCase("false")) {
            return Boolean.FALSE;
        } else {
            return null;
        }
    }

    /**
     * Sanity check of whether this number is a valid integer or long.
     * @param number the number to check
     * @return true if it doesn't contain any invalid characters, i.e. only contains digits and '-'
     */
    private static boolean sanityCheckIntegerLong(String number){
        for (int i=0; i < number.length(); i++){
            if (number.charAt(i) >= '0' && number.charAt(i) <='9' || i == 0 && number.charAt(i) == '-'){
                // valid one
            }
            else{
                // contains invalid characters, must not be a integer or long.
                return false;
            }
        }
        return true;
    }

    @Override
    public Integer bytesToInteger(byte[] b) throws IOException {
        if(b == null || b.length == 0) {
            return null;
        }
        
        String s = new String(b);
        s = s.trim();
        Integer ret = null;

        // See PIG-2835. Using exception handling to check if it's a double is very expensive.
        // So we write our sanity check.
        if (sanityCheckIntegerLong(s)){
            try {
                ret = Integer.valueOf(s);
            } catch (NumberFormatException nfe) {
            }
        }
        if (ret == null){
            // It's possible that this field can be interpreted as a double.
            // Unfortunately Java doesn't handle this in Integer.valueOf.  So
            // we need to try to convert it to a double and if that works then
            // go to an int.
            try {
                Double d = Double.valueOf(s);
                // Need to check for an overflow error
                if (Double.compare(d.doubleValue(), mMaxInt.doubleValue() + 1) >= 0 ||
                        Double.compare(d.doubleValue(), mMinInt.doubleValue() - 1) <= 0) {
                    LogUtils.warn(this, "Value " + d + " too large for integer",
                            PigWarning.TOO_LARGE_FOR_INT, mLog);
                    return null;
                }
                return Integer.valueOf(d.intValue());
            } catch (NumberFormatException nfe2) {
                LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                        "converted to int, caught NumberFormatException <" +
                        nfe2.getMessage() + "> field discarded",
                        PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
                return null;
            }
        }
        return ret;
    }

    @Override
    public Long bytesToLong(byte[] b) throws IOException {
        if (b == null || b.length == 0) {
            return null;
        }
        
        String s = new String(b).trim();
        if(s.endsWith("l") || s.endsWith("L")) {
            s = s.substring(0, s.length()-1);
        }

        // See PIG-2835. Using exception handling to check if it's a double is very expensive.
        // So we write our sanity check.
        Long ret = null;
        if (sanityCheckIntegerLong(s)) {
            try {
                ret = Long.valueOf(s);
            } catch (NumberFormatException nfe) {
            }
        }

        if (ret == null) {
            // It's possible that this field can be interpreted as a double.
            // Unfortunately Java doesn't handle this in Long.valueOf.  So
            // we need to try to convert it to a double and if that works then
            // go to an long.
            try {
                Double d = Double.valueOf(s);
                // Need to check for an overflow error
                if (Double.compare(d.doubleValue(), mMaxLong.doubleValue() + 1) > 0 ||
                        Double.compare(d.doubleValue(), mMinLong.doubleValue() - 1) < 0) {
                	LogUtils.warn(this, "Value " + d + " too large for long",
                	            PigWarning.TOO_LARGE_FOR_INT, mLog);
                    return null;
                }
                return Long.valueOf(d.longValue());
            } catch (NumberFormatException nfe2) {
                LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                            "converted to long, caught NumberFormatException <" +
                            nfe2.getMessage() + "> field discarded",
                            PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
                return null;
            }
        }
        return ret;
    }

    @Override
    public DateTime bytesToDateTime(byte[] b) throws IOException {
        if (b == null) {
            return null;
        }
        try {
            String dtStr = new String(b);
            return ToDate.extractDateTime(dtStr);
        } catch (IllegalArgumentException e) {
            LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                    "converted to datetime, caught IllegalArgumentException <" +
                    e.getMessage() + "> field discarded",
                    PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
            return null;
        }
    }

    @Override
    public Map bytesToMap(byte[] b, ResourceFieldSchema fieldSchema) throws IOException {
        if(b == null)
            return null;
        Map map;
        try {
            ByteArrayInputStream bis = new ByteArrayInputStream(b);
            PushbackInputStream in = new PushbackInputStream(bis);
            map = consumeMap(in, fieldSchema);
        }
        catch (IOException e) {
            LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                    "converted to type map, caught ParseException <" +
                    e.getMessage() + "> field discarded",
                    PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
            return null;
        }
        return map;
    }

    @Override
    public Tuple bytesToTuple(byte[] b, ResourceFieldSchema fieldSchema) throws IOException {
        if(b == null)
            return null;
        Tuple t;

        try {
            ByteArrayInputStream bis = new ByteArrayInputStream(b);
            PushbackInputStream in = new PushbackInputStream(bis);
            t = consumeTuple(in, fieldSchema);
        }
        catch (IOException e) {
            LogUtils.warn(this, "Unable to interpret value " + Arrays.toString(b) + " in field being " +
                    "converted to type tuple, caught ParseException <" +
                    e.getMessage() + "> field discarded",
                    PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED, mLog);
            return null;
        }

        return t;
    }

    @Override
    public BigInteger bytesToBigInteger(byte[] b) throws IOException {
        if (b == null || b.length == 0) {
            return null;
        }
        return new BigInteger(new String(b));
    }

    @Override
    public BigDecimal bytesToBigDecimal(byte[] b) throws IOException {
        if (b == null || b.length == 0) {
            return null;
        }
        return new BigDecimal(new String(b));
    }

    @Override
    public byte[] toBytes(DataBag bag) throws IOException {
        return bag.toString().getBytes();
    }

    @Override
    public byte[] toBytes(String s) throws IOException {
        return s.getBytes();
    }

    @Override
    public byte[] toBytes(Double d) throws IOException {
        return d.toString().getBytes();
    }

    @Override
    public byte[] toBytes(Float f) throws IOException {
        return f.toString().getBytes();
    }

    @Override
    public byte[] toBytes(Integer i) throws IOException {
        return i.toString().getBytes();
    }

    @Override
    public byte[] toBytes(Long l) throws IOException {
        return l.toString().getBytes();
    }

    @Override
    public byte[] toBytes(Boolean b) throws IOException {
        return b.toString().getBytes();
    }

    @Override
    public byte[] toBytes(DateTime dt) throws IOException {
        return dt.toString().getBytes();
    }

    @Override
    public byte[] toBytes(Map m) throws IOException {
        return DataType.mapToString(m).getBytes();
    }

    @Override
    public byte[] toBytes(Tuple t) throws IOException {
        return t.toString().getBytes();
    }

    @Override
    public byte[] toBytes(DataByteArray a) throws IOException {
        return a.get();
    }

    @Override
    public byte[] toBytes(BigInteger bi) throws IOException {
        return bi.toString().getBytes();
    }

    @Override
    public byte[] toBytes(BigDecimal bd) throws IOException {
        return bd.toString().getBytes();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy