All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.hpccsystems.dfs.client.AvroSchemaTranslator Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 *     HPCC SYSTEMS software Copyright (C) 2020 HPCC Systems®.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *******************************************************************************/
package org.hpccsystems.dfs.client;

import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.LogicalTypes.Decimal;
import org.apache.avro.Schema;
import org.apache.avro.Conversions.DecimalConversion;

import org.hpccsystems.commons.ecl.FieldDef;
import org.hpccsystems.commons.ecl.FieldType;
import org.hpccsystems.commons.ecl.HpccSrcType;

import java.util.List;
import java.util.ArrayList;

/*
ARRAY -> Collection
BOOLEAN  -> Boolean
BYTES -> ByteBuffer
DOUBLE -> Double
ENUM -> GenericEnumSymbol
FIXED -> GenericFixed
FLOAT -> Float
INT -> Int
LONG -> Long
MAP -> Map
NULL 
RECORD -> IndexedRecord
STRING -> CharSequence
UNION -> [Could be lots of types depending on meta info]
*/

/**
 * A helper class that translates a given Avro record Schema into an HPCC Systems record definition.
 */
public class AvroSchemaTranslator
{
    public static FieldDef toHPCC(Schema schema, String fieldName) throws Exception
    {
        FieldDef ret = null;
        switch (schema.getType())
        {
            case ARRAY:
            {
                Schema elementSchema = schema.getElementType();
                FieldDef[] elementDef = new FieldDef[1];
                elementDef[0] = toHPCC(elementSchema, elementSchema.getName());

                if (elementDef[0].getFieldType() == FieldType.RECORD)
                {
                    ret = new FieldDef(fieldName, FieldType.DATASET, "DATASET", 0, false, false, 
                                        HpccSrcType.LITTLE_ENDIAN, elementDef);
                }
                else
                {
                    ret = new FieldDef(fieldName, FieldType.SET, "SET", 0, false, false, 
                                        HpccSrcType.LITTLE_ENDIAN, elementDef);
                }
                break;
            }
            case BOOLEAN:
            {
                ret = new FieldDef(fieldName, FieldType.BOOLEAN, "BOOL", 1, true, false,
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case BYTES:
            {
                // Also could be Decimal logical type, but decimal is always fixed on
                // The HPCC side. Do we just ignore this case and throw an error, 
                // or choose a default IE: DEC32_16 and change precision during translation?
                // For the moment lets treat this as an error
                if (schema.getLogicalType() instanceof LogicalTypes.Decimal)
                {
                    throw new Exception("Invalid field type. Non-fixed length decimal values are currently unsupported.");
                }

                ret = new FieldDef(fieldName, FieldType.BINARY, "BINARY", 0, false, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case DOUBLE:
            {
                ret = new FieldDef(fieldName, FieldType.REAL, "REAL8", 8, true, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case ENUM:
            {
                // Enum treat as int
                List enumSymbols = schema.getEnumSymbols();
                
                // On the HPCC side we could unsigned, but sticking to signed here make things easier on us
                int bytesNeeded = 1;
                if (enumSymbols.size() >= Byte.MAX_VALUE)
                {
                    bytesNeeded = 2;
                    if (enumSymbols.size() >= Short.MAX_VALUE)
                    {
                        // If there are this many enum symbols we should probably treat it as an error
                        throw new Exception("Enum has too many symbols, a max of " + Short.MAX_VALUE + " symbols are supported.");
                    }
                }

                ret = new FieldDef(fieldName, FieldType.INTEGER, "INTEGER" + bytesNeeded, bytesNeeded, true, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case FIXED:
            {
                if (schema.getLogicalType() instanceof LogicalTypes.Decimal)
                {
                    // Fixed binary represents a big-endian encoded integer.
                    // BigInt -> Decimal conversion *= 10^-Scale
                    LogicalTypes.Decimal decimalInfo = (LogicalTypes.Decimal) schema.getLogicalType();

                    ret = new FieldDef(fieldName, FieldType.DECIMAL, "DECIMAL", 1, true, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                    ret.setPrecision(decimalInfo.getPrecision());
                    ret.setScale(decimalInfo.getScale());
                }
                else if (schema.getLogicalType().getName().equals("duration"))
                {
                    // Duration (3 ints: months, days, millis) move to INTEGER8 of millis
                    ret = new FieldDef(fieldName, FieldType.INTEGER, "INTEGER8", 8, true, true, 
                                        HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                }
                else
                {
                    ret = new FieldDef(fieldName, FieldType.BINARY, "BINARY", schema.getFixedSize(), true, false, 
                                        HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                }

                break;
            }
            case FLOAT:
            {
                ret = new FieldDef(fieldName, FieldType.REAL, "REAL4", 4, true, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case INT:
            {
                ret = new FieldDef(fieldName, FieldType.INTEGER, "INTEGER4", 4, true, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case LONG:
            {
                ret = new FieldDef(fieldName, FieldType.INTEGER, "INTEGER8", 8, true, false, 
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case MAP:
            {
                // Map Represent as a set of key value records, this will allow DICTIONARY to be used
                FieldDef[] childFields = new FieldDef[2];
                childFields[0] = new FieldDef("key", FieldType.STRING, "UTF8", 0, false, false, 
                                    HpccSrcType.UTF8, new FieldDef[0]);
                childFields[1] = toHPCC(schema.getValueType(),"value");

                FieldDef[] elementDef = new FieldDef[1];
                elementDef[0] = new FieldDef("KeyValueRecord", FieldType.RECORD, "RECORD", 0, false, false,
                                   HpccSrcType.LITTLE_ENDIAN, childFields);

                ret = new FieldDef(fieldName, FieldType.DATASET, "DATASET", 0, false, false, 
                                    HpccSrcType.LITTLE_ENDIAN, elementDef);
                break;
            }
            case NULL:
            {
                // This seems to be primarily used in unions.
                // However, having a standard field type of Null isn't prohibited. 
                // Treat these as a bool value if encountered

                ret = new FieldDef(fieldName, FieldType.BOOLEAN, "BOOL", 1, true, false,
                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                break;
            }
            case RECORD:
            {
                List fields = schema.getFields();

                boolean hasNullableFields = false;
                for (Schema.Field field : fields)
                {
                    if (field.schema().getType() == Schema.Type.UNION)
                    {
                        for (Schema unionType : field.schema().getTypes())
                        {
                            if (unionType.getType() == Schema.Type.NULL)
                            {
                                hasNullableFields = true;
                            }
                        }
                    }
                }

                int numNullBitFields = 0;
                if (hasNullableFields)
                {
                    numNullBitFields = (fields.size() + 63) / 64;
                }

                FieldDef[] childFields = new FieldDef[fields.size() + numNullBitFields];
                for (int i = 0; i < fields.size(); i++)
                {
                    Schema.Field field = fields.get(i);
                    childFields[i] = toHPCC(field.schema(),field.name());
                }

                // Create nullable fields
                for (int i = 0; i < numNullBitFields; i++)
                {
                    childFields[fields.size() + i] = new FieldDef("nullBitField"+i, FieldType.INTEGER, "INTEGER8", 8, true, false, 
                                                                    HpccSrcType.LITTLE_ENDIAN, new FieldDef[0]);
                }

                ret = new FieldDef(fieldName, FieldType.RECORD, "RECORD", 0, false, false,
                                   HpccSrcType.LITTLE_ENDIAN, childFields);

                break;
            }
            case STRING:
            {
                ret = new FieldDef(fieldName, FieldType.STRING, "UTF8", 0, false, false, 
                                    HpccSrcType.UTF8, new FieldDef[0]);
                break;
            }
            case UNION:
            {
                // Need to handle the simple union case of ["null", "string, etc"]
                // Complex unions IE ["string", "array", "record"] will be handled by creating a rec
                // with a unionType int
                List unionTypes = schema.getTypes();

                boolean isNullable = false;
                ArrayList types = new ArrayList();
                for (Schema type : unionTypes)
                {
                    if (type.getType() == Schema.Type.UNION)
                    {
                        isNullable = true;
                    }
                    else
                    {
                        types.add(type);
                    }
                }
                
                boolean isSimpleNullable = (types.size() == 1 && isNullable);
                if (isSimpleNullable)
                {
                    // Nulls will be combined into a bitfield
                    ret = toHPCC(types.get(0), fieldName);
                }
                else
                {
                    ArrayList childFields = new ArrayList(); 
                    for (int i = 0; i < types.size(); i++)
                    {
                        Schema childSchema = types.get(i);
                        childFields.add(new Schema.Field(childSchema.getName() + "Type", childSchema, "", (Object)null));
                    }

                    // Add a union type field
                    childFields.add(new Schema.Field("unionType", Schema.create(Schema.Type.INT), "", (Object)null));

                    boolean isError = false;
                    ret = toHPCC(Schema.createRecord(fieldName+"Rec", "", "", isError),fieldName);
                }
                
                break;
            }
        };

        return ret;
    }

    public static Schema toAvro(FieldDef fd) throws Exception
    {
        switch (fd.getFieldType()) 
        {
            case VAR_STRING:
            case STRING:
            case CHAR:
                return Schema.create(Schema.Type.STRING);
            case INTEGER:
                if (fd.getDataLen() <= 4)
                {
                    return Schema.create(Schema.Type.INT);
                }
                else
                {
                    return Schema.create(Schema.Type.LONG);
                }
            case BINARY:
                return Schema.create(Schema.Type.BYTES);
            case BOOLEAN:
                return Schema.create(Schema.Type.BOOLEAN);
            case REAL:
                if (fd.getDataLen() <= 4)
                {
                    return Schema.create(Schema.Type.FLOAT);
                }
                else
                {
                    return Schema.create(Schema.Type.DOUBLE);
                }
            case DECIMAL:
                DecimalConversion conv = new DecimalConversion();
                return conv.getRecommendedSchema();
            case SET:
            case DATASET:
                Schema childSchema = toAvro(fd.getDef(0));
                return Schema.createArray(childSchema);
            case RECORD:
                {
                    ArrayList fields = new ArrayList();
                    for (int i = 0; i < fd.getNumDefs(); i++) 
                    {
                        FieldDef childField = fd.getDef(i);
                        fields.add(new Schema.Field(childField.getFieldName(), toAvro(childField), null, null));
                    }

                    Boolean isError = false;
                    Schema recSchema = Schema.createRecord(fd.getFieldName(),"","",isError,fields);
                    if (isError)
                    {
                        throw new Exception("AvroSchemaTranslator: Error translation record def.");
                    }
                    
                    recSchema.addAlias(fd.getFieldName());
                    return recSchema;
                }
            case UNKNOWN:
            default:
                throw new Exception("AvroSchemaTranslator: Unknown field type");
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy