All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.gobblin.converter.parquet.JsonElementConversionFactory Maven / Gradle / Ivy

Go to download

A distributed data integration framework for streaming and batch data ecosystems.

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.converter.parquet;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.BinaryValue;
import org.apache.parquet.example.data.simple.BooleanValue;
import org.apache.parquet.example.data.simple.DoubleValue;
import org.apache.parquet.example.data.simple.FloatValue;
import org.apache.parquet.example.data.simple.IntegerValue;
import org.apache.parquet.example.data.simple.LongValue;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

import org.apache.gobblin.converter.parquet.JsonSchema.*;

import static org.apache.gobblin.converter.parquet.JsonElementConversionFactory.RecordConverter.RecordType.CHILD;
import static org.apache.gobblin.converter.parquet.JsonSchema.*;
import static org.apache.gobblin.converter.parquet.JsonSchema.InputType.STRING;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REPEATED;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;


/**
 * Creates a {@link JsonElement} to Parquet converter for all supported data types.
 *
 * @author tilakpatidar
 */
public class JsonElementConversionFactory {

  /**
   * Use to create a converter for a single field from a parquetSchema.
   *
   * @param schema {@link JsonSchema} describing the field to convert
   * @param repeated - Is the {@link Type} repeated in the parent {@link Group}
   * @return a {@link JsonElementConverter} for the field's {@link InputType}
   * @throws UnsupportedOperationException if the field's {@link InputType} has no converter
   */
  public static JsonElementConverter getConverter(JsonSchema schema, boolean repeated) {
    InputType fieldType = schema.getInputType();
    switch (fieldType) {
      case INT:
        return new IntConverter(schema, repeated);
      case LONG:
        return new LongConverter(schema, repeated);
      case FLOAT:
        return new FloatConverter(schema, repeated);
      case DOUBLE:
        return new DoubleConverter(schema, repeated);
      case BOOLEAN:
        return new BooleanConverter(schema, repeated);
      case STRING:
        return new StringConverter(schema, repeated);
      case ARRAY:
        return new ArrayConverter(schema);
      case ENUM:
        return new EnumConverter(schema);
      case RECORD:
        return new RecordConverter(schema);
      case MAP:
        return new MapConverter(schema);
      case DATE:
      case TIMESTAMP:
        // Dates and timestamps are carried through as their string representation.
        return new StringConverter(schema, repeated);
      default:
        throw new UnsupportedOperationException(fieldType + " is unsupported");
    }
  }

  /**
   * Converts a JsonElement into a supported ParquetType.
   *
   * @author tilakpatidar
   */
  public static abstract class JsonElementConverter {
    protected final JsonSchema jsonSchema;

    protected JsonElementConverter(JsonSchema schema) {
      this.jsonSchema = schema;
    }

    /**
     * Convert value to a parquet type and perform null check.
     *
     * @param value the JSON value to convert; may be {@link com.google.gson.JsonNull}
     * @return Parquet safe type, or {@code null} when the value is JSON null and the field is nullable
     * @throws RuntimeException if the value is JSON null but the field is not nullable
     */
    public Object convert(JsonElement value) {
      if (value.isJsonNull()) {
        if (this.jsonSchema.isNullable()) {
          return null;
        }
        throw new RuntimeException(
            "Field: " + this.jsonSchema.getColumnName() + " is not nullable and contains a null value");
      }
      // convertField implementations never see JsonNull: it is handled above.
      return convertField(value);
    }

    /**
     * Returns a {@link Type} parquet schema for this field.
     */
    abstract public Type schema();

    /**
     * Convert a (non-null) JsonElement to the corresponding Parquet value.
     */
    abstract Object convertField(JsonElement value);
  }

  /**
   * Converts a {@link JsonSchema} to a {@link PrimitiveType}.
   */
  public static abstract class PrimitiveConverter extends JsonElementConverter {
    protected final boolean repeated;
    private final PrimitiveTypeName outputType;
    protected Type schema;

    /**
     * @param jsonSchema schema of the primitive field
     * @param repeated whether the field is repeated within its parent {@link Group}
     * @param outputType the target parquet primitive type
     */
    public PrimitiveConverter(JsonSchema jsonSchema, boolean repeated, PrimitiveTypeName outputType) {
      super(jsonSchema);
      this.repeated = repeated;
      this.outputType = outputType;
      // NOTE: virtual call from the constructor; buildSchema() overrides must only
      // read fields assigned above (repeated, outputType, jsonSchema).
      this.schema = buildSchema();
    }

    protected Type buildSchema() {
      return new PrimitiveType(this.repeated ? REPEATED : optionalOrRequired(this.jsonSchema), this.outputType,
          this.jsonSchema.getColumnName());
    }

    @Override
    public Type schema() {
      return this.schema;
    }
  }

  /**
   * Converts {@link JsonSchema} having collection of elements of {@link InputType} into a {@link GroupType}.
   */
  public static abstract class CollectionConverter extends JsonElementConverter {
    protected InputType elementType;
    protected JsonElementConverter elementConverter;
    protected Type schema;

    public CollectionConverter(JsonSchema collectionSchema, InputType elementType, boolean repeated) {
      super(collectionSchema);
      this.elementType = elementType;
      // NOTE: getElementSchema() and buildSchema() are virtual calls from the
      // constructor; implementations must not rely on subclass state assigned later.
      this.elementConverter = getConverter(getElementSchema(), repeated);
      this.schema = buildSchema();
    }

    @Override
    public Type schema() {
      return this.schema;
    }

    /**
     * Prepare a {@link JsonSchema} for the elements in a collection.
     */
    abstract JsonSchema getElementSchema();

    abstract Type buildSchema();
  }

  public static class IntConverter extends PrimitiveConverter {

    public IntConverter(JsonSchema schema, boolean repeated) {
      super(schema, repeated, INT32);
    }

    @Override
    IntegerValue convertField(JsonElement value) {
      return new IntegerValue(value.getAsInt());
    }
  }

  public static class LongConverter extends PrimitiveConverter {

    public LongConverter(JsonSchema schema, boolean repeated) {
      super(schema, repeated, INT64);
    }

    @Override
    LongValue convertField(JsonElement value) {
      return new LongValue(value.getAsLong());
    }
  }

  public static class FloatConverter extends PrimitiveConverter {

    public FloatConverter(JsonSchema schema, boolean repeated) {
      super(schema, repeated, PrimitiveTypeName.FLOAT);
    }

    @Override
    FloatValue convertField(JsonElement value) {
      return new FloatValue(value.getAsFloat());
    }
  }

  public static class DoubleConverter extends PrimitiveConverter {

    public DoubleConverter(JsonSchema schema, boolean repeated) {
      super(schema, repeated, PrimitiveTypeName.DOUBLE);
    }

    @Override
    DoubleValue convertField(JsonElement value) {
      return new DoubleValue(value.getAsDouble());
    }
  }

  public static class BooleanConverter extends PrimitiveConverter {

    public BooleanConverter(JsonSchema schema, boolean repeated) {
      super(schema, repeated, PrimitiveTypeName.BOOLEAN);
    }

    @Override
    BooleanValue convertField(JsonElement value) {
      return new BooleanValue(value.getAsBoolean());
    }
  }

  public static class StringConverter extends PrimitiveConverter {

    public StringConverter(JsonSchema schema, boolean repeated) {
      super(schema, repeated, BINARY);
      // Redundant with the buildSchema() call in the super constructor (which
      // dispatches to the override below); kept for clarity.
      this.schema = buildSchema();
    }

    @Override
    BinaryValue convertField(JsonElement value) {
      return new BinaryValue(Binary.fromString(value.getAsString()));
    }

    @Override
    protected Type buildSchema() {
      String columnName = this.jsonSchema.getColumnName();
      if (this.repeated) {
        return Types.repeated(BINARY).as(LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType())
            .named(columnName);
      }
      switch (optionalOrRequired(this.jsonSchema)) {
        case OPTIONAL:
          return Types.optional(BINARY).as(LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType())
              .named(columnName);
        case REQUIRED:
          return Types.required(BINARY).as(LogicalTypeAnnotation.StringLogicalTypeAnnotation.stringType())
              .named(columnName);
        default:
          throw new RuntimeException("Unsupported Repetition type");
      }
    }
  }

  /**
   * Maps nullability of a {@link JsonSchema} to the parquet {@link Type.Repetition}.
   */
  public static Type.Repetition optionalOrRequired(JsonSchema jsonBaseSchema) {
    return jsonBaseSchema.isNullable() ? OPTIONAL : REQUIRED;
  }

  public static class ArrayConverter extends CollectionConverter {

    public ArrayConverter(JsonSchema arraySchema) {
      super(arraySchema, arraySchema.getElementTypeUsingKey(ARRAY_ITEMS_KEY), true);
    }

    @Override
    Object convertField(JsonElement value) {
      ParquetGroup array = new ParquetGroup((GroupType) schema());
      JsonElementConverter converter = this.elementConverter;
      for (JsonElement elem : (JsonArray) value) {
        array.add(ARRAY_KEY, converter.convert(elem));
      }
      return array;
    }

    @Override
    protected Type buildSchema() {
      List<Type> fields = new ArrayList<>();
      fields.add(0, this.elementConverter.schema());
      return new GroupType(optionalOrRequired(jsonSchema), this.jsonSchema.getColumnName(), fields);
    }

    @Override
    JsonSchema getElementSchema() {
      JsonSchema jsonSchema = JsonSchema.buildBaseSchema(this.elementType, true);
      jsonSchema.setColumnName(ARRAY_KEY);
      return jsonSchema;
    }
  }

  public static class EnumConverter extends CollectionConverter {
    private final HashSet<String> symbols = new HashSet<>();

    public EnumConverter(JsonSchema enumSchema) {
      super(enumSchema, STRING, false);
      JsonArray symbolsArray = enumSchema.getSymbols();
      symbolsArray.forEach(e -> symbols.add(e.getAsString()));
    }

    @Override
    Object convertField(JsonElement value) {
      // The isJsonNull branch is defensive: convert() filters JSON nulls before
      // delegating here.
      if (symbols.contains(value.getAsString()) || (this.jsonSchema.isNullable() && value.isJsonNull())) {
        return this.elementConverter.convert(value);
      }
      throw new RuntimeException("Symbol " + value.getAsString() + " does not belong to set " + symbols.toString());
    }

    @Override
    protected Type buildSchema() {
      return this.elementConverter.schema();
    }

    @Override
    JsonSchema getElementSchema() {
      JsonSchema jsonSchema = JsonSchema.buildBaseSchema(STRING, this.jsonSchema.isNullable());
      jsonSchema.setColumnName(this.jsonSchema.getColumnName());
      return jsonSchema;
    }
  }

  public static class RecordConverter extends JsonElementConverter {
    private final HashMap<String, JsonElementConverter> converters;
    private final RecordType recordType;
    private final Type schema;

    public enum RecordType {
      ROOT, CHILD
    }

    public RecordConverter(JsonSchema recordSchema) {
      this(recordSchema, CHILD);
    }

    public RecordConverter(JsonSchema recordSchema, RecordType recordType) {
      super(recordSchema);
      this.converters = new HashMap<>();
      this.recordType = recordType;
      // Populates this.converters as a side effect.
      this.schema = buildSchema();
    }

    @Override
    Object convertField(JsonElement value) {
      ParquetGroup r1 = new ParquetGroup((GroupType) schema());
      JsonObject inputRecord = value.getAsJsonObject();
      for (Map.Entry<String, JsonElement> entry : inputRecord.entrySet()) {
        String key = entry.getKey();
        JsonElementConverter converter = this.converters.get(key);
        if (converter == null) {
          // Fail with context instead of an opaque NPE when the input record
          // carries a field that is not declared in the schema.
          throw new RuntimeException("Field: " + key + " is not present in the schema");
        }
        Object convertedValue = converter.convert(entry.getValue());
        boolean valueIsNull = convertedValue == null;
        Type.Repetition repetition = optionalOrRequired(converter.jsonSchema);
        if (valueIsNull && repetition.equals(OPTIONAL)) {
          // Optional null fields are simply omitted from the group.
          continue;
        }
        r1.add(key, convertedValue);
      }
      return r1;
    }

    private Type buildSchema() {
      JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
      List<Type> parquetTypes = new ArrayList<>();
      for (JsonElement element : inputSchema) {
        JsonObject map = (JsonObject) element;
        JsonSchema elementSchema = new JsonSchema(map);
        String columnName = elementSchema.getColumnName();
        JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
        Type schemaType = converter.schema();
        this.converters.put(columnName, converter);
        parquetTypes.add(schemaType);
      }
      String docName = this.jsonSchema.getColumnName();
      switch (recordType) {
        case ROOT:
          return new MessageType(docName, parquetTypes);
        case CHILD:
          return new GroupType(optionalOrRequired(this.jsonSchema), docName, parquetTypes);
        default:
          throw new RuntimeException("Unsupported Record type");
      }
    }

    @Override
    public Type schema() {
      return this.schema;
    }
  }

  public static class MapConverter extends CollectionConverter {

    public MapConverter(JsonSchema mapSchema) {
      super(mapSchema, mapSchema.getElementTypeUsingKey(MAP_ITEMS_KEY), false);
    }

    @Override
    Object convertField(JsonElement value) {
      ParquetGroup mapGroup = new ParquetGroup((GroupType) schema());
      JsonElementConverter converter = this.elementConverter;
      JsonObject map = (JsonObject) value;
      for (Map.Entry<String, JsonElement> entry : map.entrySet()) {
        ParquetGroup entrySet = (ParquetGroup) mapGroup.addGroup(MAP_KEY);
        entrySet.add(MAP_KEY_COLUMN_NAME, entry.getKey());
        entrySet.add(MAP_VALUE_COLUMN_NAME, converter.convert(entry.getValue()));
      }
      return mapGroup;
    }

    @Override
    protected Type buildSchema() {
      JsonElementConverter elementConverter = this.elementConverter;
      JsonElementConverter keyConverter = getKeyConverter();
      GroupType mapGroup =
          Types.repeatedGroup().addFields(keyConverter.schema(), elementConverter.schema()).named(MAP_KEY)
              .asGroupType();
      String columnName = this.jsonSchema.getColumnName();
      switch (optionalOrRequired(this.jsonSchema)) {
        case OPTIONAL:
          return Types.optionalGroup().addFields(mapGroup).named(columnName).asGroupType();
        case REQUIRED:
          return Types.requiredGroup().addFields(mapGroup).named(columnName).asGroupType();
        default:
          // Previously returned null here, which deferred the failure to a
          // confusing NPE at the first use of the schema.
          throw new RuntimeException("Unsupported Repetition type");
      }
    }

    @Override
    JsonSchema getElementSchema() {
      JsonSchema jsonSchema = JsonSchema.buildBaseSchema(this.elementType, false);
      jsonSchema.setColumnName(MAP_VALUE_COLUMN_NAME);
      return jsonSchema;
    }

    public JsonElementConverter getKeyConverter() {
      JsonSchema jsonSchema = JsonSchema.buildBaseSchema(STRING, false);
      jsonSchema.setColumnName(MAP_KEY_COLUMN_NAME);
      return getConverter(jsonSchema, false);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy