All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.visallo.web.structuredingest.parquet.ParquetStructuredIngestParser Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
package org.visallo.web.structuredingest.parquet;

import com.google.common.base.Joiner;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import jodd.datetime.JDateTime;
import jodd.datetime.JulianDateStamp;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.tools.read.SimpleReadSupport;
import org.apache.parquet.tools.read.SimpleRecord;
import org.visallo.core.exception.VisalloException;
import org.visallo.core.util.VisalloLogger;
import org.visallo.core.util.VisalloLoggerFactory;
import org.visallo.web.structuredingest.core.model.ClientApiAnalysis;
import org.visallo.web.structuredingest.core.model.ParseOptions;
import org.visallo.web.structuredingest.core.model.StructuredIngestParser;
import org.visallo.web.structuredingest.core.util.BaseStructuredFileParserHandler;
import org.visallo.web.structuredingest.core.util.StructuredFileParserHandler;
import org.visallo.web.structuredingest.core.util.mapping.ColumnMappingType;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.function.ToLongFunction;

public class ParquetStructuredIngestParser implements StructuredIngestParser {
    private static final VisalloLogger LOGGER = VisalloLoggerFactory.getLogger(ParquetStructuredIngestParser.class);
    private static final String PARQUET_MIME_TYPE = "application/x-parquet";
    private static final long MILLIS_PER_DAY = 24 * 60 * 60 * 1000L;
    private static final long NANOS_PER_DAY = MILLIS_PER_DAY * 1000000;

    @Override
    public Set getSupportedMimeTypes() {
        return Sets.newHashSet(PARQUET_MIME_TYPE);
    }

    @Override
    public void ingest(InputStream in, ParseOptions parseOptions, BaseStructuredFileParserHandler parserHandler) throws Exception {
        File tempFile = null;
        try {
            tempFile = File.createTempFile("parquet", "tmp");
            IOUtils.copy(in, new FileOutputStream(tempFile));

            Configuration conf = new Configuration();
            Path path = new Path(tempFile.getAbsolutePath());
            ParquetMetadata metaData = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
            MessageType schema = metaData.getFileMetaData().getSchema();
            parserHandler.setTotalRows(getRowCount(metaData));

            try (ParquetReader reader = ParquetReader.builder(new SimpleReadSupport(), new Path(tempFile.getAbsolutePath())).build()) {
                parserHandler.newSheet("");
                int rowNum = 0;

                for (SimpleRecord value = reader.read(); value != null; value = reader.read()) {
                    Map row = Maps.newHashMap();

                    for (SimpleRecord.NameValue nameValue : value.getValues()) {
                        String name = nameValue.getName();
                        Object val = nameValue.getValue();
                        if (!(val instanceof SimpleRecord)) {
                            Type type = schema.getType(schema.getFieldIndex(name));
                            row.put(name, getRecordValue(val, type));
                        }
                    }

                    if (!parserHandler.addRow(row, rowNum++)) break;
                }
            }
        } finally {
            if (tempFile != null) tempFile.delete();
        }
    }

    private long getRowCount(ParquetMetadata metaData) {
        // maven shade isn't working with lambdas, so use anon class
        return metaData.getBlocks().stream().mapToLong(new ToLongFunction() {
                    @Override
                    public long applyAsLong(BlockMetaData value) {
                        return value.getRowCount();
                    }
                }).sum();
    }

    @Override
    public ClientApiAnalysis analyze(InputStream inputStream) throws Exception {
        File tempFile = null;
        try {
            tempFile = File.createTempFile("parquet", "tmp");
            IOUtils.copy(inputStream, new FileOutputStream(tempFile));

            Configuration conf = new Configuration();
            Path path = new Path(tempFile.getAbsolutePath());
            ParquetMetadata metaData = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
            MessageType schema = metaData.getFileMetaData().getSchema();
            StructuredFileParserHandler parserHandler = new StructuredFileParserHandler();
            parserHandler.newSheet("");
            parserHandler.getHints().allowHeaderSelection = false;
            parserHandler.getHints().sendColumnIndices = false;

            for (ColumnDescriptor col : schema.getColumns()) {
                String[] colPath = col.getPath();
                if (colPath.length == 0) {
                    throw new VisalloException("structured file path has no elements");
                }

                if (colPath.length > 1) {
                    LOGGER.warn("not parsing nested element %s", Joiner.on(",").join(col.getPath()));
                    continue;
                }

                parserHandler.addColumn(colPath[0], getColumnType(schema.getType(colPath)));
            }

            DateFormat df = new SimpleDateFormat("yyyy/MM/dd HH:mm");
            df.setTimeZone(TimeZone.getTimeZone("UTC"));
            ReadSupport readSupport = new SimpleReadSupport();
            try (ParquetReader reader = ParquetReader.builder(readSupport, path).build()) {
                int rowNum = 0;

                for (SimpleRecord value = reader.read(); value != null; value = reader.read()) {
                    Map row = Maps.newHashMap();

                    for (SimpleRecord.NameValue nameValue : value.getValues()) {
                        String name = nameValue.getName();
                        Object val = nameValue.getValue();
                        if (!(val instanceof SimpleRecord)) {
                            Type type = schema.getType(schema.getFieldIndex(name));
                            Object recordValue = getRecordValue(val, type);
                            String strValue;
                            if (recordValue instanceof Date) {
                                strValue = df.format((Date) recordValue);
                            } else {
                                strValue = recordValue.toString();
                            }
                            row.put(name, strValue);
                        }
                    }

                    if (!parserHandler.addRow(row, rowNum++)) break;
                }

                parserHandler.setTotalRows(getRowCount(metaData));
                return parserHandler.getResult();
            }
        } finally {
            if (tempFile != null) tempFile.delete();
        }
    }



    private Object getRecordValue(Object value, Type type) {
        String primitive = type.asPrimitiveType().getPrimitiveTypeName().toString();

        // Special handling for INT96 "Timestamp" columns
        if (primitive.equals("INT96")) {
            Date date = getDateFromInt96(value);
            if (date != null) {
                return date;
            }
        }

        ColumnMappingType columnType = getColumnType(type);
        switch (columnType) {
            case Boolean:
                return Boolean.parseBoolean(value.toString());

            case Date:
                Calendar c = new GregorianCalendar();
                c.setTimeZone(TimeZone.getTimeZone("UTC"));
                c.setTime(new Date(0));
                c.add(Calendar.DAY_OF_YEAR, (Integer) value);
                return c.getTime();

            case DateTime:
                return new Date((Long) value);

            case GeoPoint:
                break;
            case Number:
                break;

            case String:
            case Unknown:
            default:

        }

        return value.toString();
    }

    private ColumnMappingType getColumnType(Type type) {
        OriginalType originalType = type.getOriginalType();
        if (originalType == null) {

            String primitiveType = type.asPrimitiveType().getPrimitiveTypeName().toString();
            if (primitiveType.equals("BOOLEAN")) return ColumnMappingType.Boolean;
            if (primitiveType.equals("DECIMAL")) return ColumnMappingType.Number;
            if (primitiveType.equals("DOUBLE")) return ColumnMappingType.Number;
            if (primitiveType.equals("INT96")) return ColumnMappingType.Date;
            if (primitiveType.indexOf("INT") == 0) return ColumnMappingType.Number;
            if (primitiveType.indexOf("UINT") == 0) return ColumnMappingType.Number;

            return ColumnMappingType.Unknown;
        }
        String original = originalType.toString();

        if (original.equals("DATE")) return ColumnMappingType.Date;
        if (original.equals("TIMESTAMP_MILLIS")) return ColumnMappingType.DateTime;
        if (original.indexOf("UTF") == 0) return ColumnMappingType.String;


        return ColumnMappingType.Unknown;
    }

    private Date getDateFromInt96(Object val) {
        if (val instanceof byte[]) {
            byte[] bytes = (byte[]) val;
            ByteBuffer buf = Binary.fromByteArray(bytes).toByteBuffer();
            buf.order(ByteOrder.LITTLE_ENDIAN);
            long timeOfDayNanos = buf.getLong();
            int julianDay = buf.getInt();

            JDateTime date = new JDateTime(new JulianDateStamp(julianDay, (double) timeOfDayNanos / NANOS_PER_DAY));
            return date.convertToDate();
        }
        return null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy