com.wl4g.infra.common.dataformat.orc.OrcJsonHolder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of infra-common Show documentation
A Infra Common.
There is a newer version: 3.1.72
/*
 * Copyright 2023 ~ 2030 the original author or authors. James Wong 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *      http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.wl4g.infra.common.dataformat.orc;

import lombok.AllArgsConstructor;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.util.Progressable;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.impl.PhysicalFsWriter;
import org.apache.orc.impl.writer.WriterEncryptionVariant;

import javax.annotation.Nullable;
import javax.validation.constraints.Min;
import javax.validation.constraints.NotNull;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import static com.wl4g.infra.common.collection.CollectionUtils2.isEmpty;
import static com.wl4g.infra.common.lang.Assert2.notNullOf;
import static java.util.Collections.emptyList;
import static java.util.Objects.isNull;
import static java.util.Objects.nonNull;
import static org.apache.commons.lang3.SystemUtils.LINE_SEPARATOR;
import static org.apache.orc.OrcFile.CompressionStrategy.COMPRESSION;

/**
 * The {@link OrcJsonHolder} class provides conversion utilities between ORC and JSON.
 */
@AllArgsConstructor
public abstract class OrcJsonHolder {
    // Default timestamp pattern (ISO-8601 style with milliseconds and zone offset) used by the no-arg constructor.
    private static final String DEFAULT_DATE_FORMATTER = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX";
    // Shared Hadoop configuration handed to the ORC writer options and to the stream file system.
    private static final Configuration DEFAULT_CONF = new Configuration();
    // Placeholder path passed to OrcFile.createWriter; the data itself goes to the caller-supplied stream.
    private static final Path DEFAULT_DUMMY_PATH = new Path("/dev/null");
    // Placeholder file status for the dummy path, passed to OrcStreamFileSystem.
    private static final FileStatus DEFAULT_STATUS = new FileStatus(0, false, 0, 0, 0, DEFAULT_DUMMY_PATH);

    // NOTE(review): the class carries Lombok's @AllArgsConstructor, so the declaration order of the
    // instance fields below presumably defines the generated constructor's parameter order — confirm
    // before reordering.
    // When true the ORC writer is backed by a PhysicalFsWriter, otherwise by an OrcStreamFileSystem.
    private boolean usePhysicalFsWriter;
    // Size passed to TypeDescription#createRowBatch when writing.
    private @Min(0) int batchMaxSize;
    // Timestamp pattern forwarded to createRecordReader; may be null.
    private @Nullable String timestampFormat;
    // Properties passed to OrcFile.writerOptions; may be null.
    private @Nullable Properties options;

    @SuppressWarnings("unused")
    protected OrcJsonHolder() {
        this.usePhysicalFsWriter = true;
        this.batchMaxSize = 1024 * 1024;
        this.timestampFormat = DEFAULT_DATE_FORMATTER;
        this.options = new Properties() {
            {
                setProperty(OrcConf.COMPRESSION_STRATEGY.name(), COMPRESSION.name());
            }
        };
    }

    // ----- Get ORC schema from JSON -----

    /**
     * Infers the ORC schema that describes the given JSON record.
     *
     * @param jsonNode The JSON record node, must not be null
     * @return The ORC schema type inferred for the record.
     */
    public TypeDescription getSchema(@NotNull Object jsonNode) {
        // Reject a missing record up front, then delegate to the implementation-specific inference.
        notNullOf(jsonNode, "jsonNode");
        final TypeDescription inferredSchema = getSchemaFromJsonObject(jsonNode);
        return inferredSchema;
    }

    /**
     * Get the ORC schema type for the given JSON node.
     * <p>
     * Implementation hook invoked by {@link #getSchema(Object)} for the record itself and by
     * {@link #getListSchemaFromArrayNode(Iterable)} for every element of an array node.
     *
     * @param jsonNode The JSON node of the record (or of an array element)
     * @return The ORC schema type.
     */
    protected abstract TypeDescription getSchemaFromJsonObject(@NotNull Object jsonNode);

    /**
     * Builds the ORC list schema for an array node: every element is mapped to its own
     * schema and the element schemas are then merged into a single child type.
     * <p>
     * An array without elements ends in the {@link IllegalArgumentException} raised by
     * {@link #getMergedSchemaType(List)} for an empty list.
     *
     * @param arrayNodeElements The elements of the array node
     * @return The ORC list schema type wrapping the merged element type.
     */
    @SuppressWarnings("all")
    protected TypeDescription getListSchemaFromArrayNode(Iterable arrayNodeElements) {
        final List elementSchemas = new ArrayList<>();
        final Iterator elements = arrayNodeElements.iterator();
        while (elements.hasNext()) {
            elementSchemas.add(getSchemaFromJsonObject(elements.next()));
        }
        final TypeDescription elementType = getMergedSchemaType(elementSchemas);
        return TypeDescription.createList(elementType);
    }

    /**
     * Get the merged ORC schema type for the given list of types.
     * <p>
     * The types are folded left to right with
     * {@link #getMergedSchemaType(TypeDescription, TypeDescription)}, starting from the first element.
     *
     * @param schemas The list of types, must contain at least one element
     * @return The merged ORC schema type.
     * @throws IllegalArgumentException If the list is empty or two of the types cannot be merged.
     */
    protected TypeDescription getMergedSchemaType(List<TypeDescription> schemas) {
        if (isEmpty(schemas)) {
            throw new IllegalArgumentException("Cannot determine common type for an empty list");
        }
        // Seed the fold with the first element and merge only the remaining ones; merging the
        // first element with itself would just return it unchanged.
        final Iterator<TypeDescription> remaining = schemas.iterator();
        TypeDescription mergedType = remaining.next();
        while (remaining.hasNext()) {
            mergedType = getMergedSchemaType(mergedType, remaining.next());
        }
        return mergedType;
    }

    /**
     * Get the common ORC schema type for the given two types.
     * <p>
     * Types of the same category yield {@code first} unchanged. Two integer categories
     * (BYTE, SHORT, INT, LONG) widen to LONG; any mix of an integer category with FLOAT or
     * DOUBLE, or of FLOAT with DOUBLE, widens to DOUBLE regardless of argument order.
     *
     * @param first  The first type
     * @param second The second type
     * @return The common ORC schema type.
     * @throws IllegalArgumentException If no common type can be determined.
     */
    protected TypeDescription getMergedSchemaType(TypeDescription first, TypeDescription second) {
        if (first.getCategory() == second.getCategory()) {
            return first;
        }
        switch (first.getCategory()) {
            case BYTE:
            case SHORT:
            case INT:
            case LONG:
                switch (second.getCategory()) {
                    case BYTE:
                    case SHORT:
                    case INT:
                    case LONG:
                        return TypeDescription.createLong();
                    case FLOAT:
                    case DOUBLE:
                        return TypeDescription.createDouble();
                    default:
                        throw new IllegalArgumentException("Cannot determine common type for " + first + " and " + second);
                }
            case FLOAT:
                switch (second.getCategory()) {
                    // Mirror of the integer branch above: integer + FLOAT already widens to
                    // DOUBLE, so FLOAT + integer must do the same instead of failing.
                    case BYTE:
                    case SHORT:
                    case INT:
                    case LONG:
                    case DOUBLE:
                        return TypeDescription.createDouble();
                    default:
                        throw new IllegalArgumentException("Cannot determine common type for " + first + " and " + second);
                }
            case DOUBLE:
                // NOTE(review): DOUBLE absorbs any second category here, including non-numeric
                // ones — kept as-is for compatibility; confirm whether that is intended.
                return TypeDescription.createDouble();
            default:
                throw new IllegalArgumentException("Cannot determine common type for " + first + " and " + second);
        }
    }

    // ----- Write ORC from JSON -----

    /**
     * Serializes the given records to JSON, one record per line, and writes them as ORC
     * data to the given output stream.
     *
     * @param records   The records to write; each one is serialized with {@code toJsonByteArray}
     * @param schema    The ORC schema of the records
     * @param magic     Optional single byte written at the very start of the JSON buffer,
     *                  before the first record; skipped when null
     * @param orcOutput The stream receiving the ORC data
     * @return The statistics returned by the byte-array based {@code writeToOrc} overload.
     * @throws IOException If an I/O error occurs.
     */
    @SuppressWarnings("all")
    public FileSystem.Statistics writeToOrc(@NotNull List records,
                                            @NotNull TypeDescription schema,
                                            @Nullable Byte magic,
                                            @NotNull OutputStream orcOutput) throws IOException {
        notNullOf(records, "records");
        notNullOf(schema, "schema");
        // Fail fast on a missing target stream before buffering every record in memory.
        notNullOf(orcOutput, "orcOutput");

        // Each json is at least 16B, which is a good number
        final ByteArrayOutputStream jsonOutput = new ByteArrayOutputStream(records.size() * 16);
        if (nonNull(magic)) {
            jsonOutput.write(magic);
        }
        // Concatenate all json records into one buffer, each followed by the platform line
        // separator; the separator bytes are loop-invariant, so compute them once.
        final byte[] delimiter = LINE_SEPARATOR.getBytes();
        for (Object record : records) {
            jsonOutput.write(toJsonByteArray(record));
            jsonOutput.write(delimiter);
        }
        return writeToOrc(jsonOutput.toByteArray(), schema, orcOutput);
    }

    /**
     * Reads the given JSON bytes with the record reader supplied by {@code createRecordReader}
     * and writes every row batch as ORC data to the given output stream.
     *
     * @param jsonBytes The JSON input bytes
     * @param schema    The ORC schema of the rows
     * @param orcOutput The stream receiving the ORC data
     * @return The statistics object attached to the wrapped output stream.
     * @throws IOException If an I/O error occurs.
     */
    @SuppressWarnings("ConstantConditions")
    public FileSystem.Statistics writeToOrc(@NotNull byte[] jsonBytes,
                                            @NotNull TypeDescription schema,
                                            @NotNull OutputStream orcOutput) throws IOException {
        notNullOf(jsonBytes, "jsonBytes");
        notNullOf(schema, "schema");
        // Validate the target stream like the other arguments instead of failing later
        // somewhere inside the writer.
        notNullOf(orcOutput, "orcOutput");

        final OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(options, DEFAULT_CONF)
                                                           .setSchema(schema)
                                                           .version(OrcFile.Version.CURRENT);
        final FileSystem.Statistics stats = new FileSystem.Statistics("stream://");
        final FSDataOutputStream outputStream = new FSDataOutputStream(orcOutput, stats);
        // Route the writer's output to the caller's stream: either through a physical writer
        // bound to the stream, or through a stream-backed file system. The writer is still
        // created against the dummy path below.
        if (usePhysicalFsWriter) {
            final PhysicalFsWriter physicalFsWriter = new PhysicalFsWriter(outputStream,
                    writerOptions, new WriterEncryptionVariant[0]);
            writerOptions.physicalWriter(physicalFsWriter);
        } else {
            writerOptions.fileSystem(new OrcStreamFileSystem(outputStream, DEFAULT_STATUS, DEFAULT_CONF));
        }
        try (final Writer writer = OrcFile.createWriter(DEFAULT_DUMMY_PATH, writerOptions)) {
            final VectorizedRowBatch rowBatch = schema.createRowBatch(batchMaxSize);
            try (FSDataInputStream inputStream = new FSDataInputStream(new PositionedByteArrayInputStream(jsonBytes));
                 final RecordReader reader = createRecordReader(inputStream, jsonBytes.length, schema, timestampFormat)) {
                // Pump batches from the JSON reader into the ORC writer until the reader is drained.
                while (reader.nextBatch(rowBatch)) { // There is added data in this batch?
                    writer.addRowBatch(rowBatch);
                }
            }
        }
        return stats;
    }

    /**
     * Serializes a single record to its JSON bytes.
     * <p>
     * Called once per record by the list based {@code writeToOrc} overload; the returned
     * bytes are appended to the JSON buffer, followed by a line separator.
     *
     * @param record The record to serialize
     * @return The JSON bytes of the record.
     */
    protected abstract byte[] toJsonByteArray(Object record);

    /**
     * Creates the record reader whose batches are written to ORC by {@code writeToOrc}.
     * <p>
     * NOTE(review): despite its name, {@code orcInput} carries the JSON bytes when called
     * from {@code writeToOrc} — confirm whether the parameter name is intentional.
     *
     * @param orcInput        The input stream to read from
     * @param length          The input length; {@code writeToOrc} passes the length of the JSON byte array
     * @param schema          The ORC schema of the rows
     * @param timestampFormat The timestamp pattern, may be null
     * @return The record reader; {@code writeToOrc} closes it after the last batch.
     * @throws IOException If an I/O error occurs.
     */
    protected abstract RecordReader createRecordReader(@NotNull FSDataInputStream orcInput,
                                                       @Min(0) int length,
                                                       @NotNull TypeDescription schema,
                                                       @Nullable String timestampFormat) throws IOException;

    // ----- Read ORC to JSON -----

    /**
     * Read ORC data to json node records.
     *
     * @param orcBytes The ORC input stream
     * @return The json records
     * @throws IOException If an I/O error occurs.
     */
    public List