All downloads are FREE. Search and download functionality uses the official Maven repository.

com.thinkbiganalytics.discovery.util.ParserHelper Maven / Gradle / Ivy

The newest version!
package com.thinkbiganalytics.discovery.util;

/*-
 * #%L
 * thinkbig-schema-discovery-api
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.fasterxml.jackson.annotation.JsonProperty;
import com.thinkbiganalytics.discovery.schema.DataTypeDescriptor;
import com.thinkbiganalytics.discovery.schema.Field;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.sql.JDBCType;
import java.sql.Types;
import java.util.List;
import java.util.Locale;

/**
 * Provides utility methods useful for writing parsers: mapping JDBC types to Hive type names,
 * guessing a JDBC type from sample string values, and describing Hive types.
 */
public class ParserHelper {

    private static final Logger log = LoggerFactory.getLogger(ParserHelper.class);

    /**
     * Converts a {@link Types} constant to the name of the Hive type used to represent it.
     *
     * <p>{@code NUMERIC}, {@code DOUBLE} and {@code DECIMAL} all map to {@code "double"}; any type
     * without an explicit mapping falls back to {@code "string"}.
     *
     * @param type a {@link Types} constant, may be {@code null}
     * @return the Hive type name, or {@code null} if {@code type} is {@code null}
     */
    public static String sqlTypeToHiveType(Integer type) {
        // Mirror the JDBCType overload, which also answers null for a null input,
        // instead of failing with a NullPointerException while unboxing in the switch.
        if (type == null) {
            return null;
        }
        switch (type) {
            case Types.BIGINT:
                return "bigint";
            case Types.NUMERIC:
            case Types.DOUBLE:
            case Types.DECIMAL:
                return "double";
            case Types.INTEGER:
                return "int";
            case Types.FLOAT:
                return "float";
            case Types.TINYINT:
                return "tinyint";
            case Types.DATE:
                return "date";
            case Types.TIMESTAMP:
                return "timestamp";
            case Types.BOOLEAN:
                return "boolean";
            case Types.BINARY:
                return "binary";
            default:
                return "string";
        }
    }

    /**
     * Converts the JDBC SQL type to a Hive type name.
     *
     * @param jdbcType the JDBC type, may be {@code null}
     * @return the Hive type name, or {@code null} if {@code jdbcType} is {@code null}
     */
    public static String sqlTypeToHiveType(JDBCType jdbcType) {
        if (jdbcType == null) {
            return null;
        }
        return sqlTypeToHiveType(jdbcType.getVendorTypeNumber());
    }

    /**
     * Derive the corresponding data type from sample values.
     *
     * <p>Empty and {@code null} values are ignored. The first value that parses neither as an
     * {@code int} nor as a {@code double} makes the result {@code VARCHAR} immediately. Otherwise
     * the result is {@code DOUBLE} if any value only parsed as a double, else {@code INTEGER}.
     *
     * @param values a list of string values, may be {@code null}
     * @return the JDBC data type; {@code VARCHAR} when there are no non-empty values
     */
    public static JDBCType deriveJDBCDataType(List<String> values) {

        JDBCType guess = null;
        if (values != null) {
            for (String v : values) {
                if (!StringUtils.isEmpty(v)) {
                    JDBCType currentPass;
                    try {
                        Integer.parseInt(v);
                        currentPass = JDBCType.INTEGER;
                    } catch (NumberFormatException e) {
                        try {
                            Double.parseDouble(v);
                            currentPass = JDBCType.DOUBLE;
                        } catch (NumberFormatException ex) {
                            // return immediately for non-numeric case
                            return JDBCType.VARCHAR;
                        }
                    }
                    // If a double is encountered, use that type; an earlier DOUBLE guess is
                    // never downgraded back to INTEGER.
                    if (guess == null || currentPass == JDBCType.DOUBLE) {
                        guess = currentPass;
                    }
                }
            }
        }
        return (guess == null ? JDBCType.VARCHAR : guess);
    }

    /**
     * Derive data types for every field that does not already have a derived data type.
     *
     * <p>The JDBC type is taken from the field's native data type when present, otherwise it is
     * guessed from the field's sample values. A native data type that is not a {@link JDBCType}
     * name is logged and treated as {@code VARCHAR}. For {@code HIVE} the derived type and the
     * data type descriptor are set; for {@code RDBMS} only the derived type is set; for any other
     * platform the field is left untouched.
     *
     * @param type   the target database platform
     * @param fields the fields, may be {@code null} (in which case nothing happens)
     */
    public static void deriveDataTypes(TableSchemaType type, List<Field> fields) {
        if (fields == null) {
            return;
        }
        for (Field field : fields) {
            if (StringUtils.isEmpty(field.getDerivedDataType())) {
                JDBCType jdbcType = JDBCType.VARCHAR;
                try {
                    if (!StringUtils.isEmpty(field.getNativeDataType())) {
                        jdbcType = JDBCType.valueOf(field.getNativeDataType());
                    } else {
                        jdbcType = deriveJDBCDataType(field.getSampleValues());
                    }
                } catch (IllegalArgumentException e) {
                    // SLF4J substitutes arguments only into "{}" placeholders.
                    log.warn("Unable to convert data type [{}] will be converted to VARCHAR", field.getNativeDataType());
                }

                switch (type) {
                    case HIVE:
                        String hiveType = sqlTypeToHiveType(jdbcType);
                        field.setDerivedDataType(hiveType);
                        field.setDataTypeDescriptor(hiveTypeToDescriptor(hiveType));
                        break;
                    case RDBMS:
                        field.setDerivedDataType(jdbcType.getName());
                        break;
                    default:
                        // No derivation rule for other platforms; leave the field as it is.
                        break;
                }
            }
        }
    }

    /**
     * Builds a descriptor that classifies a Hive type as numeric, date-like or complex.
     *
     * <p>{@code boolean} and {@code string} set no flag. {@code bigint}, {@code double},
     * {@code int}, {@code float}, {@code tinyint} and any type whose name contains
     * {@code decimal} are numeric. {@code date} and {@code timestamp} are dates. Every other
     * type name is flagged as complex. Matching is case-insensitive.
     *
     * @param hiveType the Hive type name, may be {@code null}
     * @return a new descriptor; all flags are {@code false} when {@code hiveType} is {@code null}
     */
    public static DataTypeDescriptor hiveTypeToDescriptor(String hiveType) {
        HiveDataTypeDescriptor descriptor = new HiveDataTypeDescriptor();
        if (hiveType != null) {
            // Locale.ROOT keeps the mapping stable in locales with special case rules
            // (e.g. Turkish, where "INT" would otherwise lower-case to a dotless i).
            String normalized = hiveType.toLowerCase(Locale.ROOT);
            switch (normalized) {
                case "boolean":
                case "string":
                    break;
                case "bigint":
                case "double":
                case "int":
                case "float":
                case "tinyint":
                    descriptor.setNumeric(true);
                    break;
                case "date":
                case "timestamp":
                    descriptor.setDate(true);
                    break;
                default:
                    if (normalized.contains("decimal")) {
                        descriptor.setNumeric(true);
                    } else {
                        descriptor.setComplex(true);
                    }
                    break;
            }
        }
        return descriptor;
    }

    /**
     * Returns the {@link JDBCType} name for a {@link Types} constant.
     *
     * @param dataType a {@link Types} constant
     * @return the name of the matching {@link JDBCType}
     * @throws IllegalArgumentException if {@code dataType} is not a known {@link Types} value
     * @throws NullPointerException     if {@code dataType} is {@code null}
     */
    public static String toNativeType(Integer dataType) {
        return JDBCType.valueOf(dataType).getName();
    }

    /**
     * Mutable {@link DataTypeDescriptor} for Hive types. The three flags are bound to the JSON
     * properties {@code date}, {@code numeric} and {@code complex}; all default to {@code false}.
     */
    static class HiveDataTypeDescriptor implements DataTypeDescriptor {

        @JsonProperty("date")
        boolean isDate;

        @JsonProperty("numeric")
        boolean isNumeric;

        @JsonProperty("complex")
        boolean isComplex;

        @Override
        public Boolean isDate() {
            return isDate;
        }

        @Override
        public Boolean isNumeric() {
            return isNumeric;
        }

        @Override
        public Boolean isComplex() {
            return isComplex;
        }

        public void setDate(boolean date) {
            isDate = date;
        }

        public void setNumeric(boolean numeric) {
            isNumeric = numeric;
        }

        public void setComplex(boolean complex) {
            isComplex = complex;
        }

    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy