// com.thinkbiganalytics.discovery.util.ParserHelper (Maven / Gradle / Ivy)
// The newest version!
package com.thinkbiganalytics.discovery.util;
/*-
* #%L
* thinkbig-schema-discovery-api
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.fasterxml.jackson.annotation.JsonProperty;
import com.thinkbiganalytics.discovery.schema.DataTypeDescriptor;
import com.thinkbiganalytics.discovery.schema.Field;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.sql.JDBCType;
import java.sql.Types;
import java.util.List;
import java.util.Locale;
/**
* Provides utility methods useful for writing parsers
*/
public class ParserHelper {
private static final Logger log = LoggerFactory.getLogger(ParserHelper.class);
public static String sqlTypeToHiveType(Integer type) {
switch (type) {
case Types.BIGINT:
return "bigint";
case Types.NUMERIC:
case Types.DOUBLE:
case Types.DECIMAL:
return "double";
case Types.INTEGER:
return "int";
case Types.FLOAT:
return "float";
case Types.TINYINT:
return "tinyint";
case Types.DATE:
return "date";
case Types.TIMESTAMP:
return "timestamp";
case Types.BOOLEAN:
return "boolean";
case Types.BINARY:
return "binary";
default:
return "string";
}
}
/*
Convert the JDBC sql type to a hive type
*/
public static String sqlTypeToHiveType(JDBCType jdbcType) {
if (jdbcType != null) {
Integer type = jdbcType.getVendorTypeNumber();
return sqlTypeToHiveType(type);
}
return null;
}
/**
* Derive the corresponding data type from sample values
*
* @param values a list of string values
* @return the JDBC data type
*/
public static JDBCType deriveJDBCDataType(List values) {
JDBCType guess = null;
if (values != null) {
for (String v : values) {
if (!StringUtils.isEmpty(v)) {
JDBCType currentPass;
try {
Integer.parseInt(v);
currentPass = JDBCType.INTEGER;
} catch (NumberFormatException e) {
try {
Double.parseDouble(v);
currentPass = JDBCType.DOUBLE;
} catch (NumberFormatException ex) {
// return immediately for non-numeric case
return JDBCType.VARCHAR;
}
}
// If a double is encountered, use that type
if (guess == null || currentPass == JDBCType.DOUBLE) {
guess = currentPass;
}
}
}
}
return (guess == null ? JDBCType.VARCHAR : guess);
}
/**
* Derive data types
*
* @param type the target database platform
* @param fields the fields
*/
public static void deriveDataTypes(TableSchemaType type, List extends Field> fields) {
for (Field field : fields) {
if (StringUtils.isEmpty(field.getDerivedDataType())) {
JDBCType jdbcType = JDBCType.VARCHAR;
try {
if (!StringUtils.isEmpty(field.getNativeDataType())) {
jdbcType = JDBCType.valueOf(field.getNativeDataType());
} else {
jdbcType = deriveJDBCDataType(field.getSampleValues());
}
} catch (IllegalArgumentException e) {
log.warn("Unable to convert data type [?] will be converted to VARCHAR", field.getNativeDataType());
}
switch (type) {
case HIVE:
String hiveType = sqlTypeToHiveType(jdbcType);
field.setDerivedDataType(hiveType);
field.setDataTypeDescriptor(hiveTypeToDescriptor(hiveType));
break;
case RDBMS:
field.setDerivedDataType(jdbcType.getName());
}
}
}
}
/*
Returns whether the provided field represents a complex structure such as ARRAY, STRUCT, or BINARY
*/
public static DataTypeDescriptor hiveTypeToDescriptor(String hiveType) {
HiveDataTypeDescriptor descriptor = new HiveDataTypeDescriptor();
if (hiveType != null) {
hiveType = hiveType.toLowerCase();
switch (hiveType) {
case "boolean":
case "string":
break;
case "bigint":
case "double":
case "int":
case "float":
case "tinyint":
descriptor.setNumeric(true);
break;
case "date":
case "timestamp":
descriptor.setDate(true);
break;
default:
if (hiveType.contains("decimal")) {
descriptor.setNumeric(true);
} else {
descriptor.setComplex(true);
}
}
}
return descriptor;
}
public static String toNativeType(Integer dataType) {
return JDBCType.valueOf(dataType).getName();
}
static class HiveDataTypeDescriptor implements DataTypeDescriptor {
@JsonProperty("date")
boolean isDate;
@JsonProperty("numeric")
boolean isNumeric;
@JsonProperty("complex")
boolean isComplex;
@Override
public Boolean isDate() {
return isDate;
}
@Override
public Boolean isNumeric() {
return isNumeric;
}
@Override
public Boolean isComplex() {
return isComplex;
}
public void setDate(boolean date) {
isDate = date;
}
public void setNumeric(boolean numeric) {
isNumeric = numeric;
}
public void setComplex(boolean complex) {
isComplex = complex;
}
}
}