
com.datastax.data.prepare.util.SharedMethods Maven / Gradle / Ivy
package com.datastax.data.prepare.util;
import org.apache.parquet.Strings;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
// TODO(andy): naming
public class SharedMethods {
private static final Logger logger = LoggerFactory.getLogger(SharedMethods.class);
/**
 * Filters the schema fields of {@code data} according to {@code attributeSelector}.
 *
 * <ul>
 *   <li>{@code Consts.ATTRIBUTE_NAME}: {@code attribute} is split on {@code Consts.DELIMITER},
 *       blank entries are dropped, and the remaining names are passed to
 *       {@code getSelectedField} with mode {@code 1}. A null or empty {@code attribute}
 *       selects nothing.</li>
 *   <li>{@code Consts.REGULAR_EXPRESSION}: a field is kept when "its name fully matches
 *       {@code regularExpression}" differs from {@code invertSelection}. A null or empty
 *       expression selects nothing.</li>
 *   <li>{@code Consts.VALUE_TYPE}: a list of Spark data types is chosen from {@code valueType}
 *       and passed to {@code getSelectedField} with mode {@code 2}.</li>
 * </ul>
 *
 * With any other selector the schema fields are left unfiltered; combined with
 * {@code invertSelection == true} that yields {@code null}.
 *
 * NOTE(review): modes 1 and 2 of {@code getSelectedField} presumably mean "match by name" and
 * "match by data type" -- confirm against that helper.
 *
 * @param data              dataset whose schema is inspected
 * @param attributeSelector which selection strategy to apply
 * @param invertSelection   whether the selection is inverted
 * @param attribute         delimiter-separated attribute names (name strategy)
 * @param regularExpression regular expression applied to field names (regex strategy)
 * @param valueType         value type category (value-type strategy)
 * @return the selected fields with trailing {@code null} slots removed, or {@code null}
 *         when nothing is selected
 */
public static StructField[] attributeFilter(Dataset data, final String attributeSelector, final boolean invertSelection, final String attribute,
                                            final String regularExpression, final String valueType) {
    StructField[] selected = data.schema().fields();
    // Remains true until one of the selector branches below has actually filtered.
    boolean untouched = true;

    if (Consts.ATTRIBUTE_NAME.equals(attributeSelector)) {
        if (attribute == null || attribute.isEmpty()) {
            selected = null;
        } else {
            String[] names = handleColsWithEmpty(attribute.split(Consts.DELIMITER));
            if (names.length == 0) {
                logger.info("Attribute Name 的属性为空");
                return null;
            }
            selected = getSelectedField(selected, names, invertSelection, 1);
            untouched = false;
        }
    }

    if (Consts.REGULAR_EXPRESSION.equals(attributeSelector)) {
        if (regularExpression == null || regularExpression.isEmpty()) {
            selected = null;
        } else {
            Pattern namePattern = Pattern.compile(regularExpression);
            int total = (selected == null) ? 0 : selected.length;
            // Kept fields are packed at the front; unused tail slots stay null.
            StructField[] matched = new StructField[total];
            int next = 0;
            for (int idx = 0; idx < total; idx++) {
                StructField candidate = selected[idx];
                boolean fullMatch = namePattern.matcher(candidate.name()).matches();
                if (fullMatch != invertSelection) {
                    matched[next++] = candidate;
                }
            }
            selected = matched;
        }
        untouched = false;
    }

    if (Consts.VALUE_TYPE.equals(attributeSelector)) {
        DataType[] wantedTypes = null;
        if (Consts.NUMERIC.equals(valueType)) {
            wantedTypes = new DataType[]{DataTypes.IntegerType, DataTypes.ByteType, DataTypes.DoubleType,
                    DataTypes.FloatType, DataTypes.LongType, DataTypes.ShortType};
        }
        if (Consts.INTEGER.equals(valueType)) {
            wantedTypes = new DataType[]{DataTypes.ShortType, DataTypes.IntegerType, DataTypes.ByteType, DataTypes.LongType};
        }
        if (Consts.DATE.equals(valueType)) {
            wantedTypes = new DataType[]{DataTypes.DateType};
        }
        if (Consts.TIMESTAMP.equals(valueType)) {
            wantedTypes = new DataType[]{DataTypes.TimestampType};
        }
        if (wantedTypes == null) {
            // Fallback for every value type not handled above.
            wantedTypes = new DataType[]{DataTypes.StringType, DataTypes.CalendarIntervalType, DataTypes.BinaryType};
        }
        selected = getSelectedField(selected, wantedTypes, invertSelection, 2);
        untouched = false;
    }

    // Inverting "everything" (no selector applied) selects nothing.
    boolean invertedWithoutSelector = untouched & invertSelection;
    if (selected != null && !invertedWithoutSelector) {
        return dropSuffixEmpty(selected);
    }
    logger.info("None of Attributes is selected");
    return null;
}
/**
 * Returns a copy of {@code fields} with the trailing {@code null} entries removed.
 * Entries before the last non-null element are copied unchanged, including any nulls
 * among them.
 *
 * @param fields array whose tail may be padded with {@code null}
 * @return a new array ending at the last non-null element; an empty array when
 *         {@code fields} is empty or contains only {@code null}
 */
private static StructField[] dropSuffixEmpty(StructField[] fields) {
    int end = fields.length;
    // Bounded scan: an empty or all-null array must stop at 0 instead of indexing -1.
    while (end > 0 && fields[end - 1] == null) {
        end--;
    }
    StructField[] trimmed = new StructField[end];
    System.arraycopy(fields, 0, trimmed, 0, end);
    return trimmed;
}
private static StructField[] getSelectedField(StructField[] fields, Object[] temp, boolean invertSelection, int type) {
StructField[] result = new StructField[fields.length];
int[] sign = new int[fields.length];
boolean flag;
for(int i=0; i fileList, String... suffixs) {
if(!file.exists()) {
logger.info("file does not exists");
return ;
}
if(file.isDirectory()) {
File[] files = file.listFiles();
if(files == null) {
return ;
}
for(File temp : files) {
if(file.isDirectory()) {
filesFilter(temp, fileList, suffixs);
}else {
if(accept(file.getName(), suffixs)) { fileList.add(file); }
}
}
}else {
if(accept(file.getName(), suffixs)) { fileList.add(file); }
}
}
/**
 * Decides whether {@code fileName} passes the suffix filter.
 *
 * @param fileName name of the file to check
 * @param suffixs  accepted suffixes; null or empty entries are ignored
 * @return {@code true} when {@code suffixs} is empty or {@code fileName} ends with one of
 *         the non-empty suffixes; {@code false} otherwise
 */
private static boolean accept(String fileName, String[] suffixs) {
    // No suffixes given means no filtering at all.
    if (suffixs.length == 0) {
        return true;
    }
    for (int idx = 0; idx < suffixs.length; idx++) {
        String candidate = suffixs[idx];
        if (Strings.isNullOrEmpty(candidate)) {
            continue;
        }
        if (fileName.endsWith(candidate)) {
            return true;
        }
    }
    return false;
}
// TODO(andy): unused
/**
 * Checks that the first column of every row matches {@code regex}.
 * Rows whose first column is {@code null} or the empty string are skipped.
 *
 * @param rows  rows whose column 0 is read as a string
 * @param regex regular expression each non-empty value must match in full
 * @return {@code true} when no non-empty value fails to match, {@code false} otherwise
 */
public static boolean checkColumnFormat(Row[] rows, String regex) {
    // Compiled once, and only when a value actually needs checking, so an input with
    // nothing to check still never touches the regex (same as String.matches per row).
    Pattern pattern = null;
    for (Row row : rows) {
        String value = row.getString(0);
        if (Strings.isNullOrEmpty(value)) {
            continue;
        }
        if (pattern == null) {
            pattern = Pattern.compile(regex);
        }
        if (!pattern.matcher(value).matches()) {
            return false;
        }
    }
    return true;
}
/**
 * Trims every entry of {@code cols} and drops the ones that are empty after trimming.
 *
 * @param cols raw column names, e.g. the result of splitting a delimited string;
 *             a {@code null} array or {@code null} entries are tolerated and ignored
 * @return the trimmed, non-empty entries in their original order (never {@code null})
 */
public static String[] handleColsWithEmpty(String[] cols) {
    if (cols == null) {
        return new String[0];
    }
    // Typed list: a raw List would make toArray(...) return Object[].
    List<String> kept = new ArrayList<>(cols.length);
    for (String col : cols) {
        if (col == null) {
            continue;
        }
        String trimmed = col.trim();
        if (!trimmed.isEmpty()) {
            kept.add(trimmed);
        }
    }
    return kept.toArray(new String[0]);
}
public static void recordSchema(StructField[] fields, Map map) {
for(int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy