com.datastax.data.exploration.common.DataOperator
package com.datastax.data.exploration.common;
import com.csvreader.CsvReader;
import com.csvreader.CsvWriter;
import com.datastax.data.exploration.biz.datatable.DataType;
import com.datastax.data.exploration.util.Consts;
import org.apache.commons.io.FileUtils;
import org.apache.spark.sql.Row;
import org.javatuples.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import static com.datastax.data.exploration.util.CommonUtil.columnIndex;
import static com.datastax.data.exploration.util.CommonUtil.matchType;
/**
* Data persistence utility for data sets.
*/
public class DataOperator {
private static final Logger logger = LoggerFactory.getLogger(DataOperator.class);
private static void checkPath(String fileAddress) {
File path = new File(fileAddress).getParentFile();
if (!path.exists()) {
path.mkdirs();
}
}
/**
* Persists the data set and its inferred column types.
*
* @param id      data set id
* @param columns data set column titles
* @param listRow data set rows
*/
public static void writeDataAndType(String id, String[] columns, List<Row> listRow) {
writeEntireData(id, listRow);
writeType(id, columns, typeJudgment(listRow, columns.length));
}
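// Illustrative usage (a hedged sketch; the "demo" id and the sampled Spark rows
// are hypothetical, not defined by this class):
//   String[] columns = {"name", "age"};
//   List<Row> sampledRows = sampledDataset.collectAsList();
//   DataOperator.writeDataAndType("demo", columns, sampledRows);
// A single call persists the raw rows (writeEntireData) and the inferred column
// types (typeJudgment + writeType) for later reads.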
/**
* Persists the complete sampled data set.
*
* @param id      data set id
* @param listRow data set rows (without the header row)
*/
private static void writeEntireData(String id, List<Row> listRow) {
String fileAddress = FileUrl.entireData(id);
checkPath(fileAddress);
if (listRow != null && listRow.size() > 0) {
int columnSize = listRow.get(0).size();
CsvWriter csvWriter = new CsvWriter(fileAddress, Consts.COA, Charset.forName("UTF-8"));
try {
for (Row row : listRow) {
String[] rowData = new String[columnSize];
for (int i = 0; i < columnSize; i++) {
rowData[i] = row.getString(i);
}
csvWriter.writeRecord(rowData);
}
csvWriter.close();
} catch (IOException e) {
logger.error("write entire data file error!", e);
}
}
}
/**
* Persists the filtered (valid) sampled data set.
*
* @param id      data set id
* @param listRow data set rows (without the header row)
*/
public static void writeExactData(String id, List<String[]> listRow) {
writeData(listRow, FileUrl.exactData(id));
}
/**
* Persists the erroneous rows of the sampled data set.
*
* @param id      data set id
* @param listRow data set rows (without the header row)
*/
public static void writeErrorData(String id, List<String[]> listRow) {
writeData(listRow, FileUrl.errorData(id));
}
private static void writeData(List<String[]> listRow, String fileAddress) {
checkPath(fileAddress);
CsvWriter csvWriter = new CsvWriter(fileAddress, Consts.COA, Charset.forName("UTF-8"));
try {
for (String[] row : listRow) {
csvWriter.writeRecord(row);
}
csvWriter.close();
} catch (IOException e) {
logger.error("write data file error!", e);
}
}
private static List<String[]> readData(String fileAddress) {
List<String[]> csvList = new ArrayList<>();
try {
CsvReader reader = new CsvReader(fileAddress, Consts.COA, Charset.forName("UTF-8"));
while (reader.readRecord()) {
csvList.add(reader.getValues());
}
reader.close();
} catch (Exception ex) {
logger.error("read data file error!", ex);
}
return csvList;
}
/**
* Reads the data set column headers.
*
* @param id data set id
* @return the data set column headers
*/
public static String[] readHeaders(String id) {
File typeFile = new File(FileUrl.type(id));
if (!typeFile.exists()) {
logger.info("type file is not exist!");
}
List<String> lines = readLines(typeFile);
if (lines.size() == 2) {
return lines.get(0).split(Consts.DELIMITER);
} else {
logger.info("read type file error!");
}
return null;
}
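// Illustrative usage (hedged sketch; "demo" is a hypothetical data set id).
// readHeaders parses the type file written by writeType: line 1 holds the column
// titles and line 2 the type names, both joined by Consts.DELIMITER.
//   String[] headers = DataOperator.readHeaders("demo");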
/**
* Reads the complete sampled data set.
*
* @param id data set id
* @return the complete sampled data set
*/
public static List<String[]> readEntireData(String id) {
return readData(FileUrl.entireData(id));
}
/**
* Reads the filtered, valid data set.
*
* @param id data set id
* @return the filtered, valid data set
*/
public static List<String[]> readExactData(String id) {
return readData(FileUrl.exactData(id));
}
/**
* Reads the erroneous data set.
*
* @param id data set id
* @return the erroneous data set
*/
public static List<String[]> readErrorData(String id) {
return readData(FileUrl.errorData(id));
}
/**
* Records whether the data set has a header row.
*
* @param id    data set id
* @param title whether the data set has a header row
*/
public static void writeTitle(String id, String title) {
String fileAddress = FileUrl.title(id);
checkPath(fileAddress);
writeFile(new File(fileAddress), title);
}
/**
* Reads the record of whether the data set has a header row.
*
* @param id data set id
* @return the header-row record of the data set
*/
public static String readTitle(String id) {
String title = null;
try {
File file = new File(FileUrl.title(id));
if (file.exists() && file.isFile()) {
title = FileUtils.readFileToString(file, "UTF-8");
} else {
logger.info("title file is not exist!");
}
} catch (IOException e) {
logger.error("Exception:read title file error!");
}
return title;
}
/**
* Reads the column data types.
*
* @param id file name, i.e. the data set id
* @return map of column name to data type name
*/
public static Map<String, String> readType(String id) {
File typeFile = new File(FileUrl.type(id));
if (!typeFile.exists()) {
logger.info("type file is not exist!");
}
Map<String, String> colNamesAndTypes = new HashMap<>();
if (typeFile.exists() && typeFile.isFile()) {
List<String> lines = readLines(typeFile);
if (lines.size() == 2) {
String[] columns = lines.get(0).split(Consts.DELIMITER);
String[] types = lines.get(1).split(Consts.DELIMITER);
for (int i = 0; i < columns.length; i++) {
colNamesAndTypes.put(columns[i], types[i]);
}
} else {
logger.info("read type file error!");
}
}
return colNamesAndTypes;
}
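// Illustrative usage (hedged sketch; "demo" and "age" are hypothetical names):
//   Map<String, String> types = DataOperator.readType("demo");
//   String ageType = types.get("age");   // the DataType name inferred for the "age" column
// Note that the returned HashMap does not preserve column order.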
/**
* Reads the data set headers and their data types.
*
* @param id data set id
* @return pair of column titles and data types
*/
public static Pair<String[], DataType[]> readTitleAndType(String id) {
List<String> lines = readLines(new File(FileUrl.type(id)));
if (lines.size() != 2) return null;
String[] titles = lines.get(0).split(Consts.DELIMITER);
String[] types = lines.get(1).split(Consts.DELIMITER);
DataType[] dataTypes = new DataType[types.length];
for (int i = 0; i < types.length; i++) {
dataTypes[i] = DataType.getDataType(types[i]);
}
return new Pair<>(titles, dataTypes);
}
/**
* Persists the column data types.
*
* @param id       data set id
* @param columns  data set column headers
* @param typeList data type names
*/
public static void writeType(String id, String[] columns, List<String> typeList) {
String fileAddress = FileUrl.type(id);
checkPath(fileAddress);
StringBuilder typeStr = new StringBuilder();
StringBuilder columnStr = new StringBuilder();
if (typeList.size() == columns.length) {
for (int i = 0; i < typeList.size(); i++) {
typeStr.append(typeList.get(i));
columnStr.append(columns[i]);
if (i < typeList.size() - 1) {
columnStr.append(Consts.DELIMITER);
typeStr.append(Consts.DELIMITER);
}
}
}
List<String> lines = new ArrayList<>();
lines.add(columnStr.toString());
lines.add(typeStr.toString());
writeLines(new File(fileAddress), lines);
}
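// Resulting file layout (hedged sketch; the concrete separator is whatever
// Consts.DELIMITER is configured to in this project):
//   line 1: <column 1><DELIMITER><column 2><DELIMITER>...
//   line 2: <type 1><DELIMITER><type 2><DELIMITER>...
// readHeaders, readType and readTitleAndType all parse exactly this two-line
// layout, so the type file should only be produced through writeType.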
/**
* Infers the data type of each column.
*
* @param listRow    data set rows
* @param columnSize number of columns
* @return inferred data type name per column
*/
private static List<String> typeJudgment(List<Row> listRow, int columnSize) {
List<String> listCol;
List<String> typeList = new ArrayList<>();
for (int i = 0; i < columnSize; i++) {
final int index = i;
listCol = listRow.parallelStream().map(e -> e.getString(index)).filter(Objects::nonNull).distinct().collect(Collectors.toList());
long count = listCol.parallelStream().count();
if (count < 2) {
typeList.add(DataType.TEXT.getName());
} else if (count == 2) {
typeList.add(DataType.BINOMIAL.getName());
} else if (count > 2 && count < 20) {
typeList.add(DataType.POLYNOMIAL.getName());
} else {
typeList.add(parseType(listCol));
}
}
return typeList;
}
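// Worked example of the thresholds used above:
//   0 or 1 distinct non-null values -> TEXT
//   exactly 2 distinct values       -> BINOMIAL
//   3 to 19 distinct values         -> POLYNOMIAL
//   20 or more distinct values      -> delegated to parseType(listCol)
// So a column containing only {"M", "F"} is BINOMIAL, while a column of 100
// distinct integer strings falls through to parseType and comes back as INT.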
private static String parseType(List<String> listCol) {
SimpleDateFormat formatDate = new SimpleDateFormat("yyyy-MM-dd");
SimpleDateFormat formatDateTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
SimpleDateFormat formatTime = new SimpleDateFormat("HH:mm");
for (int i = 0; i < listCol.size(); i++) {
try {
Integer.parseInt(listCol.get(i));
} catch (Exception e) {
break;
}
if (i == listCol.size() - 1) {
return DataType.INT.getName();
}
}
for (int i = 0; i < listCol.size(); i++) {
try {
Double.parseDouble(listCol.get(i));
} catch (Exception e) {
break;
}
if (i == listCol.size() - 1) {
return DataType.DECIMAL.getName();
}
}
for (int i = 0; i < listCol.size(); i++) {
try {
formatDateTime.parse(listCol.get(i));
} catch (Exception e) {
break;
}
if (i == listCol.size() - 1) {
return DataType.DATE_TIME.getName();
}
}
for (int i = 0; i < listCol.size(); i++) {
try {
formatDate.parse(listCol.get(i));
} catch (Exception e) {
break;
}
if (i == listCol.size() - 1) {
return DataType.DATE.getName();
}
}
for (int i = 0; i < listCol.size(); i++) {
try {
formatTime.parse(listCol.get(i));
} catch (Exception e) {
break;
}
if (i == listCol.size() - 1) {
return DataType.TIME.getName();
}
}
return DataType.TEXT.getName();
}
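// Illustrative mapping for parseType (hedged sketch; a check only succeeds if
// every distinct value in the column parses with the corresponding format):
//   "42", "1024"          -> INT
//   "3.14", "2.0"         -> DECIMAL
//   "2020-01-01 08:30:00" -> DATE_TIME
//   "2020-01-01"          -> DATE
//   "08:30"               -> TIME
//   anything else         -> TEXT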
/**
* Splits the data set into valid and erroneous rows.
*
* @param id data set id
* @return pair of the valid data set and the erroneous data set
*/
public static Pair<List<String[]>, List<String[]>> filterData(String id) {
Map<String, String> typesMap = DataOperator.readType(id);
List<String[]> listRow = DataOperator.readEntireData(id);
String[] columns = DataOperator.readHeaders(id);
// synchronized wrapper: error rows are collected from the parallel stream below
List<String[]> listErrorRow = Collections.synchronizedList(new ArrayList<>());
List<String[]> listExactRow = listRow
.parallelStream()
.filter(row -> {
boolean matchFlag = true;
for (String column : columns) {
String col = row[columnIndex(columns, column)];
if (col == null || col.trim().isEmpty()) {
matchFlag = false;
break;
}
}
if (!matchFlag) {
listErrorRow.add(row);
}
return matchFlag;
})
.filter(row -> {
boolean matchFlag = true;
for (String column : columns) {
String col = row[columnIndex(columns, column)];
matchFlag = matchType(col, typesMap.get(column));
if (!matchFlag) {
break;
}
}
if (!matchFlag) {
listErrorRow.add(row);
}
return matchFlag;
})
.collect(Collectors.toList());
return Pair.with(listExactRow, listErrorRow);
}
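// Illustrative usage (hedged sketch; "demo" is a hypothetical data set id):
//   Pair<List<String[]>, List<String[]>> split = DataOperator.filterData("demo");
//   DataOperator.writeExactData("demo", split.getValue0());  // rows matching the declared types
//   DataOperator.writeErrorData("demo", split.getValue1());  // rows with blanks or type mismatches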
/**
* Reads statistics from file.
*
* @param fileName name of the file holding the statistics
* @return the statistics string read from the file
*/
public static String readStatistics(String fileName) {
return readFile(new File(FileUrl.statistic(fileName)));
}
/**
* Writes statistics to file.
*
* @param fileName name of the file holding the statistics
* @param content  the statistics string to save
*/
public static void writeStatistics(String fileName, String content) {
writeFile(new File(FileUrl.statistic(fileName)), content);
}
/**
* Reads the complete data set together with its headers and builds a map from column name to column values.
*
* @param id file name (data set id)
* @return the resulting map
*/
public static Map<String, List<String>> readEntireDataWithHead(String id) {
List<String[]> strings = DataOperator.readEntireData(id);
String[] headers = DataOperator.readHeaders(id);
if (strings == null || headers == null) return null;
Map<String, List<String>> map = new HashMap<>();
for (int i = 0; i < headers.length; i++) {
int finalI = i;
map.put(headers[i], strings.stream().map(e -> e[finalI]).collect(Collectors.toList()));
}
return map;
}
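// Illustrative result shape (hedged sketch): for headers ["name", "age"] the
// returned map is {"name" -> all values of the name column, "age" -> all values
// of the age column}, i.e. the row-oriented CSV rows are pivoted into
// column-oriented lists keyed by header.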
/**
* Reads the filtered, valid data set together with its headers and builds a map from column name to column values.
*
* @param id file name (data set id)
* @return the resulting map
*/
public static Map<String, List<String>> readExactDataWithHead(String id) {
List<String[]> strings = DataOperator.readExactData(id);
String[] headers = DataOperator.readHeaders(id);
if (strings == null || headers == null) return null;
Map<String, List<String>> map = new HashMap<>();
for (int i = 0; i < headers.length; i++) {
int finalI = i;
map.put(headers[i], strings.stream().map(e -> e[finalI]).collect(Collectors.toList()));
}
return map;
}
/**
* Writes the data report file.
*/
public static void writeReport(String id, List<String> lines) {
writeLines(new File(FileUrl.report(id)), lines);
}
/**
* Reads the data report file.
*/
public static List<String> readReport(String id) {
return readLines(new File(FileUrl.report(id)));
}
/**
* File I/O helpers
**/
private static void writeFile(File file, String content) {
try {
FileUtils.writeStringToFile(file, content, "UTF-8");
} catch (IOException e) {
logger.error("Exception:write title file error!", e);
}
}
private static void writeLines(File file, List<String> lines) {
try {
FileUtils.writeLines(file, "UTF-8", lines);
} catch (IOException e) {
logger.error("Exception:write title file error!", e);
}
}
private static String readFile(File file) {
try {
return FileUtils.readFileToString(file, "UTF-8");
} catch (IOException e) {
logger.error("Exception:read type file error", e);
return "";
}
}
private static List<String> readLines(File file) {
try {
return FileUtils.readLines(file, "UTF-8");
} catch (IOException e) {
logger.error("Exception:read type file error", e);
return new ArrayList<>();
}
}
/**
* Parses the id combination into a path, e.g. userId, flowId-dataId-componetId => /userId/flowId/dataId/componetId
* @param id     combined id: flowId-dataId-componetId
* @param userId user id
* @return the resolved path
*/
private static String parseIdUrl(Long userId, String id) {
String[] ids = id.split("-");
StringBuilder idPath = new StringBuilder();
idPath.append(File.separatorChar);
idPath.append(userId);
for (String str: ids) {
idPath.append(File.separatorChar);
idPath.append(str);
}
return idPath.toString();
}
}