// com.clickzetta.platform.tools.IngestData (from clickzetta-java, the Java SDK for ClickZetta's Lakehouse)
package com.clickzetta.platform.tools;
import com.clickzetta.platform.client.Table;
import com.clickzetta.platform.client.api.BulkLoadState;
import com.clickzetta.platform.client.api.BulkLoadStream;
import com.clickzetta.platform.client.api.BulkLoadWriter;
import com.clickzetta.platform.client.api.Client;
import com.clickzetta.platform.client.api.ClientBuilder;
import com.clickzetta.platform.client.api.ErrorTypeHandler;
import com.clickzetta.platform.client.api.Message;
import com.clickzetta.platform.client.api.Row;
import com.clickzetta.platform.client.api.Stream;
import com.clickzetta.platform.common.Type;
import com.clickzetta.platform.util.JsonParser;
import com.google.common.base.Preconditions;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
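/**
 * Command-line tool that ingests delimiter-separated text files into a
 * ClickZetta Lakehouse table, either through a realtime {@link Stream}
 * (insert/update/upsert/delete/delete_ignore rows) or through a
 * {@link BulkLoadStream} when bulkLoadMode is set in the conf.
 *
 * Connection, table, and field-mapping settings come from a JSON conf file
 * (-c/--conf); selected values can be overridden on the command line.
 */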
public class IngestData {
private static final Logger LOG = LoggerFactory.getLogger(IngestData.class);
private static final List<String> defaultNullValueString = new ArrayList<String>() {
{
add("NULL");
}
};
private final WrapConf conf;
private final Client client;
private final boolean isBulkLoadMode;
public IngestData(String[] args) throws IOException {
conf = parameterParser(args);
isBulkLoadMode = conf.bulkLoadMode;
Preconditions.checkArgument(!StringUtils.isEmpty(conf.crlHost) || !StringUtils.isEmpty(conf.streamUrl),
"either crlHost or streamUrl must be set");
if (!StringUtils.isEmpty(conf.crlHost)) {
Preconditions.checkArgument(conf.crlPort > 0, "crlPort must be greater than 0");
}
Preconditions.checkArgument(!StringUtils.isEmpty(conf.schemaName), "schemaName can not be empty");
Preconditions.checkArgument(!StringUtils.isEmpty(conf.tableName), "tableName can not be empty");
if (!isBulkLoadMode) {
Preconditions.checkArgument(!StringUtils.isEmpty(conf.operator), "operator can not be empty");
}
Preconditions.checkArgument(!CollectionUtils.isEmpty(conf.filePath), "filePath can not be empty");
Preconditions.checkArgument(!MapUtils.isEmpty(conf.fieldMap), "fieldMap can not be empty");
Preconditions.checkArgument(conf.delimiter != null, "delimiter can not be null");
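// Assemble the ingestion client from the validated conf: a direct CRL
// address, explicit worker addresses, or a gateway stream URL.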
ClientBuilder builder = Client.getBuilder();
if (!StringUtils.isEmpty(conf.crlHost) && conf.crlPort != 0) {
builder.crlAddr(conf.crlHost, conf.crlPort);
}
if (conf.instanceId != null) {
builder.instanceId(conf.getInstanceId());
}
if (conf.workspace != null) {
builder.workspace(conf.getWorkspace());
}
if (!CollectionUtils.isEmpty(conf.getWorkerAddrs())) {
for (String addr : conf.getWorkerAddrs()) {
String[] str = addr.split(Pattern.quote(":"));
if (str.length == 2) {
builder.workerAddr(str[0], Integer.parseInt(str[1]));
} else if (str.length == 1) {
builder.workerAddr(str[0], null);
} else {
throw new IllegalArgumentException("invalid worker addr: " + addr);
}
}
}
if (!StringUtils.isEmpty(conf.getStreamUrl())) {
builder.streamUrl(conf.getStreamUrl());
}
Properties properties = new Properties();
if (conf.username != null && conf.password != null) {
properties.put("username", conf.username);
properties.put("password", conf.password);
}
if (conf.getProperties() != null) {
properties.putAll(conf.getProperties());
}
builder.properties(properties);
client = builder.build();
}
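/**
 * Parses command-line arguments: loads the JSON conf file given by -c/--conf,
 * then lets the remaining options (host, port, url, workspace, schemaName,
 * tableName, username, password) override the corresponding conf values.
 *
 * @throws IOException if the arguments cannot be parsed or the conf file is not set
 */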
public static WrapConf parameterParser(String[] args) throws IOException {
Options options = new Options();
options.addOption("h", "help", false, "Print help.");
options.addOption(Option.builder("c").longOpt("conf").type(String.class)
.hasArg(true).required().desc("Path to the user-defined JSON conf file.")
.build());
options.addOption(Option.builder("s").longOpt("host").type(String.class)
.hasArg(true).desc("The hostname/ip of the ingestion server")
.build());
options.addOption(Option.builder("p").longOpt("port").type(Integer.class)
.hasArg(true).desc("The tcp listening port of ingestion server.")
.build());
options.addOption(Option.builder("u").longOpt("url").type(String.class)
.hasArg(true).desc("The stream url for gateway conf.")
.build());
options.addOption(Option.builder("w").longOpt("workspace").type(String.class)
.hasArg(true).desc("If your workspace is passed from script,you can specify it here.")
.build());
options.addOption(Option.builder("sn").longOpt("schemaName").type(String.class)
.hasArg(true).desc("If your schemaName is passed from script,you can specify it here.")
.build());
options.addOption(Option.builder("tn").longOpt("tableName").type(String.class)
.hasArg(true).desc("If your tableName is passed from script,you can specify it here.")
.build());
options.addOption(Option.builder("un").longOpt("username").type(String.class)
.hasArg(true).desc("User name for authentication.")
.build());
options.addOption(Option.builder("pw").longOpt("password").type(String.class)
.hasArg(true).desc("Password for authentication.")
.build());
HelpFormatter hf = new HelpFormatter();
hf.setWidth(110);
CommandLineParser parser = new DefaultParser();
try {
CommandLine commandLine = parser.parse(options, args);
if (commandLine.hasOption('h')) {
hf.printHelp("WrapExample", options, true);
System.exit(1);
}
boolean exist = commandLine.hasOption("c");
if (exist) {
String confPath = commandLine.getOptionValue("c");
WrapConf conf = new JsonParser().parserWrapConf(confPath);
conf.configurePath = confPath;
if (commandLine.hasOption("s")) {
conf.crlHost = commandLine.getOptionValue("s");
}
if (commandLine.hasOption("p")) {
conf.crlPort = Integer.parseInt(commandLine.getOptionValue("p"));
}
if (commandLine.hasOption("u")) {
conf.streamUrl = commandLine.getOptionValue("u");
}
if (commandLine.hasOption("w")) {
conf.workspace = commandLine.getOptionValue("w");
}
if (commandLine.hasOption("sn")) {
conf.schemaName = commandLine.getOptionValue("sn");
}
if (commandLine.hasOption("tn")) {
conf.tableName = commandLine.getOptionValue("tn");
}
if (commandLine.hasOption("un")) {
conf.username = commandLine.getOptionValue("un");
}
if (commandLine.hasOption("pw")) {
conf.password = commandLine.getOptionValue("pw");
}
return conf;
}
// Not reached if parsing succeeded and -c was supplied.
throw new ParseException("The conf file failed to parse or was not set.");
} catch (ParseException e) {
hf.printHelp("WrapExample", options, true);
throw new IOException(e);
}
}
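/**
 * Checks that every type name in conf.fieldMap maps to a supported {@link Type}
 * (fields listed in conf.skipCheckFields are exempt), then normalizes fieldMap
 * values and nullTypeString keys to lower case for the ingest loop.
 */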
private static void fieldMapAndSchemaCheck(Table table, WrapConf conf) {
// do schema & field check.
// TODO change schema to arrow schema.
for (Map.Entry<String, String> entry : conf.fieldMap.entrySet()) {
if (conf.skipCheckFields != null && conf.skipCheckFields.contains(entry.getKey())) {
continue;
}
Type type = null;
switch (entry.getValue().toLowerCase()) {
case "boolean":
case "bool":
type = Type.BOOL;
break;
case "int8":
type = Type.INT8;
break;
case "int16":
type = Type.INT16;
break;
case "int":
case "int32":
case "integer":
type = Type.INT32;
break;
case "bigint":
case "int64":
type = Type.INT64;
break;
case "float":
type = Type.FLOAT;
break;
case "double":
type = Type.DOUBLE;
break;
case "json":
case "string":
type = Type.STRING;
break;
case "varchar":
case "char":
type = Type.VARCHAR;
break;
case "decimal":
type = Type.DECIMAL;
break;
case "date":
type = Type.DATE;
break;
case "binary":
type = Type.BINARY;
break;
case "datetime":
type = Type.UNIXTIME_MICROS;
break;
default:
throw new UnsupportedOperationException("Unsupported type: " + entry.getValue());
}
// TODO hack for kudu & arrow table; restore the schema check below once resolved:
// Preconditions.checkArgument(table.getSchema().hasColumn(entry.getKey()),
//     "User field [%s] is not found in schema.", entry.getKey());
// Preconditions.checkArgument(table.getSchema().getColumn(entry.getKey()).getType() == type,
//     "Field [%s] has different type with server schema: [%s] vs [%s].",
//     table.getSchema().getColumn(entry.getKey()).getName(),
//     table.getSchema().getColumn(entry.getKey()).getType().getName(),
//     entry.getValue());
}
{
// Normalize user-declared field type names to lower case for the conversions below.
LinkedHashMap<String, String> resetMap = new LinkedHashMap<>();
for (Map.Entry<String, String> entry : conf.fieldMap.entrySet()) {
resetMap.put(entry.getKey(), entry.getValue().toLowerCase());
}
conf.fieldMap = resetMap;
}
{
if (conf.getNullTypeString() != null) {
// Normalize null-marker map keys (type names) to lower case as well.
LinkedHashMap<String, List<String>> resetMap = new LinkedHashMap<>();
for (Map.Entry<String, List<String>> entry : conf.getNullTypeString().entrySet()) {
resetMap.put(entry.getKey().toLowerCase(), entry.getValue());
}
conf.nullTypeString = resetMap;
}
}
}
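/**
 * Entry point. A typical invocation looks like the following (host, port,
 * credentials, and the conf file name are illustrative only):
 *
 *   java com.clickzetta.platform.tools.IngestData -c ingest.json -s 127.0.0.1 -p 9000 -un user -pw pass
 */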
public static void main(String[] args) throws Exception {
IngestData igsData = new IngestData(args);
igsData.doIngest();
}
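/**
 * Runs the ingestion: opens a bulk-load writer or a realtime stream, reads each
 * input file line by line, splits lines on conf.delimiter, converts fields
 * according to conf.fieldMap, and writes the resulting rows. Bulk-load commits
 * are asynchronous and polled for completion; realtime streams are flushed and
 * closed at the end.
 */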
public void doIngest() throws Exception {
com.clickzetta.platform.client.api.Options options = null;
Stream stream = null;
com.clickzetta.platform.client.api.BulkLoadOptions bulkLoadOptions = null;
com.clickzetta.platform.client.api.BulkLoadCommitOptions bulkLoadCommitOptions = null;
BulkLoadStream bulkLoadStream = null;
BulkLoadWriter bulkLoadWriter = null;
if (isBulkLoadMode()) {
bulkLoadOptions = conf.bulkLoadOptions.toCZBulkLoadOptions();
bulkLoadCommitOptions = conf.bulkLoadCommitOptions.toCZBulkLoadCommitOptions();
bulkLoadStream = client.createBulkLoadStream(conf.schemaName, conf.tableName, bulkLoadOptions);
bulkLoadWriter = bulkLoadStream.openWriter(0);
// fieldMapAndSchemaCheck(bulkLoadStream.getTable(), conf);
} else {
if (conf.throwOnFailure) {
conf.options.setErrorTypeHandler(ErrorTypeHandler.TERMINATE_INSTANCE);
} else {
conf.options.setErrorTypeHandler(new ErrorTypeHandler() {
@Override
public void onSuccess(Message message) {
LOG.info("mutate data with batch id {} cost {} ms successfully.", message.getBatchId(),
System.currentTimeMillis() - message.getTimestamp());
}
@Override
public void onFailure(Message message, Throwable e) {
LOG.error(String.format("mutate batch %s failed.", message.getBatchId()), e);
}
@Override
public boolean getTerminateIfFailure() {
return false;
}
});
}
options = conf.options.toCZSessionOptions();
stream = client.createStream(conf.schemaName, conf.tableName, conf.getTabletNum(), options);
fieldMapAndSchemaCheck(stream.getTable(), conf);
}
long readLineNanoTime = 0;
long readFileNanoTime = 0;
long totalWriteNum = 0;
List<String> columnNameList = new ArrayList<>(conf.getFieldMap().keySet());
List<String> fieldTypeList = new ArrayList<>(conf.getFieldMap().values());
// read line string from target file.
Path configureFilePath = Paths.get(conf.configurePath).getParent();
long startTime = System.nanoTime();
for (String filePath : conf.filePath) {
long readFileStartTime = System.nanoTime();
Path path = Paths.get(filePath);
if (!path.isAbsolute()) {
path = configureFilePath.resolve(path);
}
path = path.normalize();
LOG.info("start to feed data with target file {}", path.toString());
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path.toString()), "UTF-8"))) {
String line = null;
readFileNaNoTime += System.nanoTime() - readFileStartTime;
while ((line = reader.readLine()) != null) {
Long readLineStartTime = System.nanoTime();
String[] str = line.split(Pattern.quote(conf.delimiter));
if (!conf.isAutoCompletion && str.length < conf.fieldMap.size()) {
if (conf.skipErrorData) {
continue;
} else {
throw new IOException("Error Data Format: \n" + line);
}
}
// With auto-completion enabled, pad missing trailing fields with the literal
// "NULL", which the null-marker check below turns into SQL NULL.
if (conf.isAutoCompletion && str.length < conf.fieldMap.size()) {
String[] fullStr = new String[conf.fieldMap.size()];
System.arraycopy(str, 0, fullStr, 0, str.length);
Arrays.fill(fullStr, str.length, fullStr.length, "NULL");
str = fullStr;
}
Row op = null;
if (isBulkLoadMode()) {
op = bulkLoadWriter.createRow();
} else {
switch (conf.operator) {
case "insert":
op = stream.createInsertRow();
break;
case "update":
op = stream.createUpdateRow();
break;
case "upsert":
op = stream.createUpsertRow();
break;
case "delete":
op = stream.createDeleteRow();
break;
case "delete_ignore":
op = stream.createDeleteIgnoreRow();
break;
default:
throw new UnsupportedOperationException(String.format("not support operator with [%s] . " +
"use insert|update|upsert|delete instead.", conf.operator));
}
}
readLineNanoTime += System.nanoTime() - readLineStartTime;
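// Convert each delimited field to its declared type; values matching a
// configured null marker (or the default "NULL") become SQL NULLs.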
for (int i = 0; i < columnNameList.size(); i++) {
Object value = null;
boolean defineNullValue = false;
if (conf.getNullTypeString() != null) {
List<String> nullTypeString = conf.getNullTypeString().get(fieldTypeList.get(i));
if (nullTypeString != null && nullTypeString.contains(str[i])) {
defineNullValue = true;
}
}
if (defaultNullValueString.contains(str[i])) {
defineNullValue = true;
}
switch (fieldTypeList.get(i).toLowerCase()) {
case "boolean":
case "bool":
value = defineNullValue ? null : Boolean.parseBoolean(str[i]);
break;
case "int8":
value = defineNullValue ? null : Byte.valueOf(str[i]);
break;
case "int16":
value = defineNullValue ? null : Short.valueOf(str[i]);
break;
case "int":
case "int32":
case "integer":
value = defineNullValue ? null : Integer.parseInt(str[i]);
break;
case "bigint":
case "int64":
value = defineNullValue ? null : Long.parseLong(str[i]);
break;
case "float":
value = defineNullValue ? null : Float.parseFloat(str[i]);
break;
case "double":
value = defineNullValue ? null : Double.parseDouble(str[i]);
break;
case "string":
case "varchar":
case "char":
case "json":
value = defineNullValue ? null : str[i];
break;
case "decimal":
value = defineNullValue ? null : new BigDecimal(str[i]);
break;
case "date":
value = defineNullValue ? null : java.sql.Date.valueOf(str[i]);
break;
case "binary":
value = defineNullValue ? null : str[i].getBytes(StandardCharsets.UTF_8);
break;
case "datetime":
if (!defineNullValue) {
value = Timestamp.valueOf(str[i]);
}
break;
default:
throw new UnsupportedOperationException("Unsupported type: " + fieldTypeList.get(i));
}
op.setValue(columnNameList.get(i), value);
}
if (isBulkLoadMode()) {
bulkLoadWriter.write(op);
} else {
stream.apply(op);
}
totalWriteNum++;
}
}
}
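// Bulk-load commits are asynchronous: submit the commit, then poll the
// stream state until it reports success or failure (or the poll times out).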
if (isBulkLoadMode()) {
bulkLoadWriter.close();
bulkLoadStream.commit(bulkLoadCommitOptions);
// Wait at most 10 minutes (600 polls at 1 s intervals) for the commit, sized for regression tests.
BulkLoadState state = BulkLoadState.COMMIT_SUBMITTED;
for (int time = 0; time < 600; time++) {
state = bulkLoadStream.getStreamState();
LOG.info("Get BulkLoadStream {} state {} time {}", bulkLoadStream.getStreamId(), state, time);
if (state == BulkLoadState.COMMIT_SUCCESS || state == BulkLoadState.COMMIT_FAILED) {
break;
} else {
try {
Thread.sleep(1000);
} catch (InterruptedException ite) {
throw new IOException(ite);
}
}
}
if (state != BulkLoadState.COMMIT_SUCCESS) {
throw new IOException("BulkLoadStream " + bulkLoadStream.getStreamId() +
" sync commit failed or timeout with state " + state);
}
bulkLoadStream.close();
} else {
stream.flush();
stream.close();
}
// stream is null in bulk load mode, so only release realtime stream resources.
if (conf.isReleaseResource() && stream != null) {
LOG.info("client call release resource with target stream");
client.releaseResource(stream);
}
client.close();
long totalNanoTime = System.nanoTime() - startTime - readLineNanoTime - readFileNanoTime;
LOG.info("finished feeding data: {} rows, write cost: {} ms", totalWriteNum, totalNanoTime / 1000 / 1000);
LOG.info("data tps: {} r/s", totalWriteNum / (totalNanoTime / 1000.0 / 1000.0 / 1000.0));
LOG.info("Process runtime: {} s", (System.nanoTime() - startTime) / 1000.0 / 1000.0 / 1000.0);
Thread.sleep(1000);
LOG.info("finish success");
if (conf.forceExit) {
System.exit(0);
}
}
public boolean isBulkLoadMode() {
return isBulkLoadMode;
}
public Client getClient() throws IOException {
return client;
}
public WrapConf getConf() {
return conf;
}
}