package com.clickzetta.platform.tools;

import com.clickzetta.platform.client.Table;
import com.clickzetta.platform.client.api.BulkLoadState;
import com.clickzetta.platform.client.api.BulkLoadStream;
import com.clickzetta.platform.client.api.BulkLoadWriter;
import com.clickzetta.platform.client.api.Client;
import com.clickzetta.platform.client.api.ClientBuilder;
import com.clickzetta.platform.client.api.ErrorTypeHandler;
import com.clickzetta.platform.client.api.Message;
import com.clickzetta.platform.client.api.Row;
import com.clickzetta.platform.client.api.Stream;
import com.clickzetta.platform.common.Type;
import com.clickzetta.platform.util.JsonParser;
import com.google.common.base.Preconditions;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

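/**
 * Command-line tool that ingests delimited text files into a ClickZetta table,
 * either through a realtime mutation {@link Stream} or, in bulk-load mode,
 * through a {@link BulkLoadStream}.
 *
 * <p>Illustrative invocation (jar name and paths are placeholders; flag
 * meanings are defined in {@link #parameterParser}):
 *
 * <pre>
 *   java -cp clickzetta-platform.jar com.clickzetta.platform.tools.IngestData \
 *       -c ./ingest_conf.json -sn my_schema -tn my_table -un user -pw pass
 * </pre>
 */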
public class IngestData {

  private static final Logger LOG = LoggerFactory.getLogger(IngestData.class);

  private static final List<String> defaultNullValueString = new ArrayList<String>() {
    {
      add("NULL");
    }
  };
  private final WrapConf conf;

  private final Client client;
  private final boolean isBulkLoadMode;

  public IngestData(String[] args) throws IOException {
    conf = parameterParser(args);
    isBulkLoadMode = conf.bulkLoadMode;
    Preconditions.checkArgument(!StringUtils.isEmpty(conf.crlHost) || !StringUtils.isEmpty(conf.streamUrl),
        "either crlHost or streamUrl must be provided");
    if (!StringUtils.isEmpty(conf.crlHost)) {
      Preconditions.checkArgument(conf.crlPort > 0, "crlPort must be greater than 0");
    }
    Preconditions.checkArgument(!StringUtils.isEmpty(conf.schemaName), "schemaName can not be empty");
    Preconditions.checkArgument(!StringUtils.isEmpty(conf.tableName), "tableName can not be empty");
    if (!isBulkLoadMode) {
      Preconditions.checkArgument(!StringUtils.isEmpty(conf.operator), "operator can not be empty");
    }
    Preconditions.checkArgument(!CollectionUtils.isEmpty(conf.filePath), "filePath can not be empty");
    Preconditions.checkArgument(!MapUtils.isEmpty(conf.fieldMap), "fieldMap can not be empty");
    Preconditions.checkArgument(conf.delimiter != null, "delimiter can not be null");

    ClientBuilder builder = Client.getBuilder();
    if (!StringUtils.isEmpty(conf.crlHost) && conf.crlPort != 0) {
      builder.crlAddr(conf.crlHost, conf.crlPort);
    }
    if (conf.instanceId != null) {
      builder.instanceId(conf.getInstanceId());
    }
    if (conf.workspace != null) {
      builder.workspace(conf.getWorkspace());
    }
    if (!CollectionUtils.isEmpty(conf.getWorkerAddrs())) {
      for (String addr : conf.getWorkerAddrs()) {
        String[] str = addr.split(Pattern.quote(":"));
        if (str.length == 2) {
          builder.workerAddr(str[0], Integer.parseInt(str[1]));
        } else if (str.length == 1) {
          builder.workerAddr(str[0], null);
        } else {
          throw new IllegalArgumentException("invalid worker addr: " + addr);
        }
      }
    }
    if (!StringUtils.isEmpty(conf.getStreamUrl())) {
      builder.streamUrl(conf.getStreamUrl());
    }
    Properties properties = new Properties();
    if (conf.username != null && conf.password != null) {
      properties.put("username", conf.username);
      properties.put("password", conf.password);
    }
    if (conf.getProperties() != null) {
      properties.putAll(conf.getProperties());
    }
    builder.properties(properties);
    client = builder.build();
  }

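  /**
   * Builds a {@link WrapConf} from the command line. The required -c/--conf
   * flag points at a JSON conf file parsed by {@link JsonParser}; every other
   * flag, when present, overrides the corresponding value loaded from that
   * file.
   */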
  public static WrapConf parameterParser(String[] args) throws IOException {
    Options options = new Options();
    options.addOption("h", "help", false, "Print help.");
    options.addOption(Option.builder("c").longOpt("conf").type(String.class)
        .hasArg(true).required().desc("The json conf user define.")
        .build());

    options.addOption(Option.builder("s").longOpt("host").type(String.class)
        .hasArg(true).desc("The hostname/ip of the ingestion server")
        .build());

    options.addOption(Option.builder("p").longOpt("port").type(Integer.class)
        .hasArg(true).desc("The tcp listening port of ingestion server.")
        .build());

    options.addOption(Option.builder("u").longOpt("url").type(String.class)
        .hasArg(true).desc("The stream url for gateway conf.")
        .build());

    options.addOption(Option.builder("w").longOpt("workspace").type(String.class)
        .hasArg(true).desc("If your workspace is passed from script,you can specify it here.")
        .build());

    options.addOption(Option.builder("sn").longOpt("schemaName").type(String.class)
        .hasArg(true).desc("If your schemaName is passed from script,you can specify it here.")
        .build());

    options.addOption(Option.builder("tn").longOpt("tableName").type(String.class)
        .hasArg(true).desc("If your tableName is passed from script,you can specify it here.")
        .build());

    options.addOption(Option.builder("un").longOpt("username").type(String.class)
        .hasArg(true).desc("User name for authentication.")
        .build());

    options.addOption(Option.builder("pw").longOpt("password").type(String.class)
        .hasArg(true).desc("Password for authentication.")
        .build());

    HelpFormatter hf = new HelpFormatter();
    hf.setWidth(110);
    CommandLineParser parser = new DefaultParser();
    try {
      CommandLine commandLine = parser.parse(options, args);
      if (commandLine.hasOption('h')) {
        hf.printHelp("WrapExample", options, true);
        System.exit(1);
      }

      boolean exist = commandLine.hasOption("c");
      if (exist) {
        String confPath = commandLine.getOptionValue("c");
        WrapConf conf = new JsonParser().parserWrapConf(confPath);
        conf.configurePath = confPath;

        if (commandLine.hasOption("s")) {
          conf.crlHost = commandLine.getOptionValue("s");
        }
        if (commandLine.hasOption("p")) {
          conf.crlPort = Integer.parseInt(commandLine.getOptionValue("p"));
        }
        if (commandLine.hasOption("u")) {
          conf.streamUrl = commandLine.getOptionValue("u");
        }
        if (commandLine.hasOption("w")) {
          conf.workspace = commandLine.getOptionValue("w");
        }
        if (commandLine.hasOption("sn")) {
          conf.schemaName = commandLine.getOptionValue("sn");
        }
        if (commandLine.hasOption("tn")) {
          conf.tableName = commandLine.getOptionValue("tn");
        }
        if (commandLine.hasOption("un")) {
          conf.username = commandLine.getOptionValue("un");
        }
        if (commandLine.hasOption("pw")) {
          conf.password = commandLine.getOptionValue("pw");
        }
        return conf;
      }
      // Unreachable when parsing succeeds: -c is a required option.
      throw new ParseException("The conf file failed to parse or was not set.");
    } catch (ParseException e) {
      hf.printHelp("WrapExample", options, true);
      throw new IOException(e);
    }
  }
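  // A minimal sketch of the conf JSON read above, assuming JsonParser maps keys
  // one-to-one onto the WrapConf fields used in this class (the authoritative
  // layout lives in JsonParser.parserWrapConf, not shown here):
  //
  // {
  //   "crlHost": "ingest.example.com",
  //   "crlPort": 9000,
  //   "schemaName": "my_schema",
  //   "tableName": "my_table",
  //   "operator": "insert",
  //   "delimiter": ",",
  //   "filePath": ["data/part-0.csv"],
  //   "fieldMap": {"id": "int64", "name": "string", "ts": "datetime"}
  // }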

  private static void fieldMapAndSchemaCheck(Table table, WrapConf conf) {
    // do schema & field check.
    // TODO change schema to arrow schema.
    for (Map.Entry<String, String> entry : conf.fieldMap.entrySet()) {
      if (conf.skipCheckFields != null && conf.skipCheckFields.contains(entry.getKey())) {
        continue;
      }
      Type type = null;
      switch (entry.getValue().toLowerCase()) {
        case "boolean":
        case "bool":
          type = Type.BOOL;
          break;
        case "int8":
          type = Type.INT8;
          break;
        case "int16":
          type = Type.INT16;
          break;
        case "int":
        case "int32":
        case "integer":
          type = Type.INT32;
          break;
        case "bigint":
        case "int64":
          type = Type.INT64;
          break;
        case "float":
          type = Type.FLOAT;
          break;
        case "double":
          type = Type.DOUBLE;
          break;
        case "json":
        case "string":
          type = Type.STRING;
          break;
        case "varchar":
        case "char":
          type = Type.VARCHAR;
          break;
        case "decimal":
          type = Type.DECIMAL;
          break;
        case "date":
          type = Type.DATE;
          break;
        case "binary":
          type = Type.BINARY;
          break;
        case "datetime":
          type = Type.UNIXTIME_MICROS;
          break;
        default:
          throw new UnsupportedOperationException("Unsupported type: " + entry.getValue());
      }
      // TODO hack for kudu & arrow table. remove after
      // Preconditions.checkArgument(table.getSchema().hasColumn(entry.getKey()),
      //     "User field [%s] is not found in schema.", entry.getKey());
      // Preconditions.checkArgument(table.getSchema().getColumn(entry.getKey()).getType() == type,
      //     "Field [%s] has different type with server schema: [%s] vs [%s].",
      //     table.getSchema().getColumn(entry.getKey()).getName(),
      //     table.getSchema().getColumn(entry.getKey()).getType().getName(),
      //     entry.getValue());
    }
    {
      LinkedHashMap<String, String> resetMap = new LinkedHashMap<>();
      for (Map.Entry<String, String> entry : conf.fieldMap.entrySet()) {
        resetMap.put(entry.getKey(), entry.getValue().toLowerCase());
      }
      conf.fieldMap = resetMap;
    }
    {
      if (conf.getNullTypeString() != null) {
        LinkedHashMap<String, List<String>> resetMap = new LinkedHashMap<>();
        for (Map.Entry<String, List<String>> entry : conf.getNullTypeString().entrySet()) {
          resetMap.put(entry.getKey().toLowerCase(), entry.getValue());
        }
        conf.nullTypeString = resetMap;
      }
    }
  }

  public static void main(String[] args) throws Exception {
    IngestData igsData = new IngestData(args);
    igsData.doIngest();
  }


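  /**
   * Reads every configured file line by line, splits each line on the
   * configured delimiter, converts the fields according to fieldMap, and
   * writes the resulting rows. In bulk-load mode rows go through a
   * {@link BulkLoadWriter} and are committed once at the end; otherwise each
   * row is applied to a realtime {@link Stream} using the configured operator
   * (insert|update|upsert|delete|delete_ignore).
   */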
  public void doIngest() throws Exception {
    com.clickzetta.platform.client.api.Options options = null;
    Stream stream = null;

    com.clickzetta.platform.client.api.BulkLoadOptions bulkLoadOptions = null;
    com.clickzetta.platform.client.api.BulkLoadCommitOptions bulkLoadCommitOptions = null;
    BulkLoadStream bulkLoadStream = null;
    BulkLoadWriter bulkLoadWriter = null;

    if (isBulkLoadMode()) {
      bulkLoadOptions = conf.bulkLoadOptions.toCZBulkLoadOptions();
      bulkLoadCommitOptions = conf.bulkLoadCommitOptions.toCZBulkLoadCommitOptions();
      bulkLoadStream = client.createBulkLoadStream(conf.schemaName, conf.tableName, bulkLoadOptions);
      bulkLoadWriter = bulkLoadStream.openWriter(0);
      // fieldMapAndSchemaCheck(bulkLoadStream.getTable(), conf);
    } else {
      if (conf.throwOnFailure) {
        conf.options.setErrorTypeHandler(ErrorTypeHandler.TERMINATE_INSTANCE);
      } else {
        conf.options.setErrorTypeHandler(new ErrorTypeHandler() {
          @Override
          public void onSuccess(Message message) {
            LOG.info("mutate data with batch id {} cost {} ms successfully.", message.getBatchId(),
                System.currentTimeMillis() - message.getTimestamp());
          }

          @Override
          public void onFailure(Message message, Throwable e) {
            LOG.error(String.format("mutate data with batch id %s failed.", message.getBatchId()), e);
          }

          @Override
          public boolean getTerminateIfFailure() {
            return false;
          }
        });
      }
      options = conf.options.toCZSessionOptions();
      stream = client.createStream(conf.schemaName, conf.tableName, conf.getTabletNum(), options);

      fieldMapAndSchemaCheck(stream.getTable(), conf);
    }

    long readLineNanoTime = 0;
    long readFileNanoTime = 0;
    long totalWriteNum = 0;
    List<String> columnNameList = new ArrayList<>(conf.getFieldMap().keySet());
    List<String> fieldTypeList = new ArrayList<>(conf.getFieldMap().values());

    // read line string from target file.
    Path configureFilePath = Paths.get(conf.configurePath).getParent();
    long startTime = System.nanoTime();
    for (String filePath : conf.filePath) {
      long readFileStartTime = System.nanoTime();
      Path path = Paths.get(filePath);
      if (!path.isAbsolute()) {
        path = configureFilePath.resolve(path);
      }
      path = path.normalize();
      LOG.info("start to feed data with target file {}", path.toString());
      try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path.toString()), StandardCharsets.UTF_8))) {
        String line = null;
        readFileNanoTime += System.nanoTime() - readFileStartTime;
        while ((line = reader.readLine()) != null) {
          long readLineStartTime = System.nanoTime();
          String[] str = line.split(Pattern.quote(conf.delimiter));
          if (!conf.isAutoCompletion && str.length < conf.fieldMap.size()) {
            if (conf.skipErrorData) {
              continue;
            } else {
              throw new IOException("Error Data Format: \n" + line);
            }
          }
          if (conf.isAutoCompletion && str.length < conf.fieldMap.size()) {
            String[] fullStr = new String[conf.fieldMap.size()];
            System.arraycopy(str, 0, fullStr, 0, str.length);
            Arrays.fill(fullStr, str.length, fullStr.length, "NULL");
            str = fullStr;
          }
          Row op = null;
          if (isBulkLoadMode()) {
            op = bulkLoadWriter.createRow();
          } else {
            switch (conf.operator) {
              case "insert":
                op = stream.createInsertRow();
                break;
              case "update":
                op = stream.createUpdateRow();
                break;
              case "upsert":
                op = stream.createUpsertRow();
                break;
              case "delete":
                op = stream.createDeleteRow();
                break;
              case "delete_ignore":
                op = stream.createDeleteIgnoreRow();
                break;
              default:
                throw new UnsupportedOperationException(String.format("not support operator with [%s] . " +
                    "use insert|update|upsert|delete instead.", conf.operator));
            }
          }
          readLineNanoTime += System.nanoTime() - readLineStartTime;
          for (int i = 0; i < columnNameList.size(); i++) {
            Object value = null;
            boolean defineNullValue = false;
            if (conf.getNullTypeString() != null) {
              List<String> nullTypeString = conf.getNullTypeString().get(fieldTypeList.get(i));
              if (nullTypeString != null && nullTypeString.contains(str[i])) {
                defineNullValue = true;
              }
            }
            if (defaultNullValueString.contains(str[i])) {
              defineNullValue = true;
            }
            switch (fieldTypeList.get(i).toLowerCase()) {
              case "boolean":
              case "bool":
                value = defineNullValue ? null : Boolean.parseBoolean(str[i]);
                break;
              case "int8":
                value = defineNullValue ? null : Byte.valueOf(str[i]);
                break;
              case "int16":
                value = defineNullValue ? null : Short.valueOf(str[i]);
                break;
              case "int":
              case "int32":
              case "integer":
                value = defineNullValue ? null : Integer.parseInt(str[i]);
                break;
              case "bigint":
              case "int64":
                value = defineNullValue ? null : Long.parseLong(str[i]);
                break;
              case "float":
                value = defineNullValue ? null : Float.parseFloat(str[i]);
                break;
              case "double":
                value = defineNullValue ? null : Double.parseDouble(str[i]);
                break;
              case "string":
              case "varchar":
              case "char":
              case "json":
                value = defineNullValue ? null : str[i];
                break;
              case "decimal":
                value = defineNullValue ? null : new BigDecimal(str[i]);
                break;
              case "date":
                value = defineNullValue ? null : java.sql.Date.valueOf(str[i]);
                break;
              case "binary":
                value = defineNullValue ? null : str[i].getBytes(StandardCharsets.UTF_8);
                break;
              case "datetime":
                if (!defineNullValue) {
                  value = Timestamp.valueOf(str[i]);
                }
                break;
              default:
                throw new UnsupportedOperationException("Unsupported type: " + fieldTypeList.get(i));
            }
            op.setValue(columnNameList.get(i), value);
          }
          if (isBulkLoadMode()) {
            bulkLoadWriter.write(op);
          } else {
            stream.apply(op);
          }
          totalWriteNum++;
        }
      }
    }
    if (isBulkLoadMode()) {
      bulkLoadWriter.close();
      bulkLoadStream.commit(bulkLoadCommitOptions);
      // wait for at most 10 min for regression test.
      BulkLoadState state = BulkLoadState.COMMIT_SUBMITTED;
      for (int time = 0; time < 600; time++) {
        state = bulkLoadStream.getStreamState();
        LOG.info("Get BulkLoadStream {} state {} time {}", bulkLoadStream.getStreamId(), state, time);
        if (state == BulkLoadState.COMMIT_SUCCESS || state == BulkLoadState.COMMIT_FAILED) {
          break;
        } else {
          try {
            Thread.sleep(1000);
          } catch (InterruptedException ite) {
            throw new IOException(ite);
          }
        }
      }
      if (state != BulkLoadState.COMMIT_SUCCESS) {
        throw new IOException("BulkLoadStream " + bulkLoadStream.getStreamId() +
            " sync commit failed or timeout with state " + state);
      }
      bulkLoadStream.close();
    } else {
      stream.flush();
      stream.close();
    }
    // stream is null in bulk-load mode; only release the realtime stream.
    if (conf.isReleaseResource() && stream != null) {
      LOG.info("client call release resource with target stream");
      client.releaseResource(stream);
    }
    client.close();
    long totalNanoTime = System.nanoTime() - startTime - readLineNanoTime - readFileNanoTime;
    LOG.info("end to feed data size: {} cost: {} ms", totalWriteNum, totalNanoTime / 1000 / 1000);
    LOG.info("data tps: {} r/s", totalWriteNum / (totalNanoTime / 1000.0 / 1000.0 / 1000.0));
    LOG.info("Process runtime: {} s", (System.nanoTime() - startTime) / 1000.0 / 1000.0 / 1000.0);
    Thread.sleep(1000);
    LOG.info("finish success");
    if (conf.forceExit) {
      System.exit(0);
    }
  }

  public boolean isBulkLoadMode() {
    return isBulkLoadMode;
  }

  public Client getClient() throws IOException {
    return client;
  }

  public WrapConf getConf() {
    return conf;
  }
}



