All Downloads are FREE. Search and download functionalities are using the official Maven repository.

co.cask.wrangler.executor.TextDirectives Maven / Gradle / Ivy

There is a newer version: 3.2.2
Show newest version
/*
 * Copyright © 2016-2017 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.wrangler.executor;

import co.cask.wrangler.api.DirectiveParseException;
import co.cask.wrangler.api.Directives;
import co.cask.wrangler.api.Step;
import co.cask.wrangler.steps.IncrementTransientVariable;
import co.cask.wrangler.steps.SetTransientVariable;
import co.cask.wrangler.steps.column.ChangeColCaseNames;
import co.cask.wrangler.steps.column.CleanseColumnNames;
import co.cask.wrangler.steps.column.Columns;
import co.cask.wrangler.steps.column.ColumnsReplace;
import co.cask.wrangler.steps.column.Copy;
import co.cask.wrangler.steps.column.Drop;
import co.cask.wrangler.steps.column.Keep;
import co.cask.wrangler.steps.column.Merge;
import co.cask.wrangler.steps.column.Rename;
import co.cask.wrangler.steps.column.SplitToColumns;
import co.cask.wrangler.steps.column.Swap;
import co.cask.wrangler.steps.date.DiffDate;
import co.cask.wrangler.steps.date.FormatDate;
import co.cask.wrangler.steps.language.SetCharset;
import co.cask.wrangler.steps.nlp.Stemming;
import co.cask.wrangler.steps.parser.CsvParser;
import co.cask.wrangler.steps.parser.FixedLengthParser;
import co.cask.wrangler.steps.parser.HL7Parser;
import co.cask.wrangler.steps.parser.JsParser;
import co.cask.wrangler.steps.parser.JsPath;
import co.cask.wrangler.steps.parser.ParseAvro;
import co.cask.wrangler.steps.parser.ParseAvroFile;
import co.cask.wrangler.steps.parser.ParseDate;
import co.cask.wrangler.steps.parser.ParseExcel;
import co.cask.wrangler.steps.parser.ParseLog;
import co.cask.wrangler.steps.parser.ParseProtobuf;
import co.cask.wrangler.steps.parser.ParseSimpleDate;
import co.cask.wrangler.steps.parser.XmlParser;
import co.cask.wrangler.steps.parser.XmlToJson;
import co.cask.wrangler.steps.row.Fail;
import co.cask.wrangler.steps.row.Flatten;
import co.cask.wrangler.steps.row.RecordConditionFilter;
import co.cask.wrangler.steps.row.RecordMissingOrNullFilter;
import co.cask.wrangler.steps.row.RecordRegexFilter;
import co.cask.wrangler.steps.row.SendToError;
import co.cask.wrangler.steps.row.SetRecordDelimiter;
import co.cask.wrangler.steps.row.SplitToRows;

import co.cask.wrangler.steps.transformation.CatalogLookup;
import co.cask.wrangler.steps.transformation.CharacterCut;
import co.cask.wrangler.steps.transformation.Decode;
import co.cask.wrangler.steps.transformation.Encode;
import co.cask.wrangler.steps.transformation.Expression;
import co.cask.wrangler.steps.transformation.ExtractRegexGroups;
import co.cask.wrangler.steps.transformation.FillNullOrEmpty;
import co.cask.wrangler.steps.transformation.FindAndReplace;
import co.cask.wrangler.steps.transformation.GenerateUUID;
import co.cask.wrangler.steps.transformation.IndexSplit;
import co.cask.wrangler.steps.transformation.InvokeHttp;
import co.cask.wrangler.steps.transformation.Lower;
import co.cask.wrangler.steps.transformation.MaskNumber;
import co.cask.wrangler.steps.transformation.MaskShuffle;
import co.cask.wrangler.steps.transformation.MessageHash;
import co.cask.wrangler.steps.transformation.Quantization;
import co.cask.wrangler.steps.transformation.SetColumn;
import co.cask.wrangler.steps.transformation.Split;
import co.cask.wrangler.steps.transformation.SplitEmail;
import co.cask.wrangler.steps.transformation.SplitURL;
import co.cask.wrangler.steps.transformation.TableLookup;
import co.cask.wrangler.steps.transformation.TextDistanceMeasure;
import co.cask.wrangler.steps.transformation.TextMetricMeasure;
import co.cask.wrangler.steps.transformation.TitleCase;
import co.cask.wrangler.steps.transformation.Upper;
import co.cask.wrangler.steps.transformation.UrlEncode;
import co.cask.wrangler.steps.transformation.XPathArrayElement;
import co.cask.wrangler.steps.transformation.XPathElement;
import co.cask.wrangler.steps.transformation.Trim;
import co.cask.wrangler.steps.transformation.LeftTrim;
import co.cask.wrangler.steps.transformation.RightTrim;


import co.cask.wrangler.steps.writer.WriteAsCSV;
import co.cask.wrangler.steps.writer.WriteAsJsonMap;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;

/**
 * Parses the text DSL into the list of steps used for wrangling.
 *
 * Each directive handled by {@link TextDirectives} is one line of text: a command name
 * followed by its arguments. Blank lines and lines starting with "//" or "#" are skipped.
 */
public class TextDirectives implements Directives {
  // Logger scoped to this class.
  private static final Logger LOG = LoggerFactory.getLogger(TextDirectives.class);

  // Raw directive lines, one directive per element; parse() iterates over them in order.
  private String[] directives;

  // Usage registry, created once per TextDirectives instance.
  // NOTE(review): how it is consulted is not visible in this part of the file -- confirm.
  private final UsageRegistry usageRegistry = new UsageRegistry();

  /**
   * Creates an instance from directives that are already split, one directive per array element.
   *
   * The array reference is stored as-is; no copy is made.
   *
   * @param directives directive lines to be parsed.
   */
  public TextDirectives(String[] directives) {
    this.directives = directives;
  }

  /**
   * Creates an instance from a single block of text holding one directive per line.
   *
   * The text is split on the newline character and the resulting array is passed to
   * the array based constructor.
   *
   * @param directives newline separated directives.
   */
  public TextDirectives(String directives) {
    this(directives.split("\n"));
  }

  /**
   * Creates an instance from a list holding one directive per element.
   *
   * The element type is declared explicitly: with a raw {@code List}, {@code toArray(T[])} is erased
   * to return {@code Object[]}, which matches none of the constructors of this class.
   *
   * @param directives list of directive lines to be parsed.
   */
  public TextDirectives(List<String> directives) {
    this(directives.toArray(new String[directives.size()]));
  }

  /**
   * Parses the DSL to generate a sequence of steps to be executed by {@link co.cask.wrangler.api.Pipeline}.
   *
   * The transformation parsing here needs a better solution. It has many limitations, and a different approach would
   * allow us to provide much more advanced semantics for directives.
   *
   * @return List of steps to be executed.
   * @throws DirectiveParseException if a directive cannot be parsed.
   */
  private List parse() throws DirectiveParseException {
    List steps = new ArrayList<>();

    // Split directive by EOL
    int lineno = 1;

    // Iterate through each directive and create necessary stepRegistry.
    for (String directive : directives) {
      directive = directive.trim();
      if (directive.isEmpty() || directive.startsWith("//") || directive.startsWith("#")) {
        continue;
      }
      StringTokenizer tokenizer = new StringTokenizer(directive, " ");
      String command = tokenizer.nextToken();

      switch (command) {
        case "set": {
          switch (tokenizer.nextToken()) {
            // set format [csv|json]  
            case "format": {
              String format = getNextToken(tokenizer, "set format", "[csv|json]", lineno);
              if (format.equalsIgnoreCase("csv")) {
                String delimStr = getNextToken(tokenizer, "set format", "delimiter", lineno);
                char delimiter = delimStr.charAt(0);
                if (delimStr.startsWith("\\")) {
                  String unescapedStr = StringEscapeUtils.unescapeJava(delimStr);
                  if (unescapedStr == null) {
                    throw new IllegalArgumentException("Invalid delimiter for CSV Parser: " + delimStr);
                  }
                  delimiter = unescapedStr.charAt(0);
                }
                boolean ignoreEmptyLines =
                  getNextToken(tokenizer, "set format", "true|false", lineno).equalsIgnoreCase("true");
                CsvParser.Options opt = new CsvParser.Options(delimiter, ignoreEmptyLines);
                steps.add(new CsvParser(lineno, directive, opt, STARTING_COLUMN, false));
                steps.add(new Drop(lineno, directive, STARTING_COLUMN));
              } else {
                throw new DirectiveParseException(
                  String.format("Unknown format '%s' specified at line %d", format, lineno)
                );
              }
            }
            break;

            // set column  
            case "column": {
              String column = getNextToken(tokenizer, "set column", "column-name", lineno);
              String expr = getNextToken(tokenizer, "\n", "set column", "jexl-expression", lineno);
              steps.add(new Expression(lineno, directive, column, expr));
            }
            break;

            // set columns 
            case "columns": {
              String columns = getNextToken(tokenizer, "\n", "set columns", "name1, name2, ...", lineno);
              String cols[] = columns.split(",");
              steps.add(new Columns(lineno, directive, Arrays.asList(cols)));
            }
            break;
          }
        }
        break;

        // rename  
        case "rename": {
          String oldcol = getNextToken(tokenizer,  command, "old", lineno);
          String newcol = getNextToken(tokenizer, command, "new", lineno);
          steps.add(new Rename(lineno, directive, oldcol, newcol));
        }
        break;

        // drop [,]
        case "drop": {
          String colums = getNextToken(tokenizer, command, "column", lineno);
          steps.add(new Drop(lineno, directive, Arrays.asList(colums.split(","))));
        }
        break;

        // merge    
        case "merge": {
          String col1 = getNextToken(tokenizer, command, "first", lineno);
          String col2 = getNextToken(tokenizer, command, "second", lineno);
          String dest = getNextToken(tokenizer, command, "new-column", lineno);
          String delimiter = getNextToken(tokenizer, "\n", command, "delimiter", lineno);
          int start = delimiter.indexOf('\'');
          if (start != -1) {
            int end = delimiter.lastIndexOf('\'');
            if (end == -1 || start == end) {
              throw new DirectiveParseException(
                String.format("One of the quote is missing when specifying the delimiter.")
              );
            }
            delimiter = StringEscapeUtils.unescapeJava(delimiter.substring(start + 1, end));
          }
          steps.add(new Merge(lineno, directive, col1, col2, dest, delimiter));
        }
        break;

        // uppercase 
        case "uppercase": {
          String col = getNextToken(tokenizer, command, "col", lineno);
          steps.add(new Upper(lineno, directive, col));
        }
        break;

        // lowercase 
        case "lowercase": {
          String col = getNextToken(tokenizer, command, "col", lineno);
          steps.add(new Lower(lineno, directive, col));
        }
        break;

        // titlecase 
        case "titlecase": {
          String col = getNextToken(tokenizer, command, "col", lineno);
          steps.add(new TitleCase(lineno, directive, col));
        }
        break;

        // indexsplit    
        case "indexsplit": {
          String source = getNextToken(tokenizer, command, "source", lineno);
          String startStr = getNextToken(tokenizer, command, "start", lineno);
          String endStr = getNextToken(tokenizer, command, "end", lineno);
          int start = Integer.parseInt(startStr);
          int end = Integer.parseInt(endStr);
          String destination = getNextToken(tokenizer, command, "destination", lineno);
          steps.add(new IndexSplit(lineno, directive, source, start, end, destination));
        }
        break;

        // split    
        case "split": {
          String source = getNextToken(tokenizer, command, "source-column-name", lineno);
          String delimiter = getNextToken(tokenizer, command, "delimiter", lineno);
          String firstCol = getNextToken(tokenizer, command, "new-column-1", lineno);
          String secondCol = getNextToken(tokenizer, command, "new-column-2", lineno);
          steps.add(new Split(lineno, directive, source, delimiter, firstCol, secondCol));
        }
        break;

        // filter-row-if-matched  
        case "filter-row-if-matched": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String pattern = getNextToken(tokenizer, "\n", command, "regex", lineno);
          steps.add(new RecordRegexFilter(lineno, directive, column, pattern, true));
        }
        break;

        // filter-row-if-not-matched  
        case "filter-row-if-not-matched": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String pattern = getNextToken(tokenizer, "\n", command, "regex", lineno);
          steps.add(new RecordRegexFilter(lineno, directive, column, pattern, false));
        }
        break;

        // filter-row-if-true  
        case "filter-row-if-true": {
          String condition = getNextToken(tokenizer, "\n", command, "condition", lineno);
          steps.add(new RecordConditionFilter(lineno, directive, condition, true));
        }
        break;

        // filter-row-if-false  
        case "filter-row-if-false": {
          String condition = getNextToken(tokenizer, "\n", command, "condition", lineno);
          steps.add(new RecordConditionFilter(lineno, directive, condition, false));
        }
        break;

        // set-variable  
        case "set-variable": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String expression = getNextToken(tokenizer, "\n", command, "expression", lineno);
          steps.add(new SetTransientVariable(lineno, directive, column, expression));
        }
        break;

        // increment-variable   
        case "increment-variable": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String value = getNextToken(tokenizer, command, "value", lineno);
          String expression = getNextToken(tokenizer, "\n", command, "expression", lineno);
          steps.add(new IncrementTransientVariable(lineno, directive, column, value, expression));
        }
        break;

        // mask-number  
        case "mask-number": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String mask = getNextToken(tokenizer, command, "pattern", lineno);
          steps.add(new MaskNumber(lineno, directive, column, mask));
        }
        break;

        // mask-shuffle 
        case "mask-shuffle": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          steps.add(new MaskShuffle(lineno, directive, column));
        }
        break;

        // format-date  
        case "format-date": {
          String column = getNextToken(tokenizer, command, "column", 1);
          String format = getNextToken(tokenizer, "\n", command, "format", lineno);
          steps.add(new FormatDate(lineno, directive, column, format));
        }
        break;

        // format-unix-timestamp  
        case "format-unix-timestamp": {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String dstDatePattern = getNextToken(tokenizer, "\n", command, "destination-format", lineno);
          steps.add(new FormatDate(lineno, directive, column, dstDatePattern));
        }
        break;

        // quantize   <[range1:range2)=value>,[]*
        case "quantize": {
          String column1 = getNextToken(tokenizer, command, "source-column", lineno);
          String column2 = getNextToken(tokenizer, command, "destination-column", lineno);
          String ranges = getNextToken(tokenizer, "\n", command, "destination-column", lineno);
          steps.add(new Quantization(lineno, directive, column1, column2, ranges));
        }
        break;

        // find-and-replace  
        case "find-and-replace" : {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String expression = getNextToken(tokenizer, "\n", command, "sed-script", lineno);
          steps.add(new FindAndReplace(lineno, directive, column, expression));
        }
        break;

        // parse-as-csv   []
        case "parse-as-csv" : {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String delimStr = getNextToken(tokenizer, command, "delimiter", lineno);
          char delimiter = delimStr.charAt(0);
          if (delimStr.startsWith("\\")) {
            String unescapedStr = StringEscapeUtils.unescapeJava(delimStr);
            if (unescapedStr == null) {
              throw new DirectiveParseException("Invalid delimiter for CSV Parser: " + delimStr);
            }
            delimiter = unescapedStr.charAt(0);
          }

          boolean hasHeader;
          String hasHeaderLinesOpt = getNextToken(tokenizer, "\n", command, "true|false", lineno, true);
          if (hasHeaderLinesOpt == null || hasHeaderLinesOpt.equalsIgnoreCase("false")) {
            hasHeader = false;
          } else {
            hasHeader = true;
          }
          CsvParser.Options opt = new CsvParser.Options(delimiter, true);
          steps.add(new CsvParser(lineno, directive, opt, column, hasHeader));
        }
        break;

        // parse-as-json  [depth]
        case "parse-as-json" : {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String depthOpt = getNextToken(tokenizer, "\n", command, "depth", lineno, true);
          int depth = Integer.MAX_VALUE;
          if (depthOpt != null && !depthOpt.isEmpty()) {
            try {
              depth = Integer.parseInt(depthOpt);
            } catch (NumberFormatException e) {
              throw new DirectiveParseException(
                String.format("Depth '%s' specified is not a valid number.", depthOpt)
              );
            }
          }
          steps.add(new JsParser(lineno, directive, column, depth));
        }
        break;

        // parse-as-avro    [version]
        case "parse-as-avro" : {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String schemaId = getNextToken(tokenizer, command, "schema-id", lineno);
          String type = getNextToken(tokenizer, command, "type", lineno);
          if (!"json".equalsIgnoreCase(type) && !"binary".equalsIgnoreCase(type)) {
           throw new DirectiveParseException(
             String.format("Parsing AVRO can be either of type 'json' or 'binary'")
           );
          }
          String versionOpt = getNextToken(tokenizer, "\n", command, "depth", lineno, true);
          int version = -1;
          if (versionOpt != null && !versionOpt.isEmpty()) {
            try {
              version = Integer.parseInt(versionOpt);
            } catch (NumberFormatException e) {
              throw new DirectiveParseException(
                String.format("Version '%s' specified is not a valid number.", versionOpt)
              );
            }
          }
          steps.add(new ParseAvro(lineno, directive, column, schemaId, type, version));
        }
        break;

        // parse-as-protobuf    [version]
        case "parse-as-protobuf" : {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String schemaId = getNextToken(tokenizer, command, "schema-id", lineno);
          String recordName = getNextToken(tokenizer, command, "record-name", lineno);
          String versionOpt = getNextToken(tokenizer, "\n", command, "depth", lineno, true);
          int version = -1;
          if (versionOpt != null && !versionOpt.isEmpty()) {
            try {
              version = Integer.parseInt(versionOpt);
            } catch (NumberFormatException e) {
              throw new DirectiveParseException(
                String.format("Version '%s' specified is not a valid number.", versionOpt)
              );
            }
          }
          steps.add(new ParseProtobuf(lineno, directive, column, schemaId, recordName, version));
        }
        break;

        // json-path   
        case "json-path" : {
          String src = getNextToken(tokenizer, command, "source", lineno);
          String dest = getNextToken(tokenizer, command, "dest", lineno);
          String path = getNextToken(tokenizer, "\n", command, "json-path", lineno);
          steps.add(new JsPath(lineno, directive, src, dest, path));
        }
        break;

        // set-charset  
        case "set-charset" : {
          String column = getNextToken(tokenizer, command, "column", lineno);
          String charset = getNextToken(tokenizer, "\n", command, "charset", lineno, true);
          steps.add(new SetCharset(lineno, directive, column, charset));
        }
        break;

        // invoke-http  [,] 
[,
] case "invoke-http" : { String url = getNextToken(tokenizer, command, "url", lineno); String columnsOpt = getNextToken(tokenizer, command, "columns", lineno); List columns = new ArrayList<>(); for (String column : columnsOpt.split(",")) { columns.add(column.trim()); } String headers = getNextToken(tokenizer, "\n", command, "headers", lineno, true); steps.add(new InvokeHttp(lineno, directive, url, columns, headers)); } break; // set-record-delim [] case "set-record-delim" : { String column = getNextToken(tokenizer, command, "column", lineno); String delimiter = getNextToken(tokenizer, command, "delimiter", lineno); String limitStr = getNextToken(tokenizer, "\n", column, "limit", lineno, true); if (limitStr == null || limitStr.isEmpty()) { limitStr = "1"; } try { int limit = Integer.parseInt(limitStr); steps.add(new SetRecordDelimiter(lineno, directive, column, delimiter, limit)); } catch (NumberFormatException e) { throw new DirectiveParseException( String.format("Limit '%s' specified is not a number.", limitStr) ); } } break; // parse-as-fixed-length [] case "parse-as-fixed-length" : { String column = getNextToken(tokenizer, command, "column", lineno); String widthStr = getNextToken(tokenizer, command, "widths", lineno); String padding = getNextToken(tokenizer, "\n", column, "padding", lineno, true); if (padding == null || padding.isEmpty()) { padding = null; // Add space as padding. 
} else { padding = StringUtils.substringBetween(padding, "'", "'"); } String[] widthsStr = widthStr.split(","); int[] widths = new int[widthsStr.length]; int i = 0; for (String w : widthsStr) { try { widths[i] = Integer.parseInt(StringUtils.deleteWhitespace(w)); } catch (NumberFormatException e) { throw new DirectiveParseException( String.format("Width specified '%s' at location %d is not a number.", w, i) ); } ++i; } steps.add(new FixedLengthParser(lineno, directive, column, widths, padding)); } break; // split-to-rows case "split-to-rows" : { String column = getNextToken(tokenizer, command, "column", lineno); String regex = getNextToken(tokenizer, "\n", "separator", lineno); steps.add(new SplitToRows(lineno, directive, column, regex)); } break; // split-to-columns case "split-to-columns" : { String column = getNextToken(tokenizer, command, "column", lineno); String regex = getNextToken(tokenizer, "\n", "regex", lineno); steps.add(new SplitToColumns(lineno, directive, column, regex)); } break; // parse-xml-to-json [] case "parse-xml-to-json" : { String column = getNextToken(tokenizer, command, "column", lineno); String depthOpt = getNextToken(tokenizer, "\n", command, "depth", lineno, true); int depth = Integer.MAX_VALUE; try { if(depthOpt != null && !depthOpt.isEmpty()) { depth = Integer.parseInt(depthOpt); } } catch (NumberFormatException e) { throw new DirectiveParseException(e.getMessage()); } steps.add(new XmlToJson(lineno, directive, column, depth)); } break; // parse-as-xml case "parse-as-xml" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new XmlParser(lineno, directive, column)); } break; // parse-as-excel case "parse-as-excel" : { String column = getNextToken(tokenizer, command, "column", lineno); String sheet = getNextToken(tokenizer, "\n", command, "sheet", lineno, true); steps.add(new ParseExcel(lineno, directive, column, sheet)); } break; // xpath case "xpath" : { String column = getNextToken(tokenizer, command, 
"column", lineno); String destination = getNextToken(tokenizer, command, "destination", lineno); String xpath = getNextToken(tokenizer, "\n", command, "xpath", lineno); steps.add(new XPathElement(lineno, directive, column, destination, xpath)); } break; // xpath-array case "xpath-array" : { String column = getNextToken(tokenizer, command, "column", lineno); String destination = getNextToken(tokenizer, command, "destination", lineno); String xpath = getNextToken(tokenizer, "\n", command, "xpath", lineno); steps.add(new XPathArrayElement(lineno, directive, column, destination, xpath)); } break; // flatten [,,,...] case "flatten" : { String cols = getNextToken(tokenizer, command, "columns", lineno); if (cols.equalsIgnoreCase("*")) { throw new DirectiveParseException( "Flatten does not support wildcard ('*') flattening. Please specify column names" ); } String[] columns = cols.split(","); for (String column : columns) { if (column.trim().equalsIgnoreCase("*")) { throw new DirectiveParseException( "Flatten does not support wildcard ('*') flattening. 
Please specify column names" ); } } steps.add(new Flatten(lineno, directive, columns)); } break; // copy [force] case "copy" : { String source = getNextToken(tokenizer, command, "source", lineno); String destination = getNextToken(tokenizer, command, "destination", lineno); String forceOpt = getNextToken(tokenizer, "\n", command, "force", lineno, true); boolean force = false; if (forceOpt != null && forceOpt.equalsIgnoreCase("true")) { force = true; } steps.add(new Copy(lineno, directive, source, destination, force)); } break; // fill-null-or-empty case "fill-null-or-empty" : { String column = getNextToken(tokenizer, command, "column", lineno); String value = getNextToken(tokenizer, command, "fixed-value", lineno); if (value != null && value.isEmpty()) { throw new DirectiveParseException( "Fixed value cannot be a empty string" ); } steps.add(new FillNullOrEmpty(lineno, directive, column, value)); } break; // cut-character case "cut-character" : { String source = getNextToken(tokenizer, command, "source", lineno); String destination = getNextToken(tokenizer, command, "destination", lineno); String range = getNextToken(tokenizer, command, "range", lineno); steps.add(new CharacterCut(lineno, directive, source, destination, range)); } break; // generate-uuid case "generate-uuid" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new GenerateUUID(lineno, directive, column)); } break; // url-encode case "url-encode" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new UrlEncode(lineno, directive, column)); } break; // url-decode case "url-decode" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new UrlEncode(lineno, directive, column)); } break; // parse-as-log case "parse-as-log" : { String column = getNextToken(tokenizer, command, "column", lineno); String format = getNextToken(tokenizer, "\n", command, "format", lineno); steps.add(new ParseLog(lineno, directive, column, 
format)); } break; // parse-as-date [] case "parse-as-date" : { String column = getNextToken(tokenizer, command, "column", lineno); String timezone = getNextToken(tokenizer, "\n", command, "timezone", lineno, true); steps.add(new ParseDate(lineno, directive, column, timezone)); } break; // parse-as-simple-date case "parse-as-simple-date" : { String column = getNextToken(tokenizer, command, "column", lineno); String pattern = getNextToken(tokenizer, "\n", command, "format", lineno); steps.add(new ParseSimpleDate(lineno, directive, column, pattern)); } break; // diff-date case "diff-date" : { String column1 = getNextToken(tokenizer, command, "column1", lineno); String column2 = getNextToken(tokenizer, command, "column2", lineno); String destColumn = getNextToken(tokenizer, "\n", command, "destColumn", lineno); steps.add(new DiffDate(lineno, directive, column1, column2, destColumn)); } break; // keep [,]* case "keep" : { String columns = getNextToken(tokenizer, command, "columns", lineno); steps.add(new Keep(lineno, directive, columns.split(","))); } break; // parse-as-hl7 [] case "parse-as-hl7" : { String column = getNextToken(tokenizer, command, "column", lineno); String depthOpt = getNextToken(tokenizer, "\n", command, "depth", lineno, true); int depth = Integer.MAX_VALUE; try { if (depthOpt != null && !depthOpt.isEmpty()) { depth = Integer.parseInt(depthOpt); } } catch (NumberFormatException e) { throw new DirectiveParseException(e.getMessage()); } steps.add(new HL7Parser(lineno, directive, column, depth)); } break; // split-email case "split-email" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new SplitEmail(lineno, directive, column)); } break; // swap case "swap" : { String column1 = getNextToken(tokenizer, command, "column1", lineno); String column2 = getNextToken(tokenizer, command, "column2", lineno); steps.add(new Swap(lineno, directive, column1, column2)); } break; // hash [encode] case "hash" : { String column = 
getNextToken(tokenizer, command, "column", lineno); String algorithm = getNextToken(tokenizer, command, "algorithm", lineno); String encodeOpt = getNextToken(tokenizer, "\n", command, "encode", lineno, true); if (!MessageHash.isValid(algorithm)) { throw new DirectiveParseException( String.format("Algorithm '%s' specified in directive '%s' at line %d is not supported", algorithm, command, lineno) ); } boolean encode = true; if (encodeOpt.equalsIgnoreCase("false")) { encode = false; } try { MessageDigest digest = MessageDigest.getInstance(algorithm); steps.add(new MessageHash(lineno, directive, column, digest, encode)); } catch (NoSuchAlgorithmException e) { throw new DirectiveParseException( String.format("Unable to find algorithm specified '%s' in directive '%s' at line %d.", algorithm, command, lineno) ); } } break; // write-as-json case "write-as-json-map" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new WriteAsJsonMap(lineno, directive, column)); } break; // write-as-csv case "write-as-csv" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new WriteAsCSV(lineno, directive, column)); } break; // filter-rows-on condition-false // filter-rows-on condition-true // filter-rows-on empty-or-null-columns [,*] // filter-rows-on regex-match // filter-rows-on regex-not-match case "filter-rows-on" : { String cmd = getNextToken(tokenizer, command, "command", lineno); if (cmd.equalsIgnoreCase("condition-false")) { String condition = getNextToken(tokenizer, "\n", command, "condition", lineno); steps.add(new RecordConditionFilter(lineno, directive, condition, false)); } else if (cmd.equalsIgnoreCase("condition-true")) { String condition = getNextToken(tokenizer, "\n", command, "condition", lineno); steps.add(new RecordConditionFilter(lineno, directive, condition, true)); } else if (cmd.equalsIgnoreCase("empty-or-null-columns")) { String columns = getNextToken(tokenizer, "\n", command, "columns", lineno); 
steps.add(new RecordMissingOrNullFilter(lineno, directive, columns.split(","))); } else if (cmd.equalsIgnoreCase("regex-match")) { String column = getNextToken(tokenizer, command, "column", lineno); String pattern = getNextToken(tokenizer, "\n", command, "regex", lineno); steps.add(new RecordRegexFilter(lineno, directive, column, pattern, true)); } else if (cmd.equalsIgnoreCase("regex-not-match")) { String column = getNextToken(tokenizer, command, "column", lineno); String pattern = getNextToken(tokenizer, "\n", command, "regex", lineno); steps.add(new RecordRegexFilter(lineno, directive, column, pattern, false)); } else { throw new DirectiveParseException( String.format("Unknown option '%s' specified for filter-rows-on directive at line no %s", cmd, lineno) ); } } break; // parse-as-avro-file case "parse-as-avro-file": { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new ParseAvroFile(lineno, directive, column)); } break; // send-to-error case "send-to-error": { String condition = getNextToken(tokenizer, "\n", command, "condition", lineno); steps.add(new SendToError(lineno, directive, condition)); } break; // fail case "fail": { String condition = getNextToken(tokenizer, "\n", command, "condition", lineno); steps.add(new Fail(lineno, directive, condition)); } break; // text-distance case "text-distance" : { String method = getNextToken(tokenizer, command, "method", lineno); String column1 = getNextToken(tokenizer, command, "column1", lineno); String column2 = getNextToken(tokenizer, command, "column2", lineno); String destination = getNextToken(tokenizer, command, "destination", lineno); steps.add(new TextDistanceMeasure(lineno, directive, method, column1, column2, destination)); } break; // text-metric case "text-metric" : { String method = getNextToken(tokenizer, command, "method", lineno); String column1 = getNextToken(tokenizer, command, "column1", lineno); String column2 = getNextToken(tokenizer, command, "column2", lineno); 
String destination = getNextToken(tokenizer, command, "destination", lineno); steps.add(new TextMetricMeasure(lineno, directive, method, column1, column2, destination)); } break; // catalog-lookup ICD-9|ICD-10 case "catalog-lookup" : { String type = getNextToken(tokenizer, command, "type", lineno); String column = getNextToken(tokenizer, command, "column", lineno); if (!type.equalsIgnoreCase("ICD-9") && !type.equalsIgnoreCase("ICD-10-2016") && !type.equalsIgnoreCase("ICD-10-2017")) { throw new IllegalArgumentException("Invalid ICD type - should be 9 (ICD-9) or 10 (ICD-10-2016 " + "or ICD-10-2017)."); } else { ICDCatalog catalog = new ICDCatalog(type.toLowerCase()); if (!catalog.configure()) { throw new DirectiveParseException( String.format("Failed to configure ICD StaticCatalog. Check with your administrator") ); } steps.add(new CatalogLookup(lineno, directive, catalog, column)); } } break; // table-lookup case "table-lookup" : { String column = getNextToken(tokenizer, command, "column", lineno); String table = getNextToken(tokenizer, command, "table", lineno); steps.add(new TableLookup(lineno, directive, column, table)); } break; // stemming case "stemming" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new Stemming(lineno, directive, column)); } break; // columns case "columns-replace" : { String sed = getNextToken(tokenizer, command, "sed-expression", lineno); steps.add(new ColumnsReplace(lineno, directive, sed)); } break; // extract-regex-groups case "extract-regex-groups" : { String column = getNextToken(tokenizer, command, "column", lineno); String regex = getNextToken(tokenizer, command, "regex", lineno); steps.add(new ExtractRegexGroups(lineno, directive, column, regex)); } break; // split-url case "split-url" : { String column = getNextToken(tokenizer, command, "column", lineno); steps.add(new SplitURL(lineno, directive, column)); } break; // cleanse-column-names case "cleanse-column-names" : { steps.add(new 
CleanseColumnNames(lineno, directive)); } break; // change-column-case case "change-column-case" : { String casing = getNextToken(tokenizer, command, "case", lineno); boolean toLower = false; if (casing == null || casing.isEmpty() || casing.equalsIgnoreCase("lower") || casing.equalsIgnoreCase("lowercase")) { toLower = true; } steps.add(new ChangeColCaseNames(lineno, directive, toLower)); } break; // set-column case "set-column" : { String column = getNextToken(tokenizer, command, "column", lineno); String expr = getNextToken(tokenizer, "\n", command, "expression", lineno); steps.add(new SetColumn(lineno, directive, column, expr)); } break; // encode case "encode" : { String type = getNextToken(tokenizer, command, "type", lineno); String column = getNextToken(tokenizer, command, "column", lineno); type = type.toUpperCase(); if (!type.equals("BASE64") && !type.equals("BASE32") && !type.equals("HEX")) { throw new DirectiveParseException( String.format("Type of encoding specified '%s' is not supported. Supports base64, base32 & hex.", type) ); } steps.add(new Encode(lineno, directive, Encode.Type.valueOf(type), column)); } break; // decode case "decode" : { String type = getNextToken(tokenizer, command, "type", lineno); String column = getNextToken(tokenizer, command, "column", lineno); type = type.toUpperCase(); if (!type.equals("BASE64") && !type.equals("BASE32") && !type.equals("HEX")) { throw new DirectiveParseException( String.format("Type of decoding specified '%s' is not supported. 
Supports base64, base32 & hex.", type) ); } steps.add(new Decode(lineno, directive, Decode.Type.valueOf(type), column)); } break; //trim case "trim": { String col = getNextToken(tokenizer, command, "col", lineno); steps.add(new Trim(lineno, directive, col)); } break; //ltrim case "ltrim": { String col = getNextToken(tokenizer, command, "col", lineno); steps.add(new LeftTrim(lineno, directive, col)); } break; //rtrim case "rtrim": { String col = getNextToken(tokenizer, command, "col", lineno); steps.add(new RightTrim(lineno, directive, col)); } break; default: throw new DirectiveParseException( String.format("Unknown directive '%s' found in the directive at line %d", command, lineno) ); } lineno++; } return steps; } // If there are more tokens, then it proceeds with parsing, else throws exception. private String getNextToken(StringTokenizer tokenizer, String directive, String field, int lineno) throws DirectiveParseException { return getNextToken(tokenizer, null, directive, field, lineno, false); } private String getNextToken(StringTokenizer tokenizer, String delimiter, String directive, String field, int lineno) throws DirectiveParseException { return getNextToken(tokenizer, delimiter, directive, field, lineno, false); } private String getNextToken(StringTokenizer tokenizer, String delimiter, String directive, String field, int lineno, boolean optional) throws DirectiveParseException { String value = null; if (tokenizer.hasMoreTokens()) { if (delimiter == null) { value = tokenizer.nextToken().trim(); } else { value = tokenizer.nextToken(delimiter).trim(); } } else { if (!optional) { String usage = usageRegistry.getUsage(directive); throw new DirectiveParseException( String.format("Missing field '%s' at line number %d for directive <%s> (usage: %s)", field, lineno, directive, usage) ); } } return value; } /** * @return List of steps to executed in the order they are specified. * @throws ParseException throw in case of parsing exception of specification. 
*/ @Override public List getSteps() throws DirectiveParseException { return parse(); } }