/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gobblin.converter.grok;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonNull;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import io.thekraken.grok.api.Grok;
import io.thekraken.grok.api.Match;
import io.thekraken.grok.api.exception.GrokException;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.converter.Converter;
import org.apache.gobblin.converter.DataConversionException;
import org.apache.gobblin.converter.SchemaConversionException;
import org.apache.gobblin.converter.SingleRecordIterable;
import org.apache.gobblin.util.DatasetFilterUtils;
/**
 * GrokToJsonConverter accepts an already deserialized text row as a String.
 *
 * Converts text to JSON based on a Grok pattern. The schema is represented as a JsonArray, the same form used by CsvToJsonConverter.
 * Each text record is represented by a String.
 * The converter only supports Grok patterns in which every group is named, because the group names are used as column names.
*
 * The following config properties can be set:
 *
 * The Grok pattern to use for the conversion:
 * converter.grokToJsonConverter.grokPattern="^%{IPORHOST:clientip} (?:-|%{USER:ident}) (?:-|%{USER:auth}) \[%{HTTPDATE:timestamp}\] \"(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|-)\" %{NUMBER:response} (?:-|%{NUMBER:bytes})"
 *
 * Path to a file containing the base Grok patterns that can be referenced from the converter's Grok pattern (if not set, the bundled defaults are used):
 * converter.grokToJsonConverter.baseGrokPatternsFile=
 *
 * A comma-separated list of regexes applied to each field value; values matching any of them are converted to JSON null:
 * converter.grokToJsonConverter.nullStringRegexes="[-\s]"
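 *
 * A complete illustrative job configuration (the pattern and values here are hypothetical,
 * chosen to match the example schema below):
 * <pre>
 *   converter.classes=org.apache.gobblin.converter.grok.GrokToJsonConverter
 *   converter.grokToJsonConverter.grokPattern=%{WORD:Day} %{NUMBER:Pageviews}
 *   converter.grokToJsonConverter.nullStringRegexes="-"
 * </pre>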
*
 * Example of schema:
 * [
 *   {
 *     "columnName": "Day",
 *     "comment": "",
 *     "isNullable": "true",
 *     "dataType": {
 *       "type": "string"
 *     }
 *   },
 *   {
 *     "columnName": "Pageviews",
 *     "comment": "",
 *     "isNullable": "true",
 *     "dataType": {
 *       "type": "long"
 *     }
 *   }
 * ]
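 *
 * With the schema above and the hypothetical pattern {@code %{WORD:Day} %{NUMBER:Pageviews}},
 * the input line {@code "Monday 1024"} would be converted to the JSON record
 * {@code {"Day":"Monday","Pageviews":1024}}.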
*/
public class GrokToJsonConverter extends Converter<String, JsonArray, String, JsonObject> {
  private static final Logger LOG = LoggerFactory.getLogger(GrokToJsonConverter.class);
  private static final JsonParser JSON_PARSER = new JsonParser();

  private static final String COLUMN_NAME_KEY = "columnName";
  private static final String DATA_TYPE = "dataType";
  private static final String TYPE_KEY = "type";
  private static final String NULLABLE = "isNullable";

  public static final String GROK_PATTERN = "converter.grokToJsonConverter.grokPattern";
  public static final String BASE_PATTERNS_FILE = "converter.grokToJsonConverter.baseGrokPatternsFile";
  public static final String NULLSTRING_REGEXES = "converter.grokToJsonConverter.nullStringRegexes";
  public static final String DEFAULT_GROK_PATTERNS_FILE = "/grok/grok-base-patterns";

  private List<Pattern> nullStringRegexes;
  private Grok grok;
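
  /**
   * Compiles the configured Grok pattern on top of the base pattern definitions.
   * The base patterns are read from the file named by
   * {@code converter.grokToJsonConverter.baseGrokPatternsFile} if set, otherwise from
   * the bundled {@code /grok/grok-base-patterns} resource.
   */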
  @Override
  public Converter<String, JsonArray, String, JsonObject> init(WorkUnitState workUnit) {
    super.init(workUnit);
    String pattern = workUnit.getProp(GROK_PATTERN);
    String patternsFile = workUnit.getProp(BASE_PATTERNS_FILE);
    this.nullStringRegexes = DatasetFilterUtils.getPatternsFromStrings(workUnit.getPropAsList(NULLSTRING_REGEXES, ""));

    InputStreamReader grokPatterns;
    try {
      if (patternsFile == null) {
        grokPatterns = new InputStreamReader(getClass().getResourceAsStream(DEFAULT_GROK_PATTERNS_FILE), "UTF8");
      } else {
        grokPatterns = new InputStreamReader(new FileInputStream(patternsFile), "UTF8");
      }
      grok = new Grok();
      grok.addPatternFromReader(grokPatterns);
      grok.compile(pattern);
    } catch (GrokException | FileNotFoundException | UnsupportedEncodingException e) {
      throw new RuntimeException("Error initializing Grok", e);
    }
    return this;
  }
  @Override
  public JsonArray convertSchema(String inputSchema, WorkUnitState workUnit)
      throws SchemaConversionException {
    Preconditions.checkNotNull(inputSchema, "inputSchema is required.");
    return JSON_PARSER.parse(inputSchema).getAsJsonArray();
  }
  /**
   * Converts a text record (String) to JSON based on the compiled Grok expression.
   * Fields are mapped from text to JSON by the Grok SEMANTIC, i.e. the identifier given
   * to each piece of text matched in the Grok expression (e.g. {@code clientip} in
   * {@code %{IPORHOST:clientip}}), which must correspond to a columnName in the output schema.
   * For example:
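   * <pre>
   *   pattern: %{IP:clientip} %{NUMBER:response}        (hypothetical pattern, for illustration)
   *   input:   "127.0.0.1 200"
   *   output:  {"clientip":"127.0.0.1","response":200}  (with "response" typed "int" in the schema)
   * </pre>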
   * {@inheritDoc}
   * @see Converter#convertRecord(Object, Object, WorkUnitState)
   */
  @Override
  public Iterable<JsonObject> convertRecord(JsonArray outputSchema, String inputRecord, WorkUnitState workUnit)
      throws DataConversionException {
    JsonObject outputRecord = createOutput(outputSchema, inputRecord);
    LOG.debug("Converted into {}", outputRecord);
    return new SingleRecordIterable<>(outputRecord);
  }
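
  /**
   * Builds the output JSON record by looking up each schema column in the Grok captures
   * and coercing the captured string to the declared {@code dataType.type}. Unrecognized
   * types fall through to string. For example, a capture of "512" for a column typed
   * "long" is emitted as the JSON number 512.
   */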
  @VisibleForTesting
  JsonObject createOutput(JsonArray outputSchema, String inputRecord)
      throws DataConversionException {
    JsonObject outputRecord = new JsonObject();
    Match gm = grok.match(inputRecord);
    gm.captures();
    JsonElement capturesJson = JSON_PARSER.parse(gm.toJson());

    for (JsonElement anOutputSchema : outputSchema) {
      JsonObject outputSchemaJsonObject = anOutputSchema.getAsJsonObject();
      String key = outputSchemaJsonObject.get(COLUMN_NAME_KEY).getAsString();
      String type = outputSchemaJsonObject.getAsJsonObject(DATA_TYPE).get(TYPE_KEY).getAsString();
      if (isFieldNull(capturesJson, key)) {
        if (!outputSchemaJsonObject.get(NULLABLE).getAsBoolean()) {
          throw new DataConversionException(
              "Field " + key + " is null or missing, but the schema marks it as non-nullable.");
        }
        outputRecord.add(key, JsonNull.INSTANCE);
      } else {
        JsonElement jsonElement = capturesJson.getAsJsonObject().get(key);
        switch (type) {
          case "int":
            outputRecord.addProperty(key, jsonElement.getAsInt());
            break;
          case "long":
            outputRecord.addProperty(key, jsonElement.getAsLong());
            break;
          case "double":
            outputRecord.addProperty(key, jsonElement.getAsDouble());
            break;
          case "float":
            outputRecord.addProperty(key, jsonElement.getAsFloat());
            break;
          case "boolean":
            outputRecord.addProperty(key, jsonElement.getAsBoolean());
            break;
          case "string":
          default:
            outputRecord.addProperty(key, jsonElement.getAsString());
        }
      }
    }
    return outputRecord;
  }
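
  /**
   * A field is treated as null when it is absent from the Grok captures, or when its
   * string value matches any of the configured null-string regexes. For example, with
   * converter.grokToJsonConverter.nullStringRegexes="-" (an illustrative value), a
   * captured "-" is emitted as JSON null.
   */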
  private boolean isFieldNull(JsonElement capturesJson, String key) {
    JsonObject jsonObject = capturesJson.getAsJsonObject();
    if (!jsonObject.has(key)) {
      return true;
    }
    for (Pattern pattern : this.nullStringRegexes) {
      if (pattern.matcher(jsonObject.get(key).getAsString()).matches()) {
        return true;
      }
    }
    return false;
  }
}