// com.tigergraph.spark.read.TigerGraphResultAccessor (tigergraph-spark-connector)
/**
* Copyright (c) 2024 TigerGraph Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.tigergraph.spark.read;
import com.fasterxml.jackson.core.JsonPointer;
import com.fasterxml.jackson.databind.JsonNode;
import com.tigergraph.spark.TigerGraphConnection;
import com.tigergraph.spark.log.LoggerFactory;
import com.tigergraph.spark.util.Utils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Stream;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
/**
* An extension of Spark dataframe schema. It can tell how to map the dataframe column to the exact
* json path of the TG query results.
*/
public class TigerGraphResultAccessor implements Serializable {

  // NOTE(review): was LoggerFactory.getLogger(TigerGraphConnection.class) — a copy-paste slip;
  // log records emitted here should be attributed to this class.
  private static final Logger logger = LoggerFactory.getLogger(TigerGraphResultAccessor.class);

  // Extract the first object of the first PRINT statement ("<row>:<key>" syntax with key empty).
  private static final String EXTRACT_FIRST_OBJ = "0:";

  // Per-column metadata: column name plus the JSON pointer used to pull its value
  // out of the RESTPP response row.
  private List<FieldMeta> fieldMetas = new ArrayList<>();

  // The Spark dataframe schema derived from (or provided for) the query results.
  private StructType schema;

  // For extracting a single object from multi-print query results ("<row>:<key>").
  private Boolean extractObj = false;
  private Integer rowNumber;
  private String objKey;

  /**
   * Build an accessor from a user-provided schema. Each column name will be used to search the
   * query-results JSON (recursively, if the key is not present at the first level).
   *
   * @param schema external dataframe schema supplied by the user
   * @param resultsExtract optional "row:key" selector of a single printed object
   */
  public static TigerGraphResultAccessor fromExternalSchema(
      StructType schema, String resultsExtract) {
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    accessor.parseResultsExtract(resultsExtract);
    accessor.schema = schema;
    StructField[] fields = schema.fields();
    for (StructField field : fields) {
      // isQueryable=true: fall back to a recursive search when "/<name>" is missing.
      accessor.add(field.name(), "/".concat(field.name()), true);
    }
    return accessor;
  }

  /**
   * Input meta format: { "Config": {...}, "Attributes": [...], "PrimaryId": {...}, "Name": "person"
   * }
   * https://docs.tigergraph.com/tigergraph-server/current/api/built-in-endpoints#_show_graph_schema_metadata
   *
   * <p>Parsed schema: v_id | attribute 1 | attribute 2 | ... | attribute n
   *
   * @param meta vertex-type metadata from the schema endpoint
   * @param columnPrune attribute names to keep; {@code null} keeps all attributes
   */
  public static TigerGraphResultAccessor fromVertexMeta(JsonNode meta, Set<String> columnPrune) {
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    StructType schema = new StructType();
    // Parse primary id type (defaults to STRING when absent)
    String vIdType = meta.path("PrimaryId").path("AttributeType").path("Name").asText("STRING");
    schema = schema.add("v_id", mapTGTypeToSparkType(vIdType));
    accessor.add("v_id", "/v_id", false);
    // Parse attributes type
    Iterator<JsonNode> attrIter = meta.path("Attributes").elements();
    while (attrIter.hasNext()) {
      JsonNode attr = attrIter.next();
      String attrName = attr.path("AttributeName").asText();
      String attrType = attr.path("AttributeType").path("Name").asText("STRING");
      if (!Utils.isEmpty(attrName) && (columnPrune == null || columnPrune.contains(attrName))) {
        schema = schema.add(attrName, mapTGTypeToSparkType(attrType));
        accessor.add(attrName, "/attributes/".concat(attrName), false);
      }
    }
    accessor.schema = schema;
    return accessor;
  }

  /**
   * Input meta format: { "IsDirected": false, "ToVertexTypeName": "company", "Config": {},
   * "Attributes": [...], "FromVertexTypeName": "person", "Name": "worksFor" }
   * https://docs.tigergraph.com/tigergraph-server/current/api/built-in-endpoints#_show_graph_schema_metadata
   *
   * <p>Parsed schema: from_type | from_id | to_type | to_id | attribute 1 | attribute 2 | ... |
   * attribute n
   *
   * @param meta edge-type metadata from the schema endpoint
   * @param columnPrune attribute names to keep; {@code null} keeps all attributes
   */
  public static TigerGraphResultAccessor fromEdgeMeta(JsonNode meta, Set<String> columnPrune) {
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    StructType fixedSchema =
        StructType.fromDDL("from_type STRING, from_id STRING, to_type STRING, to_id STRING");
    accessor
        .add("from_type", "/from_type", false)
        .add("from_id", "/from_id", false)
        .add("to_type", "/to_type", false)
        .add("to_id", "/to_id", false);
    StructType attrSchema = new StructType();
    // Parse attributes type
    Iterator<JsonNode> attrIter = meta.path("Attributes").elements();
    while (attrIter.hasNext()) {
      JsonNode attr = attrIter.next();
      String attrName = attr.path("AttributeName").asText();
      String attrType = attr.path("AttributeType").path("Name").asText("STRING");
      if (!Utils.isEmpty(attrName) && (columnPrune == null || columnPrune.contains(attrName))) {
        attrSchema = attrSchema.add(attrName, mapTGTypeToSparkType(attrType));
        accessor.add(attrName, "/attributes/".concat(attrName), false);
      }
    }
    accessor.schema = fixedSchema.merge(attrSchema);
    return accessor;
  }

  /*
   * Parse the schema of the query output based on query signature(including output format).
   *
   * 1) multi-print query: each print will be a row
   * 2) vertex expression set: each vertex will be a row
   * 3) MapAccum: both key and value are meaningful, will have 2 columns: key and value
   * 4) non-map accum: each element will be a row
   * 5) unknown/hard-to-determine schema: read the entire JSON object as a row
   *
   * Note: can use resultsExtract to extract one obj from multi-print query and flatten it.
   */
  public static TigerGraphResultAccessor fromQueryMeta(JsonNode meta, String resultsExtract) {
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    accessor.parseResultsExtract(resultsExtract);
    JsonNode objMeta = null;
    if (accessor.extractObj()) {
      // explicitly extract an obj from query output
      if (meta.get(accessor.getRowNumber()) != null
          && meta.get(accessor.getRowNumber()).get(accessor.getObjKey()) != null) {
        objMeta = meta.get(accessor.getRowNumber()).get(accessor.getObjKey());
      } else {
        return fromUnknownMeta(resultsExtract);
      }
    } else if (meta.isArray() && meta.size() == 1 && meta.get(0).size() == 1) {
      // Only single PRINT statement and only PRINT one object
      objMeta = meta.get(0).elements().next();
      resultsExtract = EXTRACT_FIRST_OBJ;
    }
    if (objMeta != null) {
      // Infer the results type
      if (objMeta.isTextual()) {
        String format = objMeta.asText();
        String[] nonMapAccumTypes = {
          "SumAccum",
          "MinAccum",
          "MaxAccum",
          "AvgAccum",
          "PercentileContAccum",
          "AndAccum",
          "OrAccum",
          "BitwiseAndAccum",
          "BitwiseOrAccum",
          "ListAccum",
          "SetAccum",
          "BagAccum",
          "ArrayAccum",
          "HeapAccum",
          "GroupByAccum"
        };
        if (Stream.of(nonMapAccumTypes).anyMatch((prefix) -> format.startsWith(prefix))) {
          return fromNonMapAccumQueryMeta(format, resultsExtract);
        } else if (format.startsWith("MapAccum")) {
          return fromMapAccumQueryMeta(format, resultsExtract);
        }
      } else if (objMeta.isArray()) {
        // typed vertex/edge set
        if (objMeta.size() == 1) {
          JsonNode ele = objMeta.elements().next();
          if ((ele.has("v_id") && ele.has("attributes"))) {
            return fromVertexSetQueryMeta(ele, resultsExtract);
          }
        }
        return fromNormalQueryMeta(objMeta, resultsExtract);
      }
    }
    return fromNormalQueryMeta(meta, null);
  }

  /**
   * When failed to get the schema, or no common schema for each row, and no external schema
   * provided, then use the default schema "results STRING" to parse the entire JSON object.
   *
   * <p>E.g.: [{"a":123},{"b":456,"c":789}] =>
   * | results |
   * | {"a":123} |
   * | {"b":456,"c":789} |
   */
  public static TigerGraphResultAccessor fromUnknownMeta(String resultsExtract) {
    logger.warn(
        "Failed to infer schema, use default schema 'results STRING'. You can set custom schema"
            + " manually based on the output JSON keys.");
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    accessor.parseResultsExtract(resultsExtract);
    StructType schema = new StructType();
    schema = schema.add("results", "STRING");
    // Empty pointer: the whole row object is the column value.
    accessor.add("results", "", false);
    accessor.schema = schema;
    return accessor;
  }

  public List<FieldMeta> getFieldMetas() {
    return this.fieldMetas;
  }

  /**
   * Parse the query result schema if and only if the schema is unique.
   *
   * <p>Input meta format: {"output": [{"a":"string", "b":"int"}, {"c":"string", "d":"int"}]} => |
   * results |
   *
   * <p>Input meta format: {"output": [{"a":"string", "b":"int"}, {"b":"int", "a":"string"}]} => | a
   * | b |
   */
  private static TigerGraphResultAccessor fromNormalQueryMeta(
      JsonNode meta, String resultsExtract) {
    // Jackson compares JsonNode via `equals(obj)` regardless of the order of the fields,
    // so a HashSet dedups rows that share the same schema.
    Set<JsonNode> dedupMeta = new HashSet<>();
    meta.iterator().forEachRemaining((ele) -> dedupMeta.add(ele));
    if (dedupMeta.size() != 1) {
      // different lines have different schemas, so we don't flatten it
      return fromUnknownMeta(resultsExtract);
    } else {
      TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
      accessor.parseResultsExtract(resultsExtract);
      StructType schema = new StructType();
      JsonNode uniqueMeta = dedupMeta.iterator().next();
      Iterator<Entry<String, JsonNode>> iter = uniqueMeta.fields();
      while (iter.hasNext()) {
        Entry<String, JsonNode> field = iter.next();
        schema =
            schema.add(field.getKey(), mapTGTypeToSparkType(field.getValue().asText("STRING")));
        accessor.add(field.getKey(), "/".concat(field.getKey()), false);
      }
      accessor.schema = schema;
      return accessor;
    }
  }

  /** Parse schema for vertex expression set */
  private static TigerGraphResultAccessor fromVertexSetQueryMeta(
      JsonNode meta, String resultsExtract) {
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    accessor.parseResultsExtract(resultsExtract);
    StructType schema = new StructType();
    Iterator<Entry<String, JsonNode>> iter = meta.fields();
    Iterator<Entry<String, JsonNode>> attrIter = null;
    while (iter.hasNext()) {
      Entry<String, JsonNode> field = iter.next();
      String key = field.getKey();
      // extract the attributes to top level
      if ("attributes".equals(key)) {
        attrIter = field.getValue().fields();
        continue;
      }
      String typeStr = field.getValue().asText("STRING");
      // HACK: GSQL incorrectly return the v_id type, mandatorily set it to STRING to avoid error.
      if ("v_id".equals(key)) {
        typeStr = "STRING";
      }
      schema = schema.add(key, mapTGTypeToSparkType(typeStr));
      accessor.add(key, "/".concat(key), false);
    }
    // Append the flattened attributes after the fixed top-level columns.
    if (attrIter != null) {
      while (attrIter.hasNext()) {
        Entry<String, JsonNode> attrField = attrIter.next();
        String attrKey = attrField.getKey();
        String attrTypeStr = attrField.getValue().asText("STRING");
        schema = schema.add(attrKey, mapTGTypeToSparkType(attrTypeStr));
        accessor.add(attrKey, "/attributes/".concat(attrKey), false);
      }
    }
    accessor.schema = schema;
    return accessor;
  }

  /** Non-map accums have no per-element column schema yet; fall back to "results STRING". */
  private static TigerGraphResultAccessor fromNonMapAccumQueryMeta(
      String format, String resultsExtract) {
    return fromUnknownMeta(resultsExtract);
  }

  /** MapAccum output: two STRING columns, "key" and "value". */
  private static TigerGraphResultAccessor fromMapAccumQueryMeta(
      String format, String resultsExtract) {
    TigerGraphResultAccessor accessor = new TigerGraphResultAccessor();
    accessor.parseResultsExtract(resultsExtract);
    StructType schema = new StructType();
    schema = schema.add("key", "STRING");
    schema = schema.add("value", "STRING");
    accessor.add("key", "/key", false);
    accessor.add("value", "/value", false);
    accessor.schema = schema;
    return accessor;
  }

  public StructType getSchema() {
    return schema;
  }

  /** Whether a single object should be extracted from the query output ("row:key" selector). */
  public Boolean extractObj() {
    return extractObj;
  }

  public Integer getRowNumber() {
    return rowNumber;
  }

  public String getObjKey() {
    return objKey;
  }

  /**
   * Map TigerGraph data types to Spark types. Currently all the complex data types are just mapped
   * to String.
   *
   * @param tgType TigerGraph attribute type name (case-insensitive); empty maps to STRING
   * @return the corresponding Spark {@link DataType}
   */
  protected static DataType mapTGTypeToSparkType(String tgType) {
    if (Utils.isEmpty(tgType)) return DataTypes.StringType;
    tgType = tgType.toLowerCase();
    switch (tgType) {
      case "int":
      case "uint":
        // Decimal(38,0) safely holds both INT and unsigned UINT ranges.
        return DataTypes.createDecimalType(38, 0);
      case "float":
        return DataTypes.FloatType;
      case "double":
        return DataTypes.DoubleType;
      case "bool":
      case "boolean":
        return DataTypes.BooleanType;
      case "fixed_binary":
        return DataTypes.BinaryType;
      default:
        return DataTypes.StringType;
    }
  }

  /**
   * E.g. MapAccum&lt;vertex, ListAccum&lt;int&gt;&gt; => [vertex, ListAccum]
   *
   * @param accumType the accumulator type signature
   * @return the element type names, or {@code null} (not yet implemented)
   */
  private static List<String> extractElementType(String accumType) {
    // TODO in next release
    return null;
  }

  private TigerGraphResultAccessor() {}

  /**
   * Add accessor(json pointer) for the column.
   *
   * @param colName dataframe column name
   * @param jsPath JSON pointer path to the value within a result row
   * @param isQueryable whether to search recursively when the path is missing
   */
  private TigerGraphResultAccessor add(String colName, String jsPath, Boolean isQueryable) {
    this.fieldMetas.add(FieldMeta.fromValue(colName, jsPath, isQueryable));
    return this;
  }

  /**
   * Parse the "row:key" selector. Empty input leaves extraction disabled; a malformed selector
   * (no colon, or a non-numeric row) raises {@link IllegalArgumentException}.
   */
  private void parseResultsExtract(String resultsExtract) {
    if (!Utils.isEmpty(resultsExtract)) {
      extractObj = true;
      int colonIndex = resultsExtract.indexOf(":");
      if (colonIndex != -1) {
        // Extract the index and key using substring
        try {
          rowNumber = Integer.parseInt(resultsExtract.substring(0, colonIndex));
          objKey = resultsExtract.substring(colonIndex + 1);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "Failed to parse row number from " + resultsExtract, e);
        }
      } else {
        throw new IllegalArgumentException(
            "The row number and object key should be separated by colon, got " + resultsExtract);
      }
    }
  }

  /** Metadata for a Spark dataframe column, and how to access it from RESTPP JSON response. */
  public static class FieldMeta implements Serializable {
    private String name;
    private JsonPointer jsPtr;
    private Boolean isQueryable;

    /** Retrieve the value by JSON key */
    protected static FieldMeta fromValue(String name, String jsPath, Boolean isQueryable) {
      FieldMeta meta = new FieldMeta();
      meta.name = name;
      meta.jsPtr = JsonPointer.compile(jsPath);
      meta.isQueryable = isQueryable;
      return meta;
    }

    /** Convert meta to lambda to retrieve value from JSON Object */
    public Function<JsonNode, JsonNode> toAccessor() {
      return (node) -> {
        if (node.isValueNode()) {
          // A scalar row has no structure to navigate; return it as-is.
          return node;
        } else {
          JsonNode tgt = node.at(jsPtr);
          if (tgt.isMissingNode()) {
            if (isQueryable) {
              // Recursive search by field name anywhere in the subtree.
              return node.findValue(name);
            } else {
              return null;
            }
          } else {
            return tgt;
          }
        }
      };
    }

    private FieldMeta() {}
  }
}