// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// org.apache.hadoop.hive.ql.hooks.LineageInfo Maven / Gradle / Ivy

// There is a newer version: 4.0.0
// Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.hooks;

import java.io.Serializable;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.collections.SetUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.stats.StatsUtils;

/**
 * This class contains the lineage information that is passed
 * to the PreExecution hook.
 */
public class LineageInfo implements Serializable {

  /**
   * Serial version id.
   */
  private static final long serialVersionUID = 1L;

  /**
   * Classifies how a column is derived from the columns it depends on:
   * 1. SIMPLE - the column is derived from another table column
   *             with no transformations e.g. T2.c1 = T1.c1.
   * 2. EXPRESSION - the column is derived from a UDF, UDAF, UDTF or
   *                 set operations like union on columns on other tables
   *                 e.g. T2.c1 = T1.c1 + T3.c1.
   * 3. SCRIPT - the column is derived from the output
   *             of a user script through a TRANSFORM, MAP or REDUCE syntax
   *             or from the output of a PTF chain execution.
   */
  public static enum DependencyType {
    /** Derived from another table column with no transformation. */
    SIMPLE,

    /** Derived through a UDF, UDAF, UDTF or a set operation over other columns. */
    EXPRESSION,

    /** Derived from user script output (TRANSFORM, MAP, REDUCE) or a PTF chain. */
    SCRIPT
  }

  /**
   * Describes where the output of a query lands: either a table or a single
   * partition within a table. One container type covers both cases; a
   * non-null partition marks the partition case.
   */
  public static class DataContainer implements Serializable {

    /**
     * Serial version id.
     */
    private static final long serialVersionUID = 1L;

    /**
     * The table supplied at construction time.
     */
    private final Table tab;

    /**
     * The partition, or null when this container stands for a whole table.
     */
    private final Partition part;

    /**
     * Creates a container for a table without a partition.
     *
     * @param tab The associated table.
     */
    public DataContainer(Table tab) {
      this(tab, null);
    }

    /**
     * Creates a container for one partition of a table.
     *
     * @param tab The table the partition belongs to.
     * @param part The associated partition; null makes this a table container.
     */
    public DataContainer(Table tab, Partition part) {
      this.tab = tab;
      this.part = part;
    }

    /**
     * Tells whether this container holds a partition.
     *
     * @return boolean TRUE if a partition was supplied, FALSE otherwise.
     */
    public boolean isPartition() {
      return part != null;
    }

    /**
     * @return the table given at construction time
     */
    public Table getTable() {
      return tab;
    }

    /**
     * @return the partition, or null for a table container
     */
    public Partition getPartition() {
      return part;
    }

    /**
     * Renders a partition container as the name returned by
     * StatsUtils.getFullyQualifiedTableName for the partition's database and
     * table, followed by "@" and the partition values; renders a table
     * container as Warehouse.getQualifiedName of the table.
     */
    @Override
    public String toString() {
      if (!isPartition()) {
        return Warehouse.getQualifiedName(tab);
      }
      String qualifiedTable =
          StatsUtils.getFullyQualifiedTableName(part.getDbName(), part.getTableName());
      return qualifiedTable + "@" + part.getValues();
    }
  }

  /**
   * Lookup key of the dependency index: the (DataContainer, FieldSchema)
   * tuple that a Dependency structure is registered under.
   *
   * Two keys are equal only when they hold the very same container and field
   * instances; equals compares both members by reference, not by value.
   */
  public static class DependencyKey implements Serializable {

    /**
     * Serial version id.
     */
    private static final long serialVersionUID = 1L;

    /**
     * The data container half of the key.
     */
    private final DataContainer dc;

    /**
     * The field schema half of the key.
     */
    private final FieldSchema fld;

    /**
     * Builds a key from its two parts.
     *
     * @param dc The associated data container.
     * @param fld The associated field schema.
     */
    public DependencyKey(DataContainer dc, FieldSchema fld) {
      this.dc = dc;
      this.fld = fld;
    }

    /**
     * @return the data container of this key
     */
    public DataContainer getDataContainer() {
      return dc;
    }

    /**
     * @return the field schema of this key
     */
    public FieldSchema getFieldSchema() {
      return fld;
    }

    /**
     * Combines the hash codes of both members with the usual 31 multiplier;
     * a null member contributes 0.
     */
    @Override
    public int hashCode() {
      int containerHash = (dc == null) ? 0 : dc.hashCode();
      int fieldHash = (fld == null) ? 0 : fld.hashCode();
      // Same value as the step-wise form: 31 * (31 * 1 + containerHash) + fieldHash.
      return 31 * (31 + containerHash) + fieldHash;
    }

    /**
     * Keys are equal when the other object has exactly this class and refers
     * to the same container and field instances (identity comparison).
     */
    @Override
    public boolean equals(Object obj) {
      if (obj == this) {
        return true;
      }
      if (obj == null || obj.getClass() != getClass()) {
        return false;
      }
      DependencyKey that = (DependencyKey) obj;
      // Reference comparison on purpose: value-equal but distinct parts do not match.
      return dc == that.dc && fld == that.fld;
    }

    /**
     * @return the container and the field separated by a colon
     */
    @Override
    public String toString() {
      return new StringBuilder().append(dc).append(":").append(fld).toString();
    }
  }

  /**
   * Describes a base column: the aliased table it comes from plus the
   * metastore column, compared and hashed by value.
   */
  public static class BaseColumnInfo implements Serializable {

    /**
     * Serial version id.
     */
    private static final long serialVersionUID = 1L;

    /**
     * The table and its alias, kept together in a separate class.
     */
    private TableAliasInfo tabAlias;

    /**
     * The metastore column information. A null column denotes that the
     * expression depends on the row of the table rather than on a particular
     * column, as happens for count(1).
     */
    private FieldSchema column;

    /**
     * @return the tabAlias
     */
    public TableAliasInfo getTabAlias() {
      return this.tabAlias;
    }

    /**
     * @param tabAlias the tabAlias to set
     */
    public void setTabAlias(TableAliasInfo tabAlias) {
      this.tabAlias = tabAlias;
    }

    /**
     * @return the column
     */
    public FieldSchema getColumn() {
      return this.column;
    }

    /**
     * @param column the column to set
     */
    public void setColumn(FieldSchema column) {
      this.column = column;
    }

    /**
     * @return the table alias info and the column separated by a colon
     */
    @Override
    public String toString() {
      return new StringBuilder().append(tabAlias).append(":").append(column).toString();
    }

    /**
     * Sums the hash codes of the column and the table alias; a missing column
     * counts as 7 and a missing table alias as 11.
     */
    @Override
    public int hashCode() {
      int columnHash = 7;
      if (column != null) {
        columnHash = column.hashCode();
      }
      int aliasHash = 11;
      if (tabAlias != null) {
        aliasHash = tabAlias.hashCode();
      }
      return columnHash + aliasHash;
    }

    /**
     * Two instances are equal when both their columns and their table aliases
     * are equal; null members only match null members.
     */
    @Override
    public boolean equals(Object obj) {
      if (obj == this) {
        return true;
      }
      if (!(obj instanceof BaseColumnInfo)) {
        return false;
      }
      BaseColumnInfo that = (BaseColumnInfo) obj;
      boolean sameColumn =
          (column == null) ? that.column == null : column.equals(that.column);
      if (!sameColumn) {
        return false;
      }
      if (tabAlias == null) {
        return that.tabAlias == null;
      }
      return tabAlias.equals(that.tabAlias);
    }
  }

  /**
   * Pairs a metastore table with the alias it is referred to by. Both members
   * may be null; hashCode, equals and toString all accept that.
   */
  public static class TableAliasInfo implements Serializable {

    /**
     * Serial version id.
     */
    private static final long serialVersionUID = 1L;

    /**
     * The alias for the table.
     */
    private String alias;

    /**
     * The metastore table information.
     */
    private Table table;

    /**
     * @return the alias
     */
    public String getAlias() {
      return alias;
    }

    /**
     * @param alias the alias to set
     */
    public void setAlias(String alias) {
      this.alias = alias;
    }

    /**
     * @return the table
     */
    public Table getTable() {
      return table;
    }

    /**
     * @param table the table to set
     */
    public void setTable(Table table) {
      this.table = table;
    }

    /**
     * Renders the qualified table name followed by the alias in parentheses.
     * An unset table is printed as "null".
     */
    @Override
    public String toString() {
      // hashCode and equals accept a null table, so toString does as well
      // instead of handing null to Warehouse.getQualifiedName.
      String tableName = (table == null) ? "null" : Warehouse.getQualifiedName(table);
      return tableName + "(" + alias + ")";
    }

    /**
     * Sums the hash codes of the alias and the table; a missing alias counts
     * as 7 and a missing table as 11.
     */
    @Override
    public int hashCode() {
      return (alias != null ? alias.hashCode() : 7)
        + (table != null ? table.hashCode() : 11);
    }

    /**
     * Two instances are equal when their aliases and their tables are equal;
     * null members only match null members.
     */
    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (!(obj instanceof TableAliasInfo)) {
        return false;
      }
      TableAliasInfo tabAlias = (TableAliasInfo) obj;
      return StringUtils.equals(alias, tabAlias.alias)
        && (table == null ? tabAlias.table == null : table.equals(tabAlias.table));
    }
  }

  /**
   * This class tracks the dependency information for the base column: the
   * kind of dependency, the expression string, and the base columns involved.
   * All three members start out null until their setters are called.
   */
  public static class Dependency implements Serializable {

    /**
     * Serial version id.
     */
    private static final long serialVersionUID = 1L;

    /**
     * The type of dependency.
     */
    private DependencyType type;

    /**
     * Expression string for the dependency.
     */
    private String expr;

    /**
     * The set of base columns that the particular column depends on.
     */
    private Set<BaseColumnInfo> baseCols;

    /**
     * @return the type
     */
    public DependencyType getType() {
      return type;
    }

    /**
     * @param type the type to set
     */
    public void setType(DependencyType type) {
      this.type = type;
    }

    /**
     * @return the expr
     */
    public String getExpr() {
      return expr;
    }

    /**
     * Stores the result of StringInternUtils.internIfNotNull for the
     * expression (presumably an interned copy, with null passed through as
     * the helper's name suggests - confirm against StringInternUtils).
     *
     * @param expr the expr to set
     */
    public void setExpr(String expr) {
      this.expr = StringInternUtils.internIfNotNull(expr);
    }

    /**
     * @return the baseCols; null if setBaseCols has not been called
     */
    public Set<BaseColumnInfo> getBaseCols() {
      return baseCols;
    }

    /**
     * Stores the given set as is; no copy is made.
     *
     * @param baseCols the baseCols to set
     */
    public void setBaseCols(Set<BaseColumnInfo> baseCols) {
      this.baseCols = baseCols;
    }

    /**
     * @return the dependency type in square brackets followed by the base columns
     */
    @Override
    public String toString() {
      return "[" + type + "]" + baseCols;
    }
  }

  /**
   * This class tracks the predicate information for an operator: the
   * expression string and the base columns the predicate depends on.
   *
   * NOTE(review): unlike the sibling classes, this one declares no
   * serialVersionUID. Left as is, because adding one would change which
   * previously serialized instances can be read back.
   */
  public static class Predicate implements Serializable {

    /**
     * Expression string for the predicate.
     */
    private String expr;

    /**
     * The set of base columns that the predicate depends on. It is created
     * empty, keeps insertion order, and is never replaced.
     */
    private Set<BaseColumnInfo> baseCols = new LinkedHashSet<BaseColumnInfo>();

    /**
     * @return the expr
     */
    public String getExpr() {
      return expr;
    }

    /**
     * @param expr the expr to set
     */
    public void setExpr(String expr) {
      this.expr = expr;
    }

    /**
     * Returns the internal set itself, not a copy, so callers add base
     * columns through it.
     *
     * @return the baseCols
     */
    public Set<BaseColumnInfo> getBaseCols() {
      return baseCols;
    }

    /**
     * Sums the hash code of the base column set and of the expression; a
     * missing expression counts as 11.
     */
    @Override
    public int hashCode() {
      return baseCols.hashCode() + (expr != null ? expr.hashCode() : 11);
    }

    /**
     * Two predicates are equal when their expression strings are equal and
     * their base column sets are equal according to SetUtils.isEqualSet.
     */
    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (!(obj instanceof Predicate)) {
        return false;
      }
      Predicate cond = (Predicate) obj;
      return StringUtils.equals(cond.expr, expr)
        && SetUtils.isEqualSet(cond.baseCols, baseCols);
    }
  }

  /**
   * The map contains an index from the (datacontainer, columnname) to the
   * dependency vector for that tuple. This is used to generate the
   * dependency vectors during the walk of the operator tree. The constructor
   * backs it with a synchronized wrapper around a LinkedHashMap, so entries
   * keep their insertion order.
   */
  protected Map<DependencyKey, Dependency> index;

  /**
   * Constructor. Creates an empty dependency index.
   */
  public LineageInfo() {
    index = Collections.synchronizedMap(new LinkedHashMap<DependencyKey, Dependency>());
  }

  /**
   * Gets the dependency for a table, column tuple. DependencyKey compares its
   * members by reference, so a dependency is only found when the same dc and
   * col instances that were used to store it are passed in.
   *
   * @param dc The data container of the column whose dependency is being inspected.
   * @param col The column whose dependency is being inspected.
   * @return Dependency for that particular table, column tuple.
   *         null if no dependency is found.
   */
  public Dependency getDependency(DataContainer dc, FieldSchema col) {
    return index.get(new DependencyKey(dc, col));
  }

  /**
   * Puts the dependency for a table, column tuple, replacing any dependency
   * already stored under an equal key.
   *
   * @param dc The datacontainer whose dependency is being inserted.
   * @param col The column whose dependency is being inserted.
   * @param dep The dependency.
   */
  public void putDependency(DataContainer dc, FieldSchema col, Dependency dep) {
    index.put(new DependencyKey(dc, col), dep);
  }

  /**
   * Gets the entry set on this structure. The returned set is the map's own
   * entry-set view, not a copy.
   * NOTE(review): iterating a view of a synchronized map is not guarded by
   * the map's lock (see Collections.synchronizedMap) - confirm callers do not
   * iterate while another thread modifies this structure.
   *
   * @return LineageInfo entry set
   */
  public Set<Map.Entry<DependencyKey, Dependency>> entrySet() {
    return index.entrySet();
  }

  /**
   * Removes every recorded dependency.
   */
  public void clear() {
    index.clear();
  }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy