/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse;

import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.hooks.Entity;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.session.SessionState;


/**
 * A subclass of the {@link org.apache.hadoop.hive.ql.parse.SemanticAnalyzer} that just handles
 * update and delete statements.  It works by rewriting the updates and deletes into insert
 * statements (since they are actually inserts) and then patching them up to make them work as
 * updates and deletes instead.
 */
public class UpdateDeleteSemanticAnalyzer extends SemanticAnalyzer {

  boolean useSuper = false;

  public UpdateDeleteSemanticAnalyzer(HiveConf conf) throws SemanticException {
    super(conf);
  }

  @Override
  public void analyzeInternal(ASTNode tree) throws SemanticException {
    if (useSuper) {
      super.analyzeInternal(tree);
    } else {

      if (!SessionState.get().getTxnMgr().supportsAcid()) {
        throw new SemanticException(ErrorMsg.ACID_OP_ON_NONACID_TXNMGR.getMsg());
      }
      switch (tree.getToken().getType()) {
        case HiveParser.TOK_DELETE_FROM:
          analyzeDelete(tree);
          return;

        case HiveParser.TOK_UPDATE_TABLE:
          analyzeUpdate(tree);
          return;

        default:
          throw new RuntimeException("Asked to parse token " + tree.getName() + " in " +
              "UpdateDeleteSemanticAnalyzer");
      }
    }
  }

  @Override
  protected boolean updating() {
    return ctx.getAcidOperation() == AcidUtils.Operation.UPDATE;
  }

  @Override
  protected boolean deleting() {
    return ctx.getAcidOperation() == AcidUtils.Operation.DELETE;
  }

  private void analyzeUpdate(ASTNode tree) throws SemanticException {
    ctx.setAcidOperation(AcidUtils.Operation.UPDATE);
    reparseAndSuperAnalyze(tree);
  }

  private void analyzeDelete(ASTNode tree) throws SemanticException {
    ctx.setAcidOperation(AcidUtils.Operation.DELETE);
    reparseAndSuperAnalyze(tree);
  }

  private void reparseAndSuperAnalyze(ASTNode tree) throws SemanticException {
    List<? extends Node> children = tree.getChildren();
    // The first child should be the table we are updating or deleting from
    ASTNode tabName = (ASTNode)children.get(0);
    assert tabName.getToken().getType() == HiveParser.TOK_TABNAME :
        "Expected tablename as first child of " + operation() + " but found " + tabName.getName();
    String[] tableName = getQualifiedTableName(tabName);

    // Rewrite the delete or update into an insert.  Crazy, but it works, as deletes and updates
    // actually are inserts into the delta file in Hive.  A delete
    // DELETE FROM _tablename_ [WHERE ...]
    // will be rewritten as
    // INSERT INTO TABLE _tablename_ [PARTITION (_partcols_)] SELECT ROW__ID[,
    // _partcols_] from _tablename_ SORT BY ROW__ID
    // An update
    // UPDATE _tablename_ SET x = _expr_ [WHERE...]
    // will be rewritten as
    // INSERT INTO TABLE _tablename_ [PARTITION (_partcols_)] SELECT ROW__ID, _all_[,
    // _partcols_] from _tablename_ SORT BY ROW__ID
    // where _all_ is all the non-partition columns.  The expressions from the set clause will be
    // re-attached later.
    // The where clause will also be re-attached later.
    // The sort by clause is put in there so that records come out in the right order to enable
    // merge on read.
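    //
    // As a purely illustrative example (hypothetical table and column names), given
    //   acid_tbl (a INT, b STRING) PARTITIONED BY (ds STRING)
    // a delete such as
    //   DELETE FROM acid_tbl WHERE b = 'x'
    // is reparsed as
    //   insert into table acid_tbl partition (ds) select ROW__ID, ds from acid_tbl sort by ROW__ID
    // and an update such as
    //   UPDATE acid_tbl SET b = 'y'
    // is reparsed as
    //   insert into table acid_tbl partition (ds) select ROW__ID, a, b, ds from acid_tbl sort by ROW__ID
    // with the WHERE clause and the expression 'y' grafted back onto the rewritten AST below.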

    StringBuilder rewrittenQueryStr = new StringBuilder();
    Table mTable;
    try {
      mTable = db.getTable(tableName[0], tableName[1]);
    } catch (InvalidTableException e) {
      LOG.error("Failed to find table " + getDotName(tableName) + " got exception "
          + e.getMessage());
      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(getDotName(tableName)), e);
    } catch (HiveException e) {
      LOG.error("Failed to find table " + getDotName(tableName) + " got exception "
          + e.getMessage());
      throw new SemanticException(e.getMessage(), e);
    }

    List<FieldSchema> partCols = mTable.getPartCols();
    List<String> bucketingCols = mTable.getBucketCols();

    rewrittenQueryStr.append("insert into table ");
    rewrittenQueryStr.append(getDotName(tableName));

    // If the table is partitioned we have to put the partition() clause in
    if (partCols != null && partCols.size() > 0) {
      rewrittenQueryStr.append(" partition (");
      boolean first = true;
      for (FieldSchema fschema : partCols) {
        if (first) first = false;
        else rewrittenQueryStr.append(", ");
        rewrittenQueryStr.append(fschema.getName());
      }
      rewrittenQueryStr.append(")");
    }

    rewrittenQueryStr.append(" select ROW__ID");
    Map<Integer, ASTNode> setColExprs = null;
    Map<String, ASTNode> setCols = null;
    // Must be deterministic order set for consistent q-test output across Java versions
    Set<String> setRCols = new LinkedHashSet<String>();
    if (updating()) {
      // An update needs to select all of the columns, as we rewrite the entire row.  Also,
      // we need to figure out which columns we are going to replace.  We won't write the set
      // expressions in the rewritten query.  We'll patch that up later.
      // The set list from update should be the second child (index 1)
      assert children.size() >= 2 : "Expected update token to have at least two children";
      ASTNode setClause = (ASTNode)children.get(1);
      assert setClause.getToken().getType() == HiveParser.TOK_SET_COLUMNS_CLAUSE :
          "Expected second child of update token to be set token";

      // Get the children of the set clause, each of which should be a column assignment
      List<? extends Node> assignments = setClause.getChildren();
      // Must be deterministic order map for consistent q-test output across Java versions
      setCols = new LinkedHashMap<String, ASTNode>(assignments.size());
      setColExprs = new HashMap<Integer, ASTNode>(assignments.size());
      for (Node a : assignments) {
        ASTNode assignment = (ASTNode)a;
        assert assignment.getToken().getType() == HiveParser.EQUAL :
            "Expected set assignments to use equals operator but found " + assignment.getName();
        ASTNode tableOrColTok = (ASTNode)assignment.getChildren().get(0);
        assert tableOrColTok.getToken().getType() == HiveParser.TOK_TABLE_OR_COL :
            "Expected left side of assignment to be table or column";
        ASTNode colName = (ASTNode)tableOrColTok.getChildren().get(0);
        assert colName.getToken().getType() == HiveParser.Identifier :
            "Expected column name";

        addSetRCols((ASTNode) assignment.getChildren().get(1), setRCols);

        String columnName = normalizeColName(colName.getText());

        // Make sure this isn't one of the partitioning columns; updating those is not supported.
        if (partCols != null) {
          for (FieldSchema fschema : partCols) {
            if (fschema.getName().equalsIgnoreCase(columnName)) {
              throw new SemanticException(ErrorMsg.UPDATE_CANNOT_UPDATE_PART_VALUE.getMsg());
            }
          }
        }
        // Updating a bucketing column would move the row from one file to another - not supported
        if (bucketingCols != null && bucketingCols.contains(columnName)) {
          throw new SemanticException(ErrorMsg.UPDATE_CANNOT_UPDATE_BUCKET_VALUE, columnName);
        }
        // This means that in UPDATE T SET x = _something_
        // _something_ can be whatever is supported in SELECT _something_
        setCols.put(columnName, (ASTNode)assignment.getChildren().get(1));
      }

      List<FieldSchema> nonPartCols = mTable.getCols();
      for (int i = 0; i < nonPartCols.size(); i++) {
        rewrittenQueryStr.append(',');
        String name = nonPartCols.get(i).getName();
        ASTNode setCol = setCols.get(name);
        rewrittenQueryStr.append(name);
        if (setCol != null) {
          // This is one of the columns we're setting, record its position so we can come back
          // later and patch it up.
          // Add one to the index because the select has the ROW__ID as the first column.
          setColExprs.put(i + 1, setCol);
        }
      }
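      // Illustrative note (hypothetical table and statement): for a table t(a, b) partitioned
      // by ds and the statement "UPDATE t SET b = 5", the rewritten projection is
      // "ROW__ID, a, b, ds", so setColExprs maps index 2 (the position of b, counting
      // ROW__ID as column 0) to the AST for the expression 5.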
    }

    // If the table is partitioned, we need to select the partition columns as well.
    if (partCols != null) {
      for (FieldSchema fschema : partCols) {
        rewrittenQueryStr.append(", ");
        rewrittenQueryStr.append(fschema.getName());
      }
    }
    rewrittenQueryStr.append(" from ");
    rewrittenQueryStr.append(getDotName(tableName));

    ASTNode where = null;
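    // In the original AST, the optional WHERE clause is the second child for a DELETE
    // (right after the table name) and the third child for an UPDATE (after the SET clause).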
    int whereIndex = deleting() ? 1 : 2;
    if (children.size() > whereIndex) {
      where = (ASTNode)children.get(whereIndex);
      assert where.getToken().getType() == HiveParser.TOK_WHERE :
          "Expected where clause, but found " + where.getName();
    }

    // Add a sort by clause so that the row ids come out in the correct order
    rewrittenQueryStr.append(" sort by ROW__ID ");

    // Parse the rewritten query string
    Context rewrittenCtx;
    try {
      // Set dynamic partitioning to nonstrict so that queries do not need any partition
      // references.
      HiveConf.setVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
      rewrittenCtx = new Context(conf);
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.UPDATEDELETE_IO_ERROR.getMsg());
    }
    rewrittenCtx.setCmd(rewrittenQueryStr.toString());
    rewrittenCtx.setAcidOperation(ctx.getAcidOperation());

    ParseDriver pd = new ParseDriver();
    ASTNode rewrittenTree;
    try {
      LOG.info("Going to reparse " + operation() + " as <" + rewrittenQueryStr.toString() + ">");
      rewrittenTree = pd.parse(rewrittenQueryStr.toString(), rewrittenCtx);
      rewrittenTree = ParseUtils.findRootNonNullToken(rewrittenTree);

    } catch (ParseException e) {
      throw new SemanticException(ErrorMsg.UPDATEDELETE_PARSE_ERROR.getMsg(), e);
    }

    ASTNode rewrittenInsert = (ASTNode)rewrittenTree.getChildren().get(1);
    assert rewrittenInsert.getToken().getType() == HiveParser.TOK_INSERT :
        "Expected TOK_INSERT as second child of TOK_QUERY but found " + rewrittenInsert.getName();

    if (where != null) {
      // The structure of the AST for the rewritten insert statement is:
      // TOK_QUERY -> TOK_FROM
      //          \-> TOK_INSERT -> TOK_INSERT_INTO
      //                        \-> TOK_SELECT
      //                        \-> TOK_SORTBY
      // The following adds the TOK_WHERE and its subtree from the original query as a child of
      // TOK_INSERT, which is where it would have landed if it had been present in the rewritten
      // query string.  We do it this way because it is easier than turning the original AST back
      // into a string and reparsing it.  We have to shift the TOK_SORTBY down by one slot:
      // grab it, append it at the end, and put the TOK_WHERE in its old slot.
      ASTNode sortBy = (ASTNode)rewrittenInsert.getChildren().get(2);
      assert sortBy.getToken().getType() == HiveParser.TOK_SORTBY :
          "Expected TOK_SORTBY to be third child of TOK_INSERT, but found " + sortBy.getName();
      rewrittenInsert.addChild(sortBy);
      rewrittenInsert.setChild(2, where);
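      // The children of TOK_INSERT are now, in order: TOK_INSERT_INTO, TOK_SELECT, TOK_WHERE
      // (carried over from the original statement), TOK_SORTBY.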
    }

    // Patch up the projection list for updates, putting back the original set expressions.
    if (updating() && setColExprs != null) {
      // Walk through the projection list and replace the column names with the
      // expressions from the original update.  Under the TOK_SELECT (see above) the structure
      // looks like:
      // TOK_SELECT -> TOK_SELEXPR -> expr
      //           \-> TOK_SELEXPR -> expr ...
      ASTNode rewrittenSelect = (ASTNode)rewrittenInsert.getChildren().get(1);
      assert rewrittenSelect.getToken().getType() == HiveParser.TOK_SELECT :
          "Expected TOK_SELECT as second child of TOK_INSERT but found " +
              rewrittenSelect.getName();
      for (Map.Entry<Integer, ASTNode> entry : setColExprs.entrySet()) {
        ASTNode selExpr = (ASTNode)rewrittenSelect.getChildren().get(entry.getKey());
        assert selExpr.getToken().getType() == HiveParser.TOK_SELEXPR :
            "Expected child of TOK_SELECT to be TOK_SELEXPR but was " + selExpr.getName();
        // Now, change its child
        selExpr.setChild(0, entry.getValue());
      }
    }

    try {
      useSuper = true;
      super.analyze(rewrittenTree, rewrittenCtx);
    } finally {
      useSuper = false;
    }

    // Walk through all our inputs and set them to note that this read is part of an update or a
    // delete.
    for (ReadEntity input : inputs) {
      input.setUpdateOrDelete(true);
    }

    if (inputIsPartitioned(inputs)) {
      // In order to avoid locking the entire write table we need to replace the single WriteEntity
      // with a WriteEntity for each partition
      outputs.clear();
      for (ReadEntity input : inputs) {
        if (input.getTyp() == Entity.Type.PARTITION) {
          WriteEntity.WriteType writeType = deleting() ? WriteEntity.WriteType.DELETE :
              WriteEntity.WriteType.UPDATE;
          outputs.add(new WriteEntity(input.getPartition(), writeType));
        }
      }
    } else {
      // We still need to patch up the WriteEntities as they will have an insert type.  Change
      // them to the appropriate type for our operation.
      for (WriteEntity output : outputs) {
        output.setWriteType(deleting() ? WriteEntity.WriteType.DELETE :
            WriteEntity.WriteType.UPDATE);
      }
    }

    // For updates, we need to set the column access info so that it contains information on
    // the columns we are updating.
    if (updating()) {
      ColumnAccessInfo cai = new ColumnAccessInfo();
      for (String colName : setCols.keySet()) {
        cai.add(Table.getCompleteName(mTable.getDbName(), mTable.getTableName()), colName);
      }
      setUpdateColumnAccessInfo(cai);

      // Add the setRCols to the input list
      for (String colName : setRCols) {
        if (columnAccessInfo != null) { // a null columnAccessInfo means we are not doing Auth
          columnAccessInfo.add(Table.getCompleteName(mTable.getDbName(), mTable.getTableName()),
            colName);
        }
      }
    }

    // We need to weed ROW__ID out of the input column info, as it doesn't make any sense to
    // require the user to have authorization on that column.
    if (columnAccessInfo != null) {
      columnAccessInfo.stripVirtualColumn(VirtualColumn.ROWID);
    }
  }

  private String operation() {
    if (updating()) return "update";
    else if (deleting()) return "delete";
    else throw new IllegalStateException("UpdateDeleteSemanticAnalyzer neither updating nor " +
          "deleting, operation not known.");
  }

  private boolean inputIsPartitioned(Set<ReadEntity> inputs) {
    // We cannot simply look at the first entry, as in the case where the input is partitioned
    // there will be a table entry as well.  So look for at least one partition entry.
    for (ReadEntity re : inputs) {
      if (re.getTyp() == Entity.Type.PARTITION) {
        return true;
      }
    }
    return false;
  }

  // This method finds any columns on the right side of a set statement (thus rcols) and puts them
  // in a set so we can add them to the list of input cols to check.
  private void addSetRCols(ASTNode node, Set<String> setRCols) {

    // See if this node is a TOK_TABLE_OR_COL.  If so, find the value and put it in the set.  If
    // not, recurse on any children.
    if (node.getToken().getType() == HiveParser.TOK_TABLE_OR_COL) {
      ASTNode colName = (ASTNode)node.getChildren().get(0);
      assert colName.getToken().getType() == HiveParser.Identifier :
          "Expected column name";
      setRCols.add(normalizeColName(colName.getText()));
    } else if (node.getChildren() != null) {
      for (Node n : node.getChildren()) {
        addSetRCols((ASTNode)n, setRCols);
      }
    }
  }

  /**
   * Column names are stored in metastore in lower case, regardless of the CREATE TABLE statement.
   * Unfortunately there is no single place that normalizes the input query.
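   * For example, a column declared as {@code FirstName} is stored, and must be matched, as
   * {@code firstname}.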
   * @param colName not null
   */
  private static String normalizeColName(String colName) {
    return colName.toLowerCase();
  }
}



