All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hadoop.hive.ql.exec.ReplCopyTask Maven / Gradle / Ivy

There is a newer version: 4.0.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import org.apache.hadoop.hive.ql.parse.EximUtil;
import org.apache.hadoop.hive.ql.parse.ReplicationSpec;
import org.apache.hadoop.hive.ql.plan.CopyWork;
import org.apache.hadoop.hive.ql.plan.ReplCopyWork;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.parse.LoadSemanticAnalyzer;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.util.StringUtils;

/**
 * Copy task used by replication dump/load.
 *
 * <p>Besides copying the files matched by the source path, the task can
 * (a) expand a {@code _files} listing found under the source path and copy the
 * files named in it, and (b) write a {@code _files} listing at the destination
 * that records source file URIs instead of copying them. Which of these happens
 * is driven by the flags on the {@link ReplCopyWork} this task executes.
 *
 * <p>NOTE(review): generic type arguments on {@code Task} (class header and the
 * static factory return types) look like they were lost in this copy of the
 * file - {@code work} is used below as a copy-work object without a cast.
 * Confirm against the upstream declaration before compiling.
 */
public class ReplCopyTask extends Task implements Serializable {


  private static final long serialVersionUID = 1L;

  private static final Logger LOG = LoggerFactory.getLogger(ReplCopyTask.class);

  public ReplCopyTask(){
    super();
  }

  /**
   * Runs the copy described by {@code work}.
   *
   * @param driverContext not used by this implementation
   * @return 0 on success (including "nothing to copy" when the work does not
   *         treat an empty source as an error); 1 if a file copy fails or any
   *         exception is thrown; 2 if the target directory cannot be created;
   *         3 if no files match the source path; 4 if a {@code _files} listing
   *         already exists at the destination; 5 if a {@code _files} listing was
   *         expected on the source but is missing
   */
  @Override
  protected int execute(DriverContext driverContext) {
    LOG.debug("ReplCopyTask.execute()");
    try {
      Path fromPath = work.getFromPath();
      Path toPath = work.getToPath();

      console.printInfo("Copying data from " + fromPath.toString(), " to "
          + toPath.toString());

      ReplCopyWork rwork = ((ReplCopyWork)work);

      FileSystem srcFs = fromPath.getFileSystem(conf);
      FileSystem dstFs = toPath.getFileSystem(conf);

      List<FileStatus> srcFiles = new ArrayList<>();
      FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(srcFs, fromPath);
      LOG.debug("ReplCopyTasks srcs=" + (srcs == null ? "null" : srcs.length));
      if (!rwork.getReadListFromInput()) {
        if (srcs == null || srcs.length == 0) {
          if (work.isErrorOnSrcEmpty()) {
            console.printError("No files matching path: " + fromPath.toString());
            return 3;
          } else {
            return 0;
          }
        }
      } else {
        LOG.debug("ReplCopyTask making sense of _files");
        // Our input is probably the result of a _files listing, we should expand out _files.
        srcFiles = filesInFileListing(srcFs, fromPath);
        LOG.debug("ReplCopyTask _files contains:" + (srcFiles == null ? "null" : srcFiles.size()));
        if (srcFiles == null) {
          if (work.isErrorOnSrcEmpty()) {
            console.printError("No _files entry found on source: " + fromPath.toString());
            return 5;
          } else {
            return 0;
          }
        }
      }
      // Add in all the lone filecopies expected as well - applies to
      // both _files case stragglers and regular copies.
      // srcs can still be null here on the _files path (the empty-source check
      // above only runs for regular copies), and Arrays.asList(null) would throw.
      if (srcs != null) {
        srcFiles.addAll(Arrays.asList(srcs));
      }
      LOG.debug("ReplCopyTask numFiles:" + srcFiles.size());

      boolean inheritPerms = conf.getBoolVar(HiveConf.ConfVars.HIVE_WAREHOUSE_SUBDIR_INHERIT_PERMS);
      if (!FileUtils.mkdir(dstFs, toPath, inheritPerms, conf)) {
        console.printError("Cannot make target directory: " + toPath.toString());
        return 2;
      }

      BufferedWriter listBW = null;
      try {
        if (rwork.getListFilesOnOutputBehaviour()) {
          Path listPath = new Path(toPath, EximUtil.FILES_NAME);
          LOG.debug("ReplCopyTask : generating _files at :" + listPath.toUri().toString());
          if (dstFs.exists(listPath)) {
            console.printError("Cannot make target _files file:" + listPath.toString());
            return 4;
          }
          listBW = new BufferedWriter(new OutputStreamWriter(dstFs.create(listPath)));
          // TODO : verify that not specifying charset here does not bite us
          // later(for cases where filenames have unicode chars)
        }

        for (FileStatus oneSrc : srcFiles) {
          console.printInfo("Copying file: " + oneSrc.getPath().toString());
          LOG.debug("Copying file: " + oneSrc.getPath().toString());
          if (!rwork.getListFilesOnOutputBehaviour(oneSrc)) {
            FileSystem actualSrcFs;
            if (rwork.getReadListFromInput()) {
              // TODO : filesystemcache prevents this from being a perf nightmare, but we
              // should still probably follow up to see if we need to do something better here.
              actualSrcFs = oneSrc.getPath().getFileSystem(conf);
            } else {
              actualSrcFs = srcFs;
            }

            LOG.debug("ReplCopyTask :cp:" + oneSrc.getPath() + "=>" + toPath);
            if (!FileUtils.copy(actualSrcFs, oneSrc.getPath(), dstFs, toPath,
                false, // delete source
                true, // overwrite destination
                conf)) {
              console.printError("Failed to copy: '" + oneSrc.getPath().toString()
                  + "' to: '" + toPath.toString() + "'");
              return 1;
            }
          } else {
            LOG.debug("ReplCopyTask _files now tracks:" + oneSrc.getPath().toUri());
            console.printInfo("Tracking file: " + oneSrc.getPath().toUri());
            listBW.write(oneSrc.getPath().toUri().toString() + "\n");
          }
        }
      } finally {
        // Close on every exit path (early return or exception), not only on
        // success, so the _files output stream is never leaked.
        if (listBW != null) {
          listBW.close();
        }
      }

      return 0;

    } catch (Exception e) {
      console.printError("Failed with exception " + e.getMessage(), "\n"
          + StringUtils.stringifyException(e));
      return (1);
    }
  }


  /**
   * Reads the {@code _files} listing under {@code path} and resolves each line
   * to the status of the file it names.
   *
   * @param fs filesystem on which the listing itself is looked up and opened
   * @param path directory expected to contain the listing
   * @return the statuses of the listed files (possibly empty), or {@code null}
   *         if no listing exists under {@code path} - callers treat null as an
   *         error condition
   * @throws IOException if the listing cannot be read or a listed file cannot
   *         be resolved
   */
  private List<FileStatus> filesInFileListing(FileSystem fs, Path path)
      throws IOException {
    Path fileListing = new Path(path, EximUtil.FILES_NAME);
    LOG.debug("ReplCopyTask filesInFileListing() reading " + fileListing.toUri());
    if (!fs.exists(fileListing)) {
      LOG.debug("ReplCopyTask : _files does not exist");
      return null; // Returning null from this fn can serve as an err condition.
      // On success, but with nothing to return, we can return an empty list.
    }

    List<FileStatus> ret = new ArrayList<>();
    // try-with-resources so the listing stream is closed even if a listed file
    // fails to resolve part-way through.
    // TODO : verify if skipping charset here is okay
    try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(fileListing)))) {
      String line;
      while ((line = br.readLine()) != null) {
        LOG.debug("ReplCopyTask :_filesReadLine:" + line);
        String fileUriStr = EximUtil.getCMDecodedFileName(line);
        // TODO HIVE-15490: Add checksum validation here
        Path p = new Path(fileUriStr);
        // TODO: again, fs cache should make this okay, but if not, revisit
        FileSystem srcFs = p.getFileSystem(conf);
        ret.add(srcFs.getFileStatus(p));
        // Note - we need srcFs rather than fs, because it is possible that the _files lists files
        // which are from a different filesystem than the fs where the _files file itself was loaded
        // from. Currently, it is possible, for eg., to do REPL LOAD hdfs:///dir/ and for the _files
        // in it to contain hdfs:/// entries, and/or vice-versa, and this causes errors.
        // It might also be possible that there will be a mix of them in a given _files file.
        // TODO: revisit close to the end of replv2 dev, to see if our assumption now still holds,
        // and if not so, optimize.
      }
    }

    return ret;
  }

  /** Reports this task as a {@link StageType#COPY} stage. */
  @Override
  public StageType getType() {
    return StageType.COPY;
    // there's no extensive need for this to have its own type - it mirrors
    // the intent of copy enough. This might change later, though.
  }

  @Override
  public String getName() {
    return "REPL_COPY";
  }

  /**
   * Builds the copy task used on the load side.
   *
   * <p>Inside replication scope a {@link ReplCopyWork} is created and, for a
   * lazy replication spec, told to read its file list from the input
   * ({@code _files}). Outside replication scope a plain {@link CopyWork} is used.
   *
   * @param replicationSpec decides between replication-aware and plain copy
   * @param srcPath source path of the copy
   * @param dstPath destination path of the copy
   * @param conf configuration handed to {@link TaskFactory}
   * @return the task produced by {@link TaskFactory} for the chosen work
   */
  public static Task getLoadCopyTask(ReplicationSpec replicationSpec, Path srcPath, Path dstPath, HiveConf conf) {
    Task copyTask = null;
    LOG.debug("ReplCopyTask:getLoadCopyTask: "+srcPath + "=>" + dstPath);
    if (replicationSpec.isInReplicationScope()){
      ReplCopyWork rcwork = new ReplCopyWork(srcPath, dstPath, false);
      LOG.debug("ReplCopyTask:\trcwork");
      if (replicationSpec.isLazy()){
        LOG.debug("ReplCopyTask:\tlazy");
        rcwork.setReadListFromInput(true);
      }
      copyTask = TaskFactory.get(rcwork, conf);
    } else {
      LOG.debug("ReplCopyTask:\tcwork");
      copyTask = TaskFactory.get(new CopyWork(srcPath, dstPath, false), conf);
    }
    return copyTask;
  }

  /**
   * Builds the copy task used on the dump side.
   *
   * <p>Inside replication scope a {@link ReplCopyWork} is created and, for a
   * lazy replication spec, told to list files on output (write {@code _files})
   * rather than copy them. Outside replication scope a plain {@link CopyWork}
   * is used.
   *
   * @param replicationSpec decides between replication-aware and plain copy
   * @param srcPath source path of the copy
   * @param dstPath destination path of the copy
   * @param conf configuration handed to {@link TaskFactory}
   * @return the task produced by {@link TaskFactory} for the chosen work
   */
  public static Task getDumpCopyTask(ReplicationSpec replicationSpec, Path srcPath, Path dstPath, HiveConf conf) {
    Task copyTask = null;
    LOG.debug("ReplCopyTask:getDumpCopyTask: "+srcPath + "=>" + dstPath);
    if (replicationSpec.isInReplicationScope()){
      ReplCopyWork rcwork = new ReplCopyWork(srcPath, dstPath, false);
      LOG.debug("ReplCopyTask:\trcwork");
      if (replicationSpec.isLazy()){
        LOG.debug("ReplCopyTask:\tlazy");
        rcwork.setListFilesOnOutputBehaviour(true);
      }
      copyTask = TaskFactory.get(rcwork, conf);
    } else {
      LOG.debug("ReplCopyTask:\tcwork");
      copyTask = TaskFactory.get(new CopyWork(srcPath, dstPath, false), conf);
    }
    return copyTask;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy