All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.tencent.angel.utils.HdfsUtil Maven / Gradle / Ivy

/*
 * Tencent is pleased to support the open source community by making Angel available.
 *
 * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 
 * compliance with the License. You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */


package com.tencent.angel.utils;

import com.tencent.angel.conf.AngelConf;
import com.tencent.angel.ml.predict.PredictResult;
import com.tencent.angel.worker.storage.DataBlock;
import com.tencent.angel.worker.task.TaskContext;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * HDFS operation(like list,copy etcs) utils.
 */
public class HdfsUtil {
  private static final Log LOG = LogFactory.getLog(HdfsUtil.class);
  public static final String INPUT_DIR = "mapreduce.input.fileinputformat.inputdir";
  public static final String SPLIT_MAXSIZE = "mapreduce.input.fileinputformat.split.maxsize";
  public static final String SPLIT_MINSIZE = "mapreduce.input.fileinputformat.split.minsize";
  public static final String PATHFILTER_CLASS = "mapreduce.input.pathFilter.class";
  public static final String NUM_INPUT_FILES = "mapreduce.input.fileinputformat.numinputfiles";
  public static final String INPUT_DIR_RECURSIVE =
    "mapreduce.input.fileinputformat.input.dir.recursive";


  private static class MultiPathFilter implements PathFilter {
    private List filters;

    public MultiPathFilter(List filters) {
      this.filters = filters;
    }

    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (!filter.accept(path)) {
          return false;
        }
      }
      return true;
    }
  }


  private static final PathFilter hiddenFileFilter = new PathFilter() {
    public boolean accept(Path p) {
      String name = p.getName();
      return !name.startsWith("_") && !name.startsWith(".");
    }
  };
  private static String tmpPrefix = "_tmp.";
  private static String finalPrefix = "_final.";
  private static String pathSep = "/";

  public static Path[] getInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get(INPUT_DIR, "");
    // LOG.info(System.getProperty("user.dir"));
    LOG.info("dirs=" + dirs);
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
  }

  public static boolean getInputDirRecursive(JobContext job) {
    return job.getConfiguration().getBoolean(INPUT_DIR_RECURSIVE, false);
  }

  public static PathFilter getInputPathFilter(JobContext context) {
    Configuration conf = context.getConfiguration();
    Class filterClass = conf.getClass(PATHFILTER_CLASS, null, PathFilter.class);
    return (filterClass != null) ?
      (PathFilter) ReflectionUtils.newInstance(filterClass, conf) :
      null;
  }

  /**
   * List input directories. Subclasses may override to, e.g., select only files matching a regular
   * expression.
   *
   * @param job the job to list input paths for
   * @return array of FileStatus objects
   * @throws IOException if zero items.
   */
  protected static List listStatus(JobContext job) throws IOException {
    List result = new ArrayList();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursive look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    List errors = new ArrayList();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List filters = new ArrayList();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
      filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
      LOG.info("dirs[" + i + "]=" + dirs[i]);
      Path p = dirs[i];
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDirectory()) {
            RemoteIterator iter = fs.listLocatedStatus(globStat.getPath());
            while (iter.hasNext()) {
              LocatedFileStatus stat = iter.next();
              if (inputFilter.accept(stat.getPath())) {
                if (recursive && stat.isDirectory()) {
                  addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                  result.add(stat);
                }
              }
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    // LOG.info("Total input paths to process : " + result.size());
    return result;
  }

  protected static void addInputPathRecursively(List result, FileSystem fs, Path path,
    PathFilter inputFilter) throws IOException {
    RemoteIterator iter = fs.listLocatedStatus(path);
    while (iter.hasNext()) {
      LocatedFileStatus stat = iter.next();
      if (inputFilter.accept(stat.getPath())) {
        if (stat.isDirectory()) {
          addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
        } else {
          result.add(stat);
        }
      }
    }
  }

  public static long getInputFileTotalSize(JobContext job) throws IOException {
    long ret = 0;
    LOG.info("before getInputFileTotalSize");
    List fss = listStatus(job);
    if (fss != null) {
      for (FileStatus fs : fss) {
        ret += fs.getLen();
      }
    }
    return ret;
  }

  public static Path[] getInputPaths(JobConf context) {
    String dirs = context.get(INPUT_DIR, "");
    LOG.info("dirs=" + dirs);
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
  }

  public static boolean getInputDirRecursive(JobConf job) {
    return job.getBoolean(INPUT_DIR_RECURSIVE, false);
  }

  public static PathFilter getInputPathFilter(JobConf context) {
    Configuration conf = context;
    Class filterClass = conf.getClass(PATHFILTER_CLASS, null, PathFilter.class);
    return (filterClass != null) ?
      (PathFilter) ReflectionUtils.newInstance(filterClass, conf) :
      null;
  }

  /**
   * List input directories. Subclasses may override to, e.g., select only files matching a regular
   * expression.
   *
   * @param job the job to list input paths for
   * @return array of FileStatus objects
   * @throws IOException if zero items.
   */
  protected static List listStatus(JobConf job) throws IOException {
    List result = new ArrayList();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job);

    // Whether we need to recursive look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    List errors = new ArrayList();

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List filters = new ArrayList();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
      filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (int i = 0; i < dirs.length; ++i) {
      LOG.info("dirs[" + i + "]=" + dirs[i]);
      Path p = dirs[i];
      FileSystem fs = p.getFileSystem(job);
      FileStatus[] matches = fs.globStatus(p, inputFilter);
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDirectory()) {
            RemoteIterator iter = fs.listLocatedStatus(globStat.getPath());
            while (iter.hasNext()) {
              LocatedFileStatus stat = iter.next();
              if (inputFilter.accept(stat.getPath())) {
                if (recursive && stat.isDirectory()) {
                  addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                } else {
                  result.add(stat);
                }
              }
            }
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    // LOG.info("Total input paths to process : " + result.size());
    return result;
  }

  public static long getInputFileTotalSize(JobConf jobConf) throws IOException {
    long ret = 0;
    LOG.info("before getInputFileTotalSize");
    List fss = listStatus(jobConf);
    if (fss != null) {
      for (FileStatus fs : fss) {
        ret += fs.getLen();
      }
    }
    return ret;
  }

  public static Path toTmpPath(Path path) {
    return new Path(path.getParent(), tmpPrefix + path.getName());
  }

  public static void copy(Path tmpDestFile, Path destFile) {

  }

  // a simple hdfs copy function assume src path and dest path are in same hdfs
  // and FileSystem object has same schema
  static public void copyFilesInSameHdfs(Path srcf, Path destf, FileSystem fs) throws IOException {
    if (!fs.exists(destf))
      fs.mkdirs(destf);

    FileStatus[] srcs = fs.globStatus(srcf);
    if (srcs == null) {
      return;
    }

    for (int i = 0; i < srcs.length; i++) {
      copyDir(srcs[i].getPath(), destf, fs);
    }
  }

  public static boolean isTmpPath(String name) {
    return name.startsWith(tmpPrefix);
  }

  @SuppressWarnings("deprecation") private static void copyDir(Path srcf, Path destf, FileSystem fs)
    throws IOException {
    FileStatus[] items = fs.listStatus(srcf);
    for (int i = 0; i < items.length; i++) {
      if (items[i].isDir()) {
        Path destPath = new Path(destf, items[i].getPath().getName());

        if (!fs.exists(destPath)) {
          fs.mkdirs(destPath);
        }
        copyDir(items[i].getPath(), destPath, fs);
      } else {
        if (isTmpPath(items[i].getPath().getName())) {
          continue;
        }
        if (!fs.rename(items[i].getPath(), new Path(destf, items[i].getPath().getName()))) {
          throw new IOException(
            "rename from " + items[i].getPath() + " to " + destf + "/" + items[i].getPath()
              .getName() + " failed");
        }
      }
    }
  }

  public static Path toFinalPath(Path path) {
    return new Path(path.getParent(), finalPrefix + path.getName());
  }

  public static void rename(Path tmpCombinePath, Path outputPath, FileSystem fs)
    throws IOException {
    // If out path exist , just remove it first
    if (fs.exists(outputPath)) {
      fs.delete(outputPath, true);
    }

    // Create parent directory if not exist
    if(!fs.exists(outputPath.getParent())) {
      fs.mkdirs(outputPath.getParent());
    }

    // Rename
    if (!fs.rename(tmpCombinePath, outputPath)) {
      throw new IOException("rename from " + tmpCombinePath + " to " + outputPath + " failed");
    }
  }

  public static Path generateTmpDirectory(Configuration conf, String appId, Path outputPath) {
    URI uri = outputPath.toUri();
    String path =
      (uri.getScheme() != null ? uri.getScheme() : "hdfs") + "://" + (uri.getHost() != null ?
        uri.getHost() :
        "") + (uri.getPort() > 0 ? (":" + uri.getPort()) : "");
    String user = conf.get(AngelConf.USER_NAME, "");
    String tmpDir = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH_PREFIX, "/tmp/" + user);
    String finalTmpDirForApp = path + tmpDir + "/" + appId + "_" + UUID.randomUUID().toString();
    LOG.info("tmp output dir is " + finalTmpDirForApp);
    return new Path(finalTmpDirForApp);
  }

  public static void writeStorage(DataBlock dataBlock, TaskContext taskContext)
    throws IOException {
    String outDir = taskContext.getConf().get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
    Path outPath = new Path(outDir, "predict");
    FileSystem fs = outPath.getFileSystem(taskContext.getConf());
    String outFileName = "task_" + taskContext.getTaskIndex();
    String tmpOutFileName = tmpPrefix + outFileName;

    Path outFilePath = new Path(outPath, outFileName);
    Path tmpOutFilePath = new Path(outPath, tmpOutFileName);
    if (fs.exists(tmpOutFilePath)) {
      fs.delete(tmpOutFilePath, true);
    }

    FSDataOutputStream output = fs.create(tmpOutFilePath);
    LOG.info("tmpOutFilePath=" + tmpOutFilePath);
    dataBlock.resetReadIndex();
    PredictResult resultItem = null;
    boolean isFirstRow = true;
    while (true) {
      resultItem = dataBlock.read();
      if (resultItem == null) {
        break;
      }

      output.writeBytes(resultItem.getText() + "\n");
    }

    output.close();

    rename(tmpOutFilePath, outFilePath, fs);
    LOG.info("rename " + tmpOutFilePath + " to " + outFilePath);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy