/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.zeppelin.submarine.hadoop;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.zeppelin.conf.ZeppelinConfiguration;
import org.apache.zeppelin.interpreter.thrift.ParagraphInfo;
import org.apache.zeppelin.submarine.commons.SubmarineConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ByteArrayOutputStream;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Hadoop FileSystem wrapper. Supports both secure (Kerberos) and non-secure mode.
*/
public class HdfsClient {
private static Logger LOGGER = LoggerFactory.getLogger(HdfsClient.class);
private ZeppelinConfiguration zConf = ZeppelinConfiguration.create();
private Configuration hadoopConf;
private boolean isSecurityEnabled;
private FileSystem fs;
private static Pattern REPL_PATTERN =
Pattern.compile("(\\s*)%([\\w\\.]+)(\\(.*?\\))?.*", Pattern.DOTALL);
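// For illustration (the example input is hypothetical, not from the original source): for a
// paragraph such as "  %spark.pyspark(pool=dev) print('hi')", REPL_PATTERN captures
// group(1) = "  " (leading whitespace), group(2) = "spark.pyspark" (interpreter name) and
// group(3) = "(pool=dev)" (optional local properties); parseText() below uses these groups
// to strip the directive and return only the script body.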
public HdfsClient(Properties properties) {
String krb5conf = properties.getProperty(SubmarineConstants.SUBMARINE_HADOOP_KRB5_CONF, "");
if (!StringUtils.isEmpty(krb5conf)) {
System.setProperty("java.security.krb5.conf", krb5conf);
}
this.hadoopConf = new Configuration();
// disable checksum for the local file system, because interpreter.json may be updated by a
// non-Hadoop filesystem API
// disable caching for file:// scheme to avoid getting LocalFS which does CRC checks
// this.hadoopConf.setBoolean("fs.file.impl.disable.cache", true);
this.hadoopConf.set("fs.file.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
// UserGroupInformation.setConfiguration(hadoopConf);
this.isSecurityEnabled = UserGroupInformation.isSecurityEnabled();
if (isSecurityEnabled) {
String keytab = properties.getProperty(
SubmarineConstants.SUBMARINE_HADOOP_KEYTAB, "");
String principal = properties.getProperty(
SubmarineConstants.SUBMARINE_HADOOP_PRINCIPAL, "");
if (StringUtils.isEmpty(keytab)) {
keytab = zConf.getString(
ZeppelinConfiguration.ConfVars.ZEPPELIN_SERVER_KERBEROS_KEYTAB);
}
if (StringUtils.isEmpty(principal)) {
principal = zConf.getString(
ZeppelinConfiguration.ConfVars.ZEPPELIN_SERVER_KERBEROS_PRINCIPAL);
}
if (StringUtils.isBlank(keytab) || StringUtils.isBlank(principal)) {
throw new RuntimeException("keytab and principal can not be empty, keytab: " + keytab
+ ", principal: " + principal);
}
try {
UserGroupInformation.loginUserFromKeytab(principal, keytab);
} catch (IOException e) {
throw new RuntimeException("Fail to login via keytab:" + keytab +
", principal:" + principal, e);
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
}
}
try {
this.fs = FileSystem.get(new URI("/"), this.hadoopConf);
} catch (IOException e) {
LOGGER.error(e.getMessage(), e);
} catch (URISyntaxException e) {
LOGGER.error(e.getMessage(), e);
}
}
public FileSystem getFs() {
return fs;
}
public Path makeQualified(Path path) {
return fs.makeQualified(path);
}
public boolean exists(final Path path) throws IOException {
return callHdfsOperation(new HdfsOperation<Boolean>() {
@Override
public Boolean call() throws IOException {
return fs.exists(path);
}
});
}
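/**
 * Ensures {@code dir} exists, creating it if necessary; throws an IOException if the
 * path already exists but is a regular file rather than a directory.
 */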
public void tryMkDir(final Path dir) throws IOException {
callHdfsOperation(new HdfsOperation<Void>() {
@Override
public Void call() throws IOException {
if (!fs.exists(dir)) {
fs.mkdirs(dir);
LOGGER.info("Create dir {} in hdfs", dir.toString());
}
if (fs.isFile(dir)) {
throw new IOException(dir.toString() + " is file instead of directory, please remove " +
"it or specify another directory");
}
fs.mkdirs(dir);
return null;
}
});
}
public List<Path> list(final Path path) throws IOException {
return callHdfsOperation(new HdfsOperation<List<Path>>() {
@Override
public List<Path> call() throws IOException {
List<Path> paths = new ArrayList<>();
for (FileStatus status : fs.globStatus(path)) {
paths.add(status.getPath());
}
return paths;
}
});
}
// recursive search path, (list folder in sub folder on demand, instead of load all
// data when zeppelin server start)
public List<Path> listAll(final Path path) throws IOException {
return callHdfsOperation(new HdfsOperation<List<Path>>() {
@Override
public List<Path> call() throws IOException {
List<Path> paths = new ArrayList<>();
collectNoteFiles(path, paths);
return paths;
}
private void collectNoteFiles(Path folder, List<Path> noteFiles) throws IOException {
FileStatus[] paths = fs.listStatus(folder);
for (FileStatus path : paths) {
if (path.isDirectory()) {
collectNoteFiles(path.getPath(), noteFiles);
} else {
if (path.getPath().getName().endsWith(".zpln")) {
noteFiles.add(path.getPath());
} else {
LOGGER.warn("Unknown file: " + path.getPath());
}
}
}
}
});
}
public boolean delete(final Path path) throws IOException {
return callHdfsOperation(new HdfsOperation<Boolean>() {
@Override
public Boolean call() throws IOException {
return fs.delete(path, true);
}
});
}
public String readFile(final Path file) throws IOException {
return callHdfsOperation(new HdfsOperation<String>() {
@Override
public String call() throws IOException {
LOGGER.debug("Read from file: " + file);
ByteArrayOutputStream noteBytes = new ByteArrayOutputStream();
IOUtils.copyBytes(fs.open(file), noteBytes, hadoopConf);
return noteBytes.toString(
zConf.getString(ZeppelinConfiguration.ConfVars.ZEPPELIN_ENCODING));
}
});
}
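/**
 * Writes {@code content} to {@code file} using the configured Zeppelin encoding. The data
 * is first written to a sibling ".tmp" file which is then renamed over the target, so a
 * partially written file is never left at the final path.
 */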
public void writeFile(final String content, final Path file)
throws IOException {
callHdfsOperation(new HdfsOperation<Void>() {
@Override
public Void call() throws IOException {
InputStream in = new ByteArrayInputStream(content.getBytes(
zConf.getString(ZeppelinConfiguration.ConfVars.ZEPPELIN_ENCODING)));
Path tmpFile = new Path(file.toString() + ".tmp");
IOUtils.copyBytes(in, fs.create(tmpFile), hadoopConf);
fs.delete(file, true);
fs.rename(tmpFile, file);
return null;
}
});
}
public void move(Path src, Path dest) throws IOException {
callHdfsOperation(() -> {
fs.rename(src, dest);
return null;
});
}
private interface HdfsOperation<T> {
T call() throws IOException;
}
private synchronized <T> T callHdfsOperation(final HdfsOperation<T> func) throws IOException {
if (isSecurityEnabled) {
try {
return UserGroupInformation.getCurrentUser().doAs(new PrivilegedExceptionAction<T>() {
@Override
public T run() throws Exception {
return func.call();
}
});
} catch (InterruptedException e) {
throw new IOException(e);
}
} else {
return func.call();
}
}
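/**
 * Strips a leading "%interpreter(key=value,...)" directive (matched by REPL_PATTERN above)
 * from a paragraph and returns only the script body; text without a directive is returned
 * trimmed as-is.
 */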
public String parseText(String text) {
String script = "", intpText = "";
// parse text to get interpreter component
if (text != null) {
Matcher matcher = REPL_PATTERN.matcher(text);
if (matcher.matches()) {
String headingSpace = matcher.group(1);
intpText = matcher.group(2);
if (matcher.groupCount() == 3 && matcher.group(3) != null) {
String localPropertiesText = matcher.group(3);
String[] splits = localPropertiesText.substring(1, localPropertiesText.length() - 1)
.split(",");
for (String split : splits) {
String[] kv = split.split("=");
if (StringUtils.isBlank(split) || kv.length == 0) {
continue;
}
if (kv.length > 2) {
throw new RuntimeException("Invalid paragraph properties format: " + split);
}
}
script = text.substring(headingSpace.length() + intpText.length() +
localPropertiesText.length() + 1).trim();
} else {
script = text.substring(headingSpace.length() + intpText.length() + 1).trim();
}
} else {
script = text.trim();
}
}
return script;
}
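/**
 * Groups the given paragraphs by title, merges the parsed scripts of paragraphs sharing a
 * title, and writes one file per title under {@code dirName/noteId} locally and/or
 * {@code hdfsUploadPath/noteId} in HDFS. Paragraphs with an empty title are skipped and
 * reported in the returned message.
 */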
public String saveParagraphToFiles(String noteId, List<ParagraphInfo> paragraphInfos,
String dirName, Properties properties)
throws Exception {
StringBuffer outputMsg = new StringBuffer();
String hdfsUploadPath = properties.getProperty(
SubmarineConstants.SUBMARINE_ALGORITHM_HDFS_PATH, "");
HashMap<String, StringBuffer> mapParagraph = new HashMap<>();
for (int i = 0; i < paragraphInfos.size(); i++) {
ParagraphInfo paragraph = paragraphInfos.get(i);
String paragraphTitle = paragraph.getParagraphTitle();
if (StringUtils.isEmpty(paragraphTitle)) {
String message = "WARN: The title of paragraph [" + i
+ "] is empty, so it was not submitted to HDFS.\n";
LOGGER.warn(message);
outputMsg.append(message);
continue;
}
if (!mapParagraph.containsKey(paragraphTitle)) {
StringBuffer mergeScript = new StringBuffer();
mapParagraph.put(paragraphTitle, mergeScript);
}
StringBuffer mergeScript = mapParagraph.get(paragraphTitle);
String paragraphText = paragraph.getParagraphText();
String text = parseText(paragraphText);
mergeScript.append(text + "\n\n");
}
// Clear all files in the local noteId directory
if (!StringUtils.isEmpty(dirName)) {
String noteDir = dirName + "/" + noteId;
File fileNoteDir = new File(noteDir);
if (fileNoteDir.exists()) {
// File.delete() cannot remove a non-empty directory, so remove its contents first
File[] children = fileNoteDir.listFiles();
if (children != null) {
for (File child : children) {
child.delete();
}
}
fileNoteDir.delete();
}
fileNoteDir.mkdirs();
}
// Clear all files in the noteid directory in HDFS
if (!StringUtils.isEmpty(hdfsUploadPath)) {
Path hdfsPath = new Path(hdfsUploadPath + "/" + noteId);
try {
if (exists(hdfsPath)) {
delete(hdfsPath);
}
tryMkDir(hdfsPath);
} catch (IOException e) {
LOGGER.error(e.getMessage(), e);
throw new Exception(e);
}
}
for (Map.Entry<String, StringBuffer> entry : mapParagraph.entrySet()) {
try {
String fileName = entry.getKey();
String fileContext = entry.getValue().toString();
String paragraphFile = dirName + "/" + noteId + "/" + fileName;
// save to local file
if (!StringUtils.isEmpty(dirName)) {
File fileParagraph = new File(paragraphFile);
if (!fileParagraph.exists()) {
fileParagraph.createNewFile();
}
// try-with-resources ensures the writer is closed even if the write fails
try (FileWriter writer = new FileWriter(paragraphFile)) {
writer.write(fileContext);
}
}
// save to hdfs
if (!StringUtils.isEmpty(hdfsUploadPath)) {
String fileDir = hdfsUploadPath + "/" + noteId + "/" + fileName;
// upload algorithm file
LOGGER.info("Commit algorithm to HDFS: {}", fileDir);
Path filePath = new Path(fileDir);
writeFile(fileContext, filePath);
}
} catch (IOException e) {
LOGGER.error(e.getMessage(), e);
throw new Exception(e);
}
}
return outputMsg.toString();
}
}
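// A minimal usage sketch (illustrative only; the paths, principal and keytab below are
// assumptions, not values taken from this class):
//
//   Properties props = new Properties();
//   props.setProperty(SubmarineConstants.SUBMARINE_HADOOP_KEYTAB, "/etc/security/zeppelin.keytab");
//   props.setProperty(SubmarineConstants.SUBMARINE_HADOOP_PRINCIPAL, "zeppelin@EXAMPLE.COM");
//   HdfsClient client = new HdfsClient(props);
//   Path dir = client.makeQualified(new Path("/user/zeppelin/submarine"));
//   client.tryMkDir(dir);
//   client.writeFile("print('hello submarine')", new Path(dir, "job.py"));
//   System.out.println(client.readFile(new Path(dir, "job.py")));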