/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.shims;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import javax.security.auth.login.LoginException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobProfile;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Progressable;

/**
 * In order to be compatible with multiple versions of Hadoop, all parts
 * of the Hadoop interface that are not cross-version compatible are
 * encapsulated in an implementation of this class. Users should use
 * the ShimLoader class as a factory to obtain an implementation of
 * HadoopShims corresponding to the version of Hadoop currently on the
 * classpath.
 */
public interface HadoopShims {

  static final Log LOG = LogFactory.getLog(HadoopShims.class);
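
  /*
   * Illustrative note (not part of the interface contract): callers obtain the
   * version-specific implementation through ShimLoader rather than constructing
   * one directly, roughly like this (fs and path are hypothetical):
   *
   *   HadoopShims shims = ShimLoader.getHadoopShims();
   *   long blockSize = shims.getDefaultBlockSize(fs, path);
   */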

  /**
   * Constructs and returns the task attempt log URL,
   * or null if the TaskLogServlet is not available.
   *
   * @return the task attempt log URL
   */
  String getTaskAttemptLogUrl(JobConf conf,
      String taskTrackerHttpAddress,
      String taskAttemptId)
          throws MalformedURLException;

  /**
   * Returns a shim to wrap MiniMrCluster
   */
  public MiniMrShim getMiniMrCluster(Configuration conf, int numberOfTaskTrackers,
      String nameNode, int numDir) throws IOException;

  /**
   * Shim for MiniMrCluster
   */
  public interface MiniMrShim {
    public int getJobTrackerPort() throws UnsupportedOperationException;
    public void shutdown() throws IOException;
    public void setupConfiguration(Configuration conf);
  }

  /**
   * Returns a shim to wrap MiniDFSCluster. This is necessary since this class
   * was moved from org.apache.hadoop.dfs to org.apache.hadoop.hdfs
   */
  MiniDFSShim getMiniDfs(Configuration conf,
      int numDataNodes,
      boolean format,
      String[] racks) throws IOException;

  /**
   * Shim around the functions in MiniDFSCluster that Hive uses.
   */
  public interface MiniDFSShim {
    FileSystem getFileSystem() throws IOException;

    void shutdown() throws IOException;
  }
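
  /*
   * Illustrative test sketch (assumes a test-local Configuration named conf; the
   * ShimLoader lookup is the usual way to get a HadoopShims instance):
   *
   *   HadoopShims shims = ShimLoader.getHadoopShims();
   *   HadoopShims.MiniDFSShim dfs = shims.getMiniDfs(conf, 4, true, null);
   *   try {
   *     FileSystem fs = dfs.getFileSystem();
   *     // ... exercise the code under test against fs ...
   *   } finally {
   *     dfs.shutdown();
   *   }
   */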

  CombineFileInputFormatShim getCombineFileInputFormat();

  String getInputFormatClassName();

  int createHadoopArchive(Configuration conf, Path parentDir, Path destDir,
      String archiveName) throws Exception;

  public URI getHarUri(URI original, URI base, URI originalBase)
      throws URISyntaxException;
  /**
   * Hive uses side-effect files exclusively for its output. It also manages
   * the setup/cleanup/commit of output from the Hive client. As a result it does
   * not need the MR framework to provide the same support.
   *
   * This routine sets the appropriate options to bypass setup/cleanup/commit
   * support in the MR framework, but does not set the OutputFormat class.
   */
  void prepareJobOutput(JobConf conf);

  /**
   * Used by TaskLogProcessor to remove HTML quoting from a string.
   * @param item the string to unquote
   * @return the unquoted string
   */
  public String unquoteHtmlChars(String item);

  public void closeAllForUGI(UserGroupInformation ugi);

  /**
   * Get the UGI that the given job configuration will run as.
   *
   * In secure versions of Hadoop, this simply returns the current
   * access control context's user, ignoring the configuration.
   */
  public UserGroupInformation getUGIForConf(Configuration conf) throws LoginException, IOException;

  /**
   * Used by the metastore server to perform a requested RPC in the client context.
   * @param <T> the return type of the privileged action
   * @param ugi the user to run the action as
   * @param pvea the privileged action to run
   * @throws IOException
   * @throws InterruptedException
   */
  public <T> T doAs(UserGroupInformation ugi, PrivilegedExceptionAction<T> pvea) throws
      IOException, InterruptedException;
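
  /*
   * Illustrative sketch (shims, clientUgi, conf and path are hypothetical): running a
   * filesystem call in the client's security context, as the metastore does when
   * impersonating the requesting user.
   *
   *   FileStatus stat = shims.doAs(clientUgi, new PrivilegedExceptionAction<FileStatus>() {
   *     @Override
   *     public FileStatus run() throws Exception {
   *       return path.getFileSystem(conf).getFileStatus(path);
   *     }
   *   });
   */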

  /**
   * Once a delegation token is stored in a file, its location is passed to a
   * child process that runs hadoop operations through an environment variable.
   * @return the name of the environment variable used by hadoop to find the
   *  location of the token file
   */
  public String getTokenFileLocEnvName();


  /**
   * Get a delegation token from the filesystem and write it, along with the
   * metastore tokens, into a file.
   * @param conf the configuration to use
   * @return path of the file containing the token credentials
   * @throws IOException
   */
  public Path createDelegationTokenFile(final Configuration conf) throws IOException;


  /**
   * Creates a UGI object for a remote user.
   * @param userName remote user name
   * @param groupNames group names associated with the remote user name
   * @return UGI created for the remote user
   */
  public UserGroupInformation createRemoteUser(String userName, List<String> groupNames);

  /**
   * Get the short name corresponding to the subject in the passed UGI.
   *
   * In secure versions of Hadoop, this returns the short name (after
   * undergoing the translation in the kerberos name rule mapping).
   * In non-secure versions of Hadoop, this returns the name of the subject.
   */
  public String getShortUserName(UserGroupInformation ugi);

  /**
   * Return true if the Shim is based on Hadoop Security APIs.
   */
  public boolean isSecureShimImpl();

  /**
   * Return true if the hadoop configuration has security enabled.
   * @return true if security is enabled
   */
  public boolean isSecurityEnabled();

  /**
   * Get the string form of the token given a token signature.
   * The signature is used as the value of the "service" field in the token for lookup;
   * see AbstractDelegationTokenSelector in Hadoop. If such a token exists
   * in the token cache (credential store) of the job, the lookup returns it.
   * This is relevant only when running against a "secure" hadoop release.
   * The method gets hold of the tokens if they are set up by hadoop - this should
   * happen on the map/reduce tasks if the client added the tokens into hadoop's
   * credential store in the front end during job submission. The method then
   * selects the hive delegation token among the set of tokens and returns its
   * string form.
   * @param tokenSignature the signature to use as the token's "service" field
   * @return the string form of the token found
   * @throws IOException
   */
  public String getTokenStrForm(String tokenSignature) throws IOException;

  /**
   * Add a delegation token to the given ugi.
   * @param ugi the user to add the token to
   * @param tokenStr the string form of the token
   * @param tokenService the service name of the token
   * @throws IOException
   */
  public void setTokenStr(UserGroupInformation ugi, String tokenStr, String tokenService)
      throws IOException;
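
  /*
   * Illustrative sketch (tokenSignature, ugi and tokenService are placeholders):
   * fetch the string form of the delegation token from the credential store and
   * attach it to a UGI.
   *
   *   String tokenStr = shims.getTokenStrForm(tokenSignature);
   *   if (tokenStr != null) {
   *     shims.setTokenStr(ugi, tokenStr, tokenService);
   *   }
   */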

  /**
   * Add the given service to the string-form token.
   * @param tokenStr the string form of the token
   * @param tokenService the service name to set
   * @return the string form of the token with the service set
   * @throws IOException
   */
  public String addServiceToToken(String tokenStr, String tokenService)
      throws IOException;

  enum JobTrackerState { INITIALIZING, RUNNING };

  /**
   * Convert the ClusterStatus to its Thrift equivalent: JobTrackerState.
   * See MAPREDUCE-2455 for why this is a part of the shim.
   * @param clusterStatus
   * @return the matching JobTrackerState
   * @throws Exception if no equivalent JobTrackerState exists
   */
  public JobTrackerState getJobTrackerState(ClusterStatus clusterStatus) throws Exception;

  public TaskAttemptContext newTaskAttemptContext(Configuration conf, final Progressable progressable);

  public TaskAttemptID newTaskAttemptID(JobID jobId, boolean isMap, int taskId, int id);

  public JobContext newJobContext(Job job);

  /**
   * Check whether MR is configured to run in local mode.
   * @param conf the configuration to check
   * @return true if MR is configured to run locally
   */
  public boolean isLocalMode(Configuration conf);

  /**
   * All retrieval of the jobtracker/resource manager RPC address
   * in the configuration should be done through this shim.
   * @param conf the configuration to read
   * @return the job launcher RPC address
   */
  public String getJobLauncherRpcAddress(Configuration conf);

  /**
   * All updates to the jobtracker/resource manager RPC address
   * in the configuration should be done through this shim.
   * @param conf the configuration to update
   * @param val the new RPC address
   */
  public void setJobLauncherRpcAddress(Configuration conf, String val);

  /**
   * All references to the jobtracker/resource manager HTTP address
   * in the configuration should be done through this shim.
   * @param conf the configuration to read
   * @return the job launcher HTTP address
   */
  public String getJobLauncherHttpAddress(Configuration conf);


  /**
   *  Perform kerberos login using the given principal and keytab
   * @throws IOException
   */
  public void loginUserFromKeytab(String principal, String keytabFile) throws IOException;

  /**
   *  Perform kerberos login using the given principal and keytab,
   *  and return the UGI object
   * @throws IOException
   */
  public UserGroupInformation loginUserFromKeytabAndReturnUGI(String principal,
      String keytabFile) throws IOException;

  /**
   * Perform kerberos re-login using the given principal and keytab, to renew
   * the credentials
   * @throws IOException
   */
  public void reLoginUserFromKeytab() throws IOException;

  /**
   * Check if the current UGI is keytab based.
   * @return true if the current login is keytab based
   * @throws IOException
   */
  public boolean isLoginKeytabBased() throws IOException;
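
  /*
   * Illustrative sketch (the principal and keytab path are placeholders): log in from
   * a keytab at startup and later re-login to refresh the credentials.
   *
   *   shims.loginUserFromKeytab("hive/host@EXAMPLE.COM", "/etc/security/keytabs/hive.keytab");
   *   ...
   *   if (shims.isLoginKeytabBased()) {
   *     shims.reLoginUserFromKeytab();
   *   }
   */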

  /**
   * Move the directory/file to the trash. In the case of symlinks or mount points, the file
   * is moved to the trash bin of the actual volume containing the path being deleted.
   * @param fs the filesystem containing the path
   * @param path the path to move to the trash
   * @param conf the configuration to use
   * @return false if the item is already in the trash or trash is disabled
   * @throws IOException
   */
  public boolean moveToAppropriateTrash(FileSystem fs, Path path, Configuration conf)
      throws IOException;
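
  /*
   * Illustrative sketch (fs, path and conf are hypothetical): fall back to a permanent
   * delete when the move to trash did not happen.
   *
   *   if (!shims.moveToAppropriateTrash(fs, path, conf)) {
   *     fs.delete(path, true);
   *   }
   */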

  /**
   * Get the default block size for the path. The FileSystem alone is not sufficient to
   * determine it, because in the case of CSMT the underlying file system determines it.
   * @param fs the filesystem containing the path
   * @param path the path whose block size is wanted
   * @return the default block size for the path
   */
  public long getDefaultBlockSize(FileSystem fs, Path path);

  /**
   * Get the default replication for a path. In the case of CSMT the given path is used to
   * locate the actual filesystem.
   * @param fs the filesystem containing the path
   * @param path the path whose replication is wanted
   * @return the default replication for the path
   */
  public short getDefaultReplication(FileSystem fs, Path path);

  /**
   * Create the proxy UGI for the given userid.
   * @param userName the user to create a proxy UGI for
   * @return the proxy UGI
   */
  public UserGroupInformation createProxyUser(String userName) throws IOException;

  /**
   * Verify proxy access to the given UGI for the given user.
   * @param proxyUser the name of the user being impersonated
   * @param realUserUgi the UGI of the authenticated (real) user attempting the impersonation
   * @param ipAddress the IP address from which the request originates
   * @param conf the configuration holding the proxy user settings
   */
  public void authorizeProxyAccess(String proxyUser, UserGroupInformation realUserUgi,
      String ipAddress, Configuration conf) throws IOException;

  /**
   * The method used to set the total order partition file has a different signature
   * between hadoop versions, so it is routed through this shim.
   * @param jobConf the job configuration
   * @param partition the path of the partition file
   */
  void setTotalOrderPartitionFile(JobConf jobConf, Path partition);

  Comparator<LongWritable> getLongComparator();

  /**
   * InputSplitShim.
   *
   */
  public interface InputSplitShim extends InputSplit {
    JobConf getJob();

    @Override
    long getLength();

    /** Returns an array containing the start offsets of the files in the split. */
    long[] getStartOffsets();

    /** Returns an array containing the lengths of the files in the split. */
    long[] getLengths();

    /** Returns the start offset of the ith Path. */
    long getOffset(int i);

    /** Returns the length of the ith Path. */
    long getLength(int i);

    /** Returns the number of Paths in the split. */
    int getNumPaths();

    /** Returns the ith Path. */
    Path getPath(int i);

    /** Returns all the Paths in the split. */
    Path[] getPaths();

    /** Returns all the Paths where this input-split resides. */
    @Override
    String[] getLocations() throws IOException;

    void shrinkSplit(long length);

    @Override
    String toString();

    @Override
    void readFields(DataInput in) throws IOException;

    @Override
    void write(DataOutput out) throws IOException;
  }
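
  /*
   * Illustrative sketch (split is a hypothetical InputSplitShim): walking the files
   * that were packed into a combined split.
   *
   *   for (int i = 0; i < split.getNumPaths(); i++) {
   *     Path p = split.getPath(i);
   *     long offset = split.getOffset(i);
   *     long length = split.getLength(i);
   *     // ... open p, seek to offset, read length bytes ...
   *   }
   */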

  /**
   * CombineFileInputFormatShim.
   *
   * @param <K> the key type
   * @param <V> the value type
   */
  interface CombineFileInputFormatShim<K, V> {
    Path[] getInputPathsShim(JobConf conf);

    void createPool(JobConf conf, PathFilter... filters);

    InputSplitShim[] getSplits(JobConf job, int numSplits) throws IOException;

    InputSplitShim getInputSplitShim() throws IOException;

    RecordReader<K, V> getRecordReader(JobConf job, InputSplitShim split, Reporter reporter,
        Class<RecordReader<K, V>> rrClass) throws IOException;
  }

  /**
   * Get the block locations for the given directory.
   * @param fs the file system
   * @param path the directory to get the status and block locations for
   * @param filter a filter that needs to accept the file (or null)
   * @return a list of the located file status objects
   * @throws IOException
   */
  List<FileStatus> listLocatedStatus(FileSystem fs, Path path,
                                     PathFilter filter) throws IOException;

  /**
   * For a file status returned by listLocatedStatus, return its block locations.
   * @param fs the file system
   * @param status the file information
   * @return the block locations of the file
   * @throws IOException
   */
  BlockLocation[] getLocations(FileSystem fs,
      FileStatus status) throws IOException;

  /**
   * Flush and make visible to other users the changes to the given stream.
   * @param stream the stream to hflush.
   * @throws IOException
   */
  public void hflush(FSDataOutputStream stream) throws IOException;
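
  /*
   * Illustrative sketch (out is a hypothetical FSDataOutputStream and record a byte array):
   * make buffered writes visible to concurrent readers without closing the stream.
   *
   *   out.write(record);
   *   shims.hflush(out);
   */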

  public HCatHadoopShims getHCatShim();
  public interface HCatHadoopShims {

    enum PropertyName {CACHE_ARCHIVES, CACHE_FILES, CACHE_SYMLINK}

    public TaskID createTaskID();

    public TaskAttemptID createTaskAttemptID();

    public org.apache.hadoop.mapreduce.TaskAttemptContext createTaskAttemptContext(Configuration conf,
        TaskAttemptID taskId);

    public org.apache.hadoop.mapred.TaskAttemptContext createTaskAttemptContext(JobConf conf,
        org.apache.hadoop.mapred.TaskAttemptID taskId, Progressable progressable);

    public JobContext createJobContext(Configuration conf, JobID jobId);

    public org.apache.hadoop.mapred.JobContext createJobContext(JobConf conf, JobID jobId, Progressable progressable);

    public void commitJob(OutputFormat outputFormat, Job job) throws IOException;

    public void abortJob(OutputFormat outputFormat, Job job) throws IOException;

    /* Referring to job tracker in 0.20 and resource manager in 0.23 */
    public InetSocketAddress getResourceManagerAddress(Configuration conf);

    public String getPropertyName(PropertyName name);

    /**
     * Checks if a file is in the HDFS filesystem.
     *
     * @param fs the filesystem the path belongs to
     * @param path the path to check
     * @return true if the file is in HDFS, false if the file is in another file system
     */
    public boolean isFileInHDFS(FileSystem fs, Path path) throws IOException;
  }
  /**
   * Provides a Hadoop JobTracker shim.
   * @param conf not {@code null}
   * @param ugi not {@code null}
   */
  public WebHCatJTShim getWebHCatShim(Configuration conf, UserGroupInformation ugi) throws IOException;
  public interface WebHCatJTShim {
    /**
     * Grab a handle to a job that is already known to the JobTracker.
     *
     * @return Profile of the job, or null if not found.
     */
    public JobProfile getJobProfile(org.apache.hadoop.mapred.JobID jobid) throws IOException;
    /**
     * Grab a handle to a job that is already known to the JobTracker.
     *
     * @return Status of the job, or null if not found.
     */
    public JobStatus getJobStatus(org.apache.hadoop.mapred.JobID jobid) throws IOException;
    /**
     * Kill a job.
     */
    public void killJob(org.apache.hadoop.mapred.JobID jobid) throws IOException;
    /**
     * Get all the jobs submitted.
     */
    public JobStatus[] getAllJobs() throws IOException;
    /**
     * Close the connection to the Job Tracker.
     */
    public void close();
    /**
     * Does exactly what org.apache.hadoop.mapreduce.Job#addCacheFile(URI) does in Hadoop 2.
     * Assumes that both parameters are not {@code null}.
     */
    public void addCacheFile(URI uri, Job job);
  }
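
  /*
   * Illustrative sketch (conf, ugi and jobid are hypothetical): look up a job and
   * kill it if it is still known to the job tracker / resource manager.
   *
   *   WebHCatJTShim tracker = shims.getWebHCatShim(conf, ugi);
   *   try {
   *     if (tracker.getJobStatus(jobid) != null) {
   *       tracker.killJob(jobid);
   *     }
   *   } finally {
   *     tracker.close();
   *   }
   */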

  /**
   * Create a proxy file system that can serve a given scheme/authority using some
   * other file system.
   */
  public FileSystem createProxyFileSystem(FileSystem fs, URI uri);

  public Map<String, String> getHadoopConfNames();

  /**
   * A hadoop.io ByteBufferPool shim.
   */
  public interface ByteBufferPoolShim {
    /**
     * Get a new ByteBuffer from the pool.  The pool can provide this by
     * removing a buffer from its internal cache, or by allocating a
     * new buffer.
     *
     * @param direct     Whether the buffer should be direct.
     * @param length     The minimum length the buffer will have.
     * @return           A new ByteBuffer. Its capacity can be less
     *                   than what was requested, but must be at
     *                   least 1 byte.
     */
    ByteBuffer getBuffer(boolean direct, int length);

    /**
     * Release a buffer back to the pool.
     * The pool may choose to put this buffer into its cache or to free it.
     *
     * @param buffer    a direct bytebuffer
     */
    void putBuffer(ByteBuffer buffer);
  }
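
  /*
   * Minimal illustrative implementation sketch of the pool contract (not the shim's
   * actual implementation): allocate on demand and do no caching at all.
   *
   *   class SimplePool implements ByteBufferPoolShim {
   *     public ByteBuffer getBuffer(boolean direct, int length) {
   *       return direct ? ByteBuffer.allocateDirect(length) : ByteBuffer.allocate(length);
   *     }
   *     public void putBuffer(ByteBuffer buffer) {
   *       // no cache; the buffer simply becomes garbage collectible
   *     }
   *   }
   */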

  /**
   * Provides an HDFS ZeroCopyReader shim.
   * @param in FSDataInputStream to read from (where the cached/mmap buffers are tied to)
   * @param pool ByteBufferPoolShim to allocate fallback buffers with
   *
   * @return a ZeroCopyReaderShim, or null if zero-copy reads are not supported
   */
  public ZeroCopyReaderShim getZeroCopyReader(FSDataInputStream in, ByteBufferPoolShim pool) throws IOException;

  public interface ZeroCopyReaderShim {
    /**
     * Get a ByteBuffer from the FSDataInputStream - this can be either a HeapByteBuffer or a MappedByteBuffer.
     * Also advances the input stream by the amount read. The data read can be smaller than maxLength.
     *
     * @return the ByteBuffer read from the stream
     */
    public ByteBuffer readBuffer(int maxLength, boolean verifyChecksums) throws IOException;
    /**
     * Release a ByteBuffer obtained from a read on the FSDataInputStream.
     */
    public void releaseBuffer(ByteBuffer buffer);
  }
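
  /*
   * Illustrative read loop (in, pool and maxLength are hypothetical; the shim may
   * return null on platforms without zero-copy support):
   *
   *   ZeroCopyReaderShim zcr = shims.getZeroCopyReader(in, pool);
   *   if (zcr != null) {
   *     ByteBuffer buf = zcr.readBuffer(maxLength, false);
   *     try {
   *       // ... consume buf ...
   *     } finally {
   *       zcr.releaseBuffer(buf);
   *     }
   *   }
   */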

  public enum DirectCompressionType {
    NONE,
    ZLIB_NOHEADER,
    ZLIB,
    SNAPPY,
  };

  public interface DirectDecompressorShim {
    public void decompress(ByteBuffer src, ByteBuffer dst) throws IOException;
  }

  public DirectDecompressorShim getDirectDecompressor(DirectCompressionType codec);
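
  /*
   * Illustrative sketch (src and dst are hypothetical ByteBuffers): the shim may return
   * null when no direct decompressor is available for the requested codec.
   *
   *   DirectDecompressorShim d = shims.getDirectDecompressor(DirectCompressionType.SNAPPY);
   *   if (d != null) {
   *     d.decompress(src, dst);
   *   }
   */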

  /**
   * Get configuration from JobContext
   */
  public Configuration getConfiguration(JobContext context);

  public FileSystem getNonCachedFileSystem(URI uri, Configuration conf) throws IOException;

}



