All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.mapred.FileInputFormat Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.shaded.org.apache.hadoop.mapred;

import java.org.apache.hadoop.shaded.io.IOException;
import java.org.apache.hadoop.shaded.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.Path;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.shaded.org.apache.hadoop.org.apache.hadoop.shaded.net.NetworkTopology;
import org.apache.hadoop.shaded.org.apache.hadoop.org.apache.hadoop.shaded.net.Node;
import org.apache.hadoop.shaded.org.apache.hadoop.org.apache.hadoop.shaded.net.NodeBase;
import org.apache.hadoop.shaded.org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.shaded.org.apache.hadoop.util.StopWatch;
import org.apache.hadoop.shaded.org.apache.hadoop.util.StringUtils;

import org.apache.hadoop.shaded.org.apache.hadoop.thirdparty.org.apache.hadoop.shaded.com.google.org.apache.hadoop.shaded.com.on.collect.Iterables;
import org.apache.hadoop.shaded.org.slf4j.Logger;
import org.apache.hadoop.shaded.org.slf4j.LoggerFactory;

/** 
 * A base class for file-based {@link InputFormat}.
 * 
 * 

FileInputFormat is the base class for all file-based * InputFormats. This provides a generic implementation of * {@link #getSplits(JobConf, int)}. * * Implementations of FileInputFormat can also override the * {@link #isSplitable(FileSystem, Path)} method to prevent input files * from being split-up in certain situations. Implementations that may * deal with non-splittable files must override this method, since * the default implementation assumes splitting is always possible. */ @InterfaceAudience.Public @InterfaceStability.Stable public abstract class FileInputFormat implements InputFormat { public static final Logger LOG = LoggerFactory.getLogger(FileInputFormat.class); @Deprecated public enum Counter { BYTES_READ } public static final String NUM_INPUT_FILES = org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input.FileInputFormat.NUM_INPUT_FILES; public static final String INPUT_DIR_RECURSIVE = org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE; public static final String INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS = org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS; private static final double SPLIT_SLOP = 1.1; // 10% slop private long minSplitSize = 1; private static final PathFilter hiddenFileFilter = new PathFilter(){ public boolean accept(Path p){ String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); } }; protected void setMinSplitSize(long minSplitSize) { this.minSplitSize = minSplitSize; } /** * Proxy PathFilter that accepts a path only if all filters given in the * constructor do. Used by the listPaths() to apply the built-in * hiddenFileFilter together with a user provided one (if any). 
*/ private static class MultiPathFilter implements PathFilter { private List filters; public MultiPathFilter(List filters) { this.filters = filters; } public boolean accept(Path path) { for (PathFilter filter : filters) { if (!filter.accept(path)) { return false; } } return true; } } /** * Is the given filename splittable? Usually, true, but if the file is * stream org.apache.hadoop.shaded.com.ressed, it will not be. * * The default implementation in FileInputFormat always returns * true. Implementations that may deal with non-splittable files must * override this method. * * FileInputFormat implementations can override this and return * false to ensure that individual input files are never split-up * so that {@link Mapper}s process entire files. * * @param fs the file system that the file is on * @param filename the file name to check * @return is this file splitable? */ protected boolean isSplitable(FileSystem fs, Path filename) { return true; } public abstract RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException; /** * Set a PathFilter to be applied to the input paths for the map-reduce job. * * @param filter the PathFilter class use for filtering the input paths. */ public static void setInputPathFilter(JobConf conf, Class filter) { conf.setClass(org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input. FileInputFormat.PATHFILTER_CLASS, filter, PathFilter.class); } /** * Get a PathFilter instance of the filter set for the input paths. * * @return the PathFilter instance set for the job, NULL if none has been set. */ public static PathFilter getInputPathFilter(JobConf conf) { Class filterClass = conf.getClass( org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input.FileInputFormat.PATHFILTER_CLASS, null, PathFilter.class); return (filterClass != null) ? ReflectionUtils.newInstance(filterClass, conf) : null; } /** * Add files in the input path recursively into the results. 
* @param result * The List to store all files. * @param fs * The FileSystem. * @param path * The input path. * @param inputFilter * The input filter that can be used to filter files/dirs. * @throws IOException */ protected void addInputPathRecursively(List result, FileSystem fs, Path path, PathFilter inputFilter) throws IOException { RemoteIterator iter = fs.listLocatedStatus(path); while (iter.hasNext()) { LocatedFileStatus stat = iter.next(); if (inputFilter.accept(stat.getPath())) { if (stat.isDirectory()) { addInputPathRecursively(result, fs, stat.getPath(), inputFilter); } else { result.add(stat); } } } } /** * List input directories. * Subclasses may override to, e.g., select only files matching a regular * expression. * * If security is enabled, this method collects * delegation tokens from the input paths and adds them to the job's * credentials. * @param job the job to list input paths for and attach tokens to. * @return array of FileStatus objects * @throws IOException if zero items. */ protected FileStatus[] listStatus(JobConf job) throws IOException { Path[] dirs = getInputPaths(job); if (dirs.length == 0) { throw new IOException("No input paths specified in job"); } // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job); // Whether we need to recursive look into the directory structure boolean recursive = job.getBoolean(INPUT_DIR_RECURSIVE, false); // creates a MultiPathFilter with the hiddenFileFilter and the // user provided one (if any). 
List filters = new ArrayList(); filters.add(hiddenFileFilter); PathFilter jobFilter = getInputPathFilter(job); if (jobFilter != null) { filters.add(jobFilter); } PathFilter inputFilter = new MultiPathFilter(filters); FileStatus[] result; int numThreads = job .getInt( org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input.FileInputFormat.LIST_STATUS_NUM_THREADS, org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input.FileInputFormat.DEFAULT_LIST_STATUS_NUM_THREADS); StopWatch sw = new StopWatch().start(); if (numThreads == 1) { List locatedFiles = singleThreadedListStatus(job, dirs, inputFilter, recursive); result = locatedFiles.toArray(new FileStatus[locatedFiles.size()]); } else { Iterable locatedFiles = null; try { LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher( job, dirs, recursive, inputFilter, false); locatedFiles = locatedFileStatusFetcher.getFileStatuses(); } catch (InterruptedException e) { throw (IOException) new InterruptedIOException("Interrupted while getting file statuses") .initCause(e); } result = Iterables.toArray(locatedFiles, FileStatus.class); } sw.stop(); if (LOG.isDebugEnabled()) { LOG.debug("Time taken to get FileStatuses: " + sw.now(TimeUnit.MILLISECONDS)); } LOG.info("Total input files to process : " + result.length); return result; } private List singleThreadedListStatus(JobConf job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException { List result = new ArrayList(); List errors = new ArrayList(); for (Path p: dirs) { FileSystem fs = p.getFileSystem(job); FileStatus[] matches = fs.globStatus(p, inputFilter); if (matches == null) { errors.add(new IOException("Input path does not exist: " + p)); } else if (matches.length == 0) { errors.add(new IOException("Input Pattern " + p + " matches 0 files")); } else { for (FileStatus globStat: matches) { if (globStat.isDirectory()) { RemoteIterator iter = fs.listLocatedStatus(globStat.getPath()); while (iter.hasNext()) { 
LocatedFileStatus stat = iter.next(); if (inputFilter.accept(stat.getPath())) { if (recursive && stat.isDirectory()) { addInputPathRecursively(result, fs, stat.getPath(), inputFilter); } else { result.add(stat); } } } } else { result.add(globStat); } } } } if (!errors.isEmpty()) { throw new InvalidInputException(errors); } return result; } /** * A factory that makes the split for this class. It can be overridden * by sub-classes to make sub-types */ protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) { return new FileSplit(file, start, length, hosts); } /** * A factory that makes the split for this class. It can be overridden * by sub-classes to make sub-types */ protected FileSplit makeSplit(Path file, long start, long length, String[] hosts, String[] inMemoryHosts) { return new FileSplit(file, start, length, hosts, inMemoryHosts); } /** Splits files returned by {@link #listStatus(JobConf)} when * they're too big.*/ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { StopWatch sw = new StopWatch().start(); FileStatus[] stats = listStatus(job); // Save the number of input files for metrics/loadgen job.setLong(NUM_INPUT_FILES, stats.length); long totalSize = 0; // org.apache.hadoop.shaded.com.ute total size boolean ignoreDirs = !job.getBoolean(INPUT_DIR_RECURSIVE, false) && job.getBoolean(INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS, false); List files = new ArrayList<>(stats.length); for (FileStatus file: stats) { // check we have valid files if (file.isDirectory()) { if (!ignoreDirs) { throw new IOException("Not a file: "+ file.getPath()); } } else { files.add(file); totalSize += file.getLen(); } } long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits); long minSize = Math.max(job.getLong(org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input. 
FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize); // generate splits ArrayList splits = new ArrayList(numSplits); NetworkTopology clusterMap = new NetworkTopology(); for (FileStatus file: files) { Path path = file.getPath(); long length = file.getLen(); if (length != 0) { FileSystem fs = path.getFileSystem(job); BlockLocation[] blkLocations; if (file instanceof LocatedFileStatus) { blkLocations = ((LocatedFileStatus) file).getBlockLocations(); } else { blkLocations = fs.getFileBlockLocations(file, 0, length); } if (isSplitable(fs, path)) { long blockSize = file.getBlockSize(); long splitSize = org.apache.hadoop.shaded.com.uteSplitSize(goalSize, minSize, blockSize); long bytesRemaining = length; while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) { String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length-bytesRemaining, splitSize, clusterMap); splits.add(makeSplit(path, length-bytesRemaining, splitSize, splitHosts[0], splitHosts[1])); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap); splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1])); } } else { if (LOG.isDebugEnabled()) { // Log only if the file is big enough to be splitted if (length > Math.min(file.getBlockSize(), minSize)) { LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath()); } } String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,0,length,clusterMap); splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1])); } } else { //Create empty hosts array for zero length files splits.add(makeSplit(path, 0, length, new String[0])); } } sw.stop(); if (LOG.isDebugEnabled()) { LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS)); } return splits.toArray(new FileSplit[splits.size()]); } protected 
long org.apache.hadoop.shaded.com.uteSplitSize(long goalSize, long minSize, long blockSize) { return Math.max(minSize, Math.min(goalSize, blockSize)); } protected int getBlockIndex(BlockLocation[] blkLocations, long offset) { for (int i = 0 ; i < blkLocations.length; i++) { // is the offset inside this block? if ((blkLocations[i].getOffset() <= offset) && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())){ return i; } } BlockLocation last = blkLocations[blkLocations.length -1]; long fileLength = last.getOffset() + last.getLength() -1; throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")"); } /** * Sets the given org.apache.hadoop.shaded.com.a separated paths as the list of inputs * for the map-reduce job. * * @param conf Configuration of the job * @param org.apache.hadoop.shaded.com.aSeparatedPaths Comma separated paths to be set as * the list of inputs for the map-reduce job. */ public static void setInputPaths(JobConf conf, String org.apache.hadoop.shaded.com.aSeparatedPaths) { setInputPaths(conf, StringUtils.stringToPath( getPathStrings(org.apache.hadoop.shaded.com.aSeparatedPaths))); } /** * Add the given org.apache.hadoop.shaded.com.a separated paths to the list of inputs for * the map-reduce job. * * @param conf The configuration of the job * @param org.apache.hadoop.shaded.com.aSeparatedPaths Comma separated paths to be added to * the list of inputs for the map-reduce job. */ public static void addInputPaths(JobConf conf, String org.apache.hadoop.shaded.com.aSeparatedPaths) { for (String str : getPathStrings(org.apache.hadoop.shaded.com.aSeparatedPaths)) { addInputPath(conf, new Path(str)); } } /** * Set the array of {@link Path}s as the list of inputs * for the map-reduce job. * * @param conf Configuration of the job. * @param inputPaths the {@link Path}s of the input directories/files * for the map-reduce job. */ public static void setInputPaths(JobConf conf, Path... 
inputPaths) { Path path = new Path(conf.getWorkingDirectory(), inputPaths[0]); StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString())); for(int i = 1; i < inputPaths.length;i++) { str.append(StringUtils.COMMA_STR); path = new Path(conf.getWorkingDirectory(), inputPaths[i]); str.append(StringUtils.escapeString(path.toString())); } conf.set(org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input. FileInputFormat.INPUT_DIR, str.toString()); } /** * Add a {@link Path} to the list of inputs for the map-reduce job. * * @param conf The configuration of the job * @param path {@link Path} to be added to the list of inputs for * the map-reduce job. */ public static void addInputPath(JobConf conf, Path path ) { path = new Path(conf.getWorkingDirectory(), path); String dirStr = StringUtils.escapeString(path.toString()); String dirs = conf.get(org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input. FileInputFormat.INPUT_DIR); conf.set(org.apache.hadoop.shaded.org.apache.hadoop.mapreduce.lib.input. FileInputFormat.INPUT_DIR, dirs == null ? dirStr : dirs + StringUtils.COMMA_STR + dirStr); } // This method escapes org.apache.hadoop.shaded.com.as in the glob pattern of the given paths. private static String[] getPathStrings(String org.apache.hadoop.shaded.com.aSeparatedPaths) { int length = org.apache.hadoop.shaded.com.aSeparatedPaths.length(); int curlyOpen = 0; int pathStart = 0; boolean globPattern = false; List pathStrings = new ArrayList(); for (int i=0; i mylist) { Collections.sort(mylist, new Comparator () { public int org.apache.hadoop.shaded.com.are(NodeInfo obj1, NodeInfo obj2) { if (obj1 == null || obj2 == null) return -1; if (obj1.getValue() == obj2.getValue()) { return 0; } else { return ((obj1.getValue() < obj2.getValue()) ? 1 : -1); } } } ); } /** * This function identifies and returns the hosts that contribute * most for a given split. 
For calculating the contribution, rack * locality is treated on par with host locality, so hosts from racks * that contribute the most are preferred over hosts on racks that * contribute less * @param blkLocations The list of block locations * @param offset * @param splitSize * @return an array of hosts that contribute most to this split * @throws IOException */ protected String[] getSplitHosts(BlockLocation[] blkLocations, long offset, long splitSize, NetworkTopology clusterMap) throws IOException { return getSplitHostsAndCachedHosts(blkLocations, offset, splitSize, clusterMap)[0]; } /** * This function identifies and returns the hosts that contribute * most for a given split. For calculating the contribution, rack * locality is treated on par with host locality, so hosts from racks * that contribute the most are preferred over hosts on racks that * contribute less * @param blkLocations The list of block locations * @param offset * @param splitSize * @return two arrays - one of hosts that contribute most to this split, and * one of hosts that contribute most to this split that have the data * cached on them * @throws IOException */ private String[][] getSplitHostsAndCachedHosts(BlockLocation[] blkLocations, long offset, long splitSize, NetworkTopology clusterMap) throws IOException { int startIndex = getBlockIndex(blkLocations, offset); long bytesInThisBlock = blkLocations[startIndex].getOffset() + blkLocations[startIndex].getLength() - offset; //If this is the only block, just return if (bytesInThisBlock >= splitSize) { return new String[][] { blkLocations[startIndex].getHosts(), blkLocations[startIndex].getCachedHosts() }; } long bytesInFirstBlock = bytesInThisBlock; int index = startIndex + 1; splitSize -= bytesInThisBlock; while (splitSize > 0) { bytesInThisBlock = Math.min(splitSize, blkLocations[index++].getLength()); splitSize -= bytesInThisBlock; } long bytesInLastBlock = bytesInThisBlock; int endIndex = index - 1; Map hostsMap = new IdentityHashMap(); Map 
racksMap = new IdentityHashMap(); String [] allTopos = new String[0]; // Build the hierarchy and aggregate the contribution of // bytes at each level. See TestGetSplitHosts.java for (index = startIndex; index <= endIndex; index++) { // Establish the bytes in this block if (index == startIndex) { bytesInThisBlock = bytesInFirstBlock; } else if (index == endIndex) { bytesInThisBlock = bytesInLastBlock; } else { bytesInThisBlock = blkLocations[index].getLength(); } allTopos = blkLocations[index].getTopologyPaths(); // If no topology information is available, just // prefix a fakeRack if (allTopos.length == 0) { allTopos = fakeRacks(blkLocations, index); } // NOTE: This code currently works only for one level of // hierarchy (rack/host). However, it is relatively easy // to extend this to support aggregation at different // levels for (String topo: allTopos) { Node node, parentNode; NodeInfo nodeInfo, parentNodeInfo; node = clusterMap.getNode(topo); if (node == null) { node = new NodeBase(topo); clusterMap.add(node); } nodeInfo = hostsMap.get(node); if (nodeInfo == null) { nodeInfo = new NodeInfo(node); hostsMap.put(node,nodeInfo); parentNode = node.getParent(); parentNodeInfo = racksMap.get(parentNode); if (parentNodeInfo == null) { parentNodeInfo = new NodeInfo(parentNode); racksMap.put(parentNode,parentNodeInfo); } parentNodeInfo.addLeaf(nodeInfo); } else { nodeInfo = hostsMap.get(node); parentNode = node.getParent(); parentNodeInfo = racksMap.get(parentNode); } nodeInfo.addValue(index, bytesInThisBlock); parentNodeInfo.addValue(index, bytesInThisBlock); } // for all topos } // for all indices // We don't yet support cached hosts when bytesInThisBlock > splitSize return new String[][] { identifyHosts(allTopos.length, racksMap), new String[0]}; } private String[] identifyHosts(int replicationFactor, Map racksMap) { String [] retVal = new String[replicationFactor]; List rackList = new LinkedList(); rackList.addAll(racksMap.values()); // Sort the racks based on their 
contribution to this split sortInDescendingOrder(rackList); boolean done = false; int index = 0; // Get the host list for all our aggregated items, sort // them and return the top entries for (NodeInfo ni: rackList) { Set hostSet = ni.getLeaves(); ListhostList = new LinkedList(); hostList.addAll(hostSet); // Sort the hosts in this rack based on their contribution sortInDescendingOrder(hostList); for (NodeInfo host: hostList) { // Strip out the port number from the host name retVal[index++] = host.node.getName().split(":")[0]; if (index == replicationFactor) { done = true; break; } } if (done == true) { break; } } return retVal; } private String[] fakeRacks(BlockLocation[] blkLocations, int index) throws IOException { String[] allHosts = blkLocations[index].getHosts(); String[] allTopos = new String[allHosts.length]; for (int i = 0; i < allHosts.length; i++) { allTopos[i] = NetworkTopology.DEFAULT_RACK + "/" + allHosts[i]; } return allTopos; } private static class NodeInfo { final Node node; final Set blockIds; final Set leaves; private long value; NodeInfo(Node node) { this.node = node; blockIds = new HashSet(); leaves = new HashSet(); } long getValue() {return value;} void addValue(int blockIndex, long value) { if (blockIds.add(blockIndex) == true) { this.value += value; } } Set getLeaves() { return leaves;} void addLeaf(NodeInfo nodeInfo) { leaves.add(nodeInfo); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy