/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_BYTES;
import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.IFile.Writer;
import org.apache.hadoop.mapred.Merger.Segment;
import org.apache.hadoop.mapred.Task.TaskReporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ResourceCalculatorPlugin.ProcResourceValues;
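/**
 * Map output collector restricted to BytesWritable keys and values. Records
 * are buffered in a single "io.sort.mb"-sized byte array and routed to one
 * ReducePartition per reducer; each spill writes every partition's sorted data
 * to a local spill file and records per-partition offsets in a SpillRecord,
 * which is cached in memory up to INDEX_CACHE_MEMORY_LIMIT and written to a
 * spill index file beyond that. flush() merges all spills, optionally keeping
 * the last one in memory, into the final "file.out" and "file.out.index" pair.
 */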
public class BlockMapOutputBuffer<K extends BytesWritable, V extends BytesWritable>
implements BlockMapOutputCollector<K, V> {
private static final Log LOG = LogFactory.getLog(BlockMapOutputBuffer.class.getName());
private final Partitioner<K, V> partitioner;
private final int partitions;
private final JobConf job;
private final TaskReporter reporter;
private final Class<K> keyClass;
private final Class<V> valClass;
private final int softBufferLimit;
// Compression for map-outputs
private CompressionCodec codec = null;
// main output buffer
private byte[] kvbuffer;
private int kvBufferSize;
// spill accounting
private volatile int numSpills = 0;
// number of spills for big records
private volatile int numBigRecordsSpills = 0;
private volatile int numBigRecordsWarnThreshold = 500;
private final FileSystem localFs;
private final FileSystem rfs;
private final Counters.Counter mapOutputByteCounter;
private final Counters.Counter mapOutputRecordCounter;
private MapSpillSortCounters mapSpillSortCounter;
private MapTask task;
private ReducePartition<K, V>[] reducePartitions;
private ArrayList<SpillRecord> indexCacheList;
// an array of memory segments, one for each reduce partition.
private Segment<K, V>[] inMemorySegments;
private boolean hasInMemorySpill;
private boolean lastSpillInMem;
private int totalIndexCacheMemory;
private static final int INDEX_CACHE_MEMORY_LIMIT = 2 * 1024 * 1024;
private final MemoryBlockAllocator memoryBlockAllocator;
@SuppressWarnings( { "unchecked", "deprecation" })
public BlockMapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job,
TaskReporter reporter, MapTask task) throws IOException,
ClassNotFoundException {
this.task = task;
this.job = job;
this.reporter = reporter;
localFs = FileSystem.getLocal(job);
partitions = job.getNumReduceTasks();
indexCacheList = new ArrayList<SpillRecord>();
if (partitions > 0) {
partitioner = (Partitioner<K, V>) ReflectionUtils.newInstance(job
.getPartitionerClass(), job);
} else {
partitioner = new Partitioner() {
@Override
public int getPartition(Object key, Object value, int numPartitions) {
return -1;
}
@Override
public void configure(JobConf job) {
}
};
}
rfs = ((LocalFileSystem) localFs).getRaw();
float spillper = job.getFloat("io.sort.spill.percent", (float) 0.9);
if (spillper > (float) 1.0 || spillper < (float) 0.0) {
LOG.error("Invalid \"io.sort.spill.percent\": " + spillper);
spillper = 0.8f;
}
lastSpillInMem = job.getBoolean("mapred.map.lastspill.memory", true);
numBigRecordsWarnThreshold =
job.getInt("mapred.map.bigrecord.spill.warn.threshold", 500);
int sortmb = job.getInt("io.sort.mb", 100);
boolean localMode = job.get("mapred.job.tracker", "local").equals("local");
if (localMode) {
sortmb = job.getInt("io.sort.mb.localmode", 100);
}
if ((sortmb & 0x7FF) != sortmb) {
throw new IOException("Invalid \"io.sort.mb\": " + sortmb);
}
LOG.info("io.sort.mb = " + sortmb);
// buffers and accounting
kvBufferSize = sortmb << 20;
kvbuffer = new byte[kvBufferSize];
softBufferLimit = (int) (kvbuffer.length * spillper);
// k/v serialization
keyClass = (Class<K>) job.getMapOutputKeyClass();
valClass = (Class<V>) job.getMapOutputValueClass();
if (!BytesWritable.class.isAssignableFrom(keyClass)
|| !BytesWritable.class.isAssignableFrom(valClass)) {
throw new IOException(this.getClass().getName()
+ " only supports " + BytesWritable.class.getName()
+ " as key and value classes, MapOutputKeyClass is "
+ keyClass.getName() + ", MapOutputValueClass is "
+ valClass.getName());
}
int numMappers = job.getNumMapTasks();
memoryBlockAllocator =
new MemoryBlockAllocator(kvBufferSize, softBufferLimit, numMappers,
partitions, this);
// counters
mapOutputByteCounter = reporter.getCounter(MAP_OUTPUT_BYTES);
mapOutputRecordCounter = reporter.getCounter(MAP_OUTPUT_RECORDS);
mapSpillSortCounter = new MapSpillSortCounters(reporter);
reducePartitions = new ReducePartition[partitions];
inMemorySegments = new Segment[partitions];
for (int i = 0; i < partitions; i++) {
reducePartitions[i] = new ReducePartition<K, V>(i, this.memoryBlockAllocator,
this.kvbuffer, this, this.reporter);
}
// compression
if (job.getCompressMapOutput()) {
Class<? extends CompressionCodec> codecClass = job
.getMapOutputCompressorClass(DefaultCodec.class);
codec = ReflectionUtils.newInstance(codecClass, job);
}
}
private TaskAttemptID getTaskID() {
return task.getTaskID();
}
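/**
 * Buffers one record into the ReducePartition chosen by the caller and
 * updates the map-output record and byte counters. Key and value must be of
 * exactly the configured map output key and value classes.
 */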
public void collect(K key, V value, int partition) throws IOException {
reporter.progress();
if (key.getClass() != keyClass) {
throw new IOException("Type mismatch in key from map: expected "
+ keyClass.getName() + ", recieved " + key.getClass().getName());
}
if (value.getClass() != valClass) {
throw new IOException("Type mismatch in value from map: expected "
+ valClass.getName() + ", recieved " + value.getClass().getName());
}
int collected = reducePartitions[partition].collect(key, value);
mapOutputRecordCounter.increment(1);
mapOutputByteCounter.increment(collected);
}
@SuppressWarnings("deprecation")
@Override
public void collect(K key, V value) throws IOException {
collect(key, value, partitioner.getPartition(key, value,
partitions));
}
/*
* Sorts (or groups) every reduce partition and updates the per-sort counters;
* returns the ProcResourceValues sampled after sorting for later use.
*/
protected ProcResourceValues sortReduceParts() {
long sortStartMilli = System.currentTimeMillis();
ProcResourceValues sortStartProcVals =
task.getCurrentProcResourceValues();
// sort
for (int i = 0; i < reducePartitions.length; i++) {
reducePartitions[i].groupOrSort();
}
long sortEndMilli = System.currentTimeMillis();
ProcResourceValues sortEndProcVals =
task.getCurrentProcResourceValues();
mapSpillSortCounter.incCountersPerSort(sortStartProcVals,
sortEndProcVals, sortEndMilli - sortStartMilli);
return sortEndProcVals;
}
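/**
 * Sorts every reduce partition and writes one spill file containing all
 * partitions, recording per-partition offsets in a SpillRecord that is either
 * cached in memory or written to a spill index file once the index cache
 * limit is reached. Sort and spill resource usage is charged to the
 * spill/sort counters.
 */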
@Override
public void sortAndSpill() throws IOException {
ProcResourceValues sortEndProcVals = sortReduceParts();
long sortEndMilli = System.currentTimeMillis();
// spill
FSDataOutputStream out = null;
long spillBytes = 0;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
task.mapOutputFile
.getSpillFileForWrite(getTaskID(), numSpills,
this.memoryBlockAllocator.getEstimatedSize());
out = rfs.create(filename);
for (int i = 0; i < partitions; ++i) {
IndexRecord rec =
reducePartitions[i].spill(job, out, keyClass, valClass,
codec, task.spilledRecordsCounter);
// record offsets
spillBytes += rec.partLength;
spillRec.putIndex(rec, i);
}
if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
// create spill index file
Path indexFilename =
task.mapOutputFile.getSpillIndexFileForWrite(getTaskID(),
numSpills, partitions
* MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRec.writeToFile(indexFilename, job);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory +=
spillRec.size() * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
LOG.info("Finished spill " + numSpills);
++numSpills;
} finally {
if (out != null)
out.close();
}
long spillEndMilli = System.currentTimeMillis();
ProcResourceValues spillEndProcVals =
task.getCurrentProcResourceValues();
mapSpillSortCounter.incCountersPerSpill(sortEndProcVals,
spillEndProcVals, spillEndMilli - sortEndMilli, spillBytes);
}
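/**
 * Spills a single oversized record directly to its own spill file, bypassing
 * the in-memory buffer. Empty IFile segments are still written for the other
 * partitions so the spill keeps the usual one-segment-per-partition layout.
 */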
public void spillSingleRecord(K key, V value, int part)
throws IOException {
ProcResourceValues spillStartProcVals =
task.getCurrentProcResourceValues();
long spillStartMilli = System.currentTimeMillis();
// spill
FSDataOutputStream out = null;
long spillBytes = 0;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
task.mapOutputFile.getSpillFileForWrite(getTaskID(),
numSpills, key.getLength() + value.getLength());
out = rfs.create(filename);
IndexRecord rec = new IndexRecord();
for (int i = 0; i < partitions; ++i) {
IFile.Writer<K, V> writer = null;
try {
long segmentStart = out.getPos();
// Create a new codec, don't care!
writer =
new IFile.Writer<K, V>(job, out, keyClass, valClass,
codec, task.spilledRecordsCounter);
if (i == part) {
final long recordStart = out.getPos();
writer.append(key, value);
// Note that our map byte count will not be accurate with
// compression
mapOutputByteCounter
.increment(out.getPos() - recordStart);
}
writer.close();
// record offsets
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength();
rec.partLength = writer.getCompressedLength();
spillBytes += writer.getCompressedLength();
spillRec.putIndex(rec, i);
writer = null;
} catch (IOException e) {
if (null != writer)
writer.close();
throw e;
}
}
if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
// create spill index file
Path indexFilename =
task.mapOutputFile.getSpillIndexFileForWrite(getTaskID(),
numSpills, partitions
* MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRec.writeToFile(indexFilename, job);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory +=
spillRec.size() * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
LOG.info("Finished spill big record " + numBigRecordsSpills);
++numBigRecordsSpills;
++numSpills;
} finally {
if (out != null)
out.close();
}
long spillEndMilli = System.currentTimeMillis();
ProcResourceValues spillEndProcVals =
task.getCurrentProcResourceValues();
mapSpillSortCounter.incCountersPerSpill(spillStartProcVals,
spillEndProcVals, spillEndMilli - spillStartMilli, spillBytes);
mapSpillSortCounter.incSpillSingleRecord();
}
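/**
 * Final flush called at the end of the map task: performs the last sort and
 * spill (kept in memory as Segments when "mapred.map.lastspill.memory" is set
 * and at least one spill already exists), then merges everything via
 * mergeParts() and records the merge cost in the spill/sort counters.
 */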
public synchronized void flush() throws IOException, ClassNotFoundException,
InterruptedException {
if (numSpills > 0 && lastSpillInMem) {
// if there is already at least one spill, we can try to hold this last
// spill in memory.
sortReduceParts();
for (int i = 0; i < partitions; i++) {
this.inMemorySegments[i] =
new Segment<K, V>(this.reducePartitions[i].getIReader(),
true);
}
hasInMemorySpill = true;
} else {
sortAndSpill();
}
long mergeStartMilli = System.currentTimeMillis();
ProcResourceValues mergeStartProcVals = task.getCurrentProcResourceValues();
mergeParts();
long mergeEndMilli = System.currentTimeMillis();
ProcResourceValues mergeEndProcVals = task.getCurrentProcResourceValues();
mapSpillSortCounter.incMergeCounters(mergeStartProcVals, mergeEndProcVals,
mergeEndMilli - mergeStartMilli);
}
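/**
 * Merges all on-disk spills and any in-memory segments into the single final
 * output file and its index. A lone on-disk spill is simply renamed; zero
 * spills produce empty IFile segments for every partition. Otherwise each
 * partition is merged with an "io.sort.factor"-way merge using a raw
 * comparator that skips the first WritableUtils.INT_LENGTH_BYTES of each
 * serialized key before comparing lexicographically.
 */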
private void mergeParts() throws IOException, InterruptedException,
ClassNotFoundException {
// get the approximate size of the final output/index files
long finalOutFileSize = 0;
long finalIndexFileSize = 0;
final Path[] filename = new Path[numSpills];
final TaskAttemptID mapId = getTaskID();
for (int i = 0; i < numSpills; i++) {
filename[i] = task.mapOutputFile.getSpillFile(mapId, i);
finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
}
for (Segment<K, V> segment : this.inMemorySegments) {
if (segment != null) {
finalOutFileSize += segment.getLength();
}
}
// the spill is the final output
if (numSpills == 1 && !hasInMemorySpill) {
Path outFile = new Path(filename[0].getParent(), "file.out");
rfs.rename(filename[0], outFile);
if (indexCacheList.size() == 0) {
rfs.rename(task.mapOutputFile.getSpillIndexFile(mapId, 0), new Path(
filename[0].getParent(), "file.out.index"));
} else {
indexCacheList.get(0).writeToFile(
new Path(filename[0].getParent(), "file.out.index"), job);
}
return;
}
// read in paged indices
for (int i = indexCacheList.size(); i < numSpills; ++i) {
Path indexFileName = task.mapOutputFile.getSpillIndexFile(mapId, i);
indexCacheList.add(new SpillRecord(indexFileName, job));
}
// make correction in the length to include the file header
// lengths for each partition
finalOutFileSize += partitions * MapTask.APPROX_HEADER_LENGTH;
finalIndexFileSize = partitions * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
Path finalOutputFile = task.mapOutputFile.getOutputFileForWrite(mapId,
finalOutFileSize);
Path finalIndexFile = task.mapOutputFile.getOutputIndexFileForWrite(mapId,
finalIndexFileSize);
// The output stream for the final single output file
FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
if (numSpills == 0) {
// create dummy files
IndexRecord rec = new IndexRecord();
SpillRecord sr = new SpillRecord(partitions);
try {
for (int i = 0; i < partitions; i++) {
long segmentStart = finalOut.getPos();
Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass,
valClass, codec, null);
writer.close();
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength();
rec.partLength = writer.getCompressedLength();
sr.putIndex(rec, i);
}
sr.writeToFile(finalIndexFile, job);
} finally {
finalOut.close();
}
return;
}
{
IndexRecord rec = new IndexRecord();
final SpillRecord spillRec = new SpillRecord(partitions);
for (int parts = 0; parts < partitions; parts++) {
// create the segments to be merged
List<Segment<K, V>> segmentList = new ArrayList<Segment<K, V>>(
numSpills + this.inMemorySegments.length);
for (int i = 0; i < numSpills; i++) {
IndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
Segment<K, V> s = new Segment<K, V>(job, rfs, filename[i],
indexRecord.startOffset, indexRecord.partLength, codec, true);
segmentList.add(i, s);
if (LOG.isDebugEnabled()) {
LOG.debug("MapId=" + mapId + " Reducer=" + parts + "Spill =" + i
+ "(" + indexRecord.startOffset + "," + indexRecord.rawLength
+ ", " + indexRecord.partLength + ")");
}
}
if (this.inMemorySegments[parts] != null) {
// add the in memory spill to the end of segmentList
segmentList.add(numSpills, this.inMemorySegments[parts]);
}
// merge
RawKeyValueIterator kvIter =
Merger.merge(job, rfs, keyClass, valClass, codec,
segmentList, job.getInt("io.sort.factor", 100),
new Path(mapId.toString()), new RawComparator<K>() {
@Override
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
return LexicographicalComparerHolder.BEST_COMPARER
.compareTo(
b1,
s1 + WritableUtils.INT_LENGTH_BYTES,
l1 - WritableUtils.INT_LENGTH_BYTES,
b2,
s2 + WritableUtils.INT_LENGTH_BYTES,
l2 - WritableUtils.INT_LENGTH_BYTES
);
}
@Override
public int compare(K o1, K o2) {
return LexicographicalComparerHolder.BEST_COMPARER
.compareTo( o1.getBytes(), 0, o1.getLength(),
o2.getBytes(), 0, o2.getLength());
}
}, reporter, null,
task.spilledRecordsCounter);
// write merged output to disk
long segmentStart = finalOut.getPos();
Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass,
valClass, codec, task.spilledRecordsCounter);
Merger.writeFile(kvIter, writer, reporter, job);
// close
writer.close();
// record offsets
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength();
rec.partLength = writer.getCompressedLength();
spillRec.putIndex(rec, parts);
}
spillRec.writeToFile(finalIndexFile, job);
finalOut.close();
for (int i = 0; i < numSpills; i++) {
rfs.delete(filename[i], true);
}
}
}
public void close() {
this.mapSpillSortCounter.finalCounterUpdate();
if (numBigRecordsSpills > numBigRecordsWarnThreshold) {
LOG.warn("Spilled a large number of big records: "
+ numBigRecordsSpills);
}
}
}