com.hadoop.mapreduce.LzoTextInputFormat (hadoop-gpl-compression)
This project is a set of plugins for Apache Hadoop that provide
access to the GPL'ed compression codecs.
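As a quick orientation, here is how LzoTextInputFormat might be wired into a
job driver. This is a minimal sketch, assuming the Hadoop 0.20-era
org.apache.hadoop.mapreduce API; the job name and input path are hypothetical,
and mapper/reducer setup is elided.

    Configuration conf = new Configuration();
    Job job = new Job(conf, "lzo-text-job");            // hypothetical job name
    // use this input format so splits align to lzop block boundaries
    job.setInputFormatClass(LzoTextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("/data/input"));  // hypothetical path
    // configure mapper, reducer and output as usual, then submit:
    // job.waitForCompletion(true);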
/*
* This file is part of Hadoop-Gpl-Compression.
*
* Hadoop-Gpl-Compression is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Hadoop-Gpl-Compression is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Hadoop-Gpl-Compression. If not, see
* <http://www.gnu.org/licenses/>.
*/
package com.hadoop.mapreduce;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.hadoop.compression.lzo.LzopCodec;
import com.hadoop.compression.lzo.LzopCodec.LzopDecompressor;
/**
* An {@link InputFormat} for lzop-compressed text files. Files are broken into
* lines; either linefeed or carriage-return is used to signal end of line.
* Keys are the position in the file, and values are the line of text.
*/
public class LzoTextInputFormat extends FileInputFormat<LongWritable, Text> {
public static final String LZO_INDEX_SUFFIX = ".index";
private Map<Path, LzoIndex> indexes = new HashMap<Path, LzoIndex>();
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
List<FileStatus> files = super.listStatus(job);
FileSystem fs = FileSystem.get(job.getConfiguration());
String fileExtension = new LzopCodec().getDefaultExtension();
for (Iterator<FileStatus> iterator = files.iterator(); iterator.hasNext();) {
FileStatus fileStatus = iterator.next();
Path file = fileStatus.getPath();
if (!file.toString().endsWith(fileExtension)) {
// get rid of non-lzo files
iterator.remove();
} else {
// read the index file
LzoIndex index = readIndex(file, fs);
indexes.put(file, index);
}
}
return files;
}
@Override
protected boolean isSplitable(JobContext context, Path filename) {
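// by the time the framework calls this, listStatus() has populated the
// index map, so every remaining lzo input file has an entry (possibly an
// empty LzoIndex, which makes the file unsplittable)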
LzoIndex index = indexes.get(filename);
return !index.isEmpty();
}
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
List<InputSplit> splits = super.getSplits(job);
// find new starts/ends of the filesplits so that they
// align with the lzo blocks
List<InputSplit> result = new ArrayList<InputSplit>();
FileSystem fs = FileSystem.get(job.getConfiguration());
for (InputSplit genericSplit : splits) {
// load the index
FileSplit fileSplit = (FileSplit) genericSplit;
Path file = fileSplit.getPath();
LzoIndex index = indexes.get(file);
if (index == null) {
throw new IOException("Index not found for " + file);
}
if (index.isEmpty()) {
// empty index, keep as is
result.add(fileSplit);
continue;
}
long start = fileSplit.getStart();
long end = start + fileSplit.getLength();
if (start != 0) {
// find the next block position from
// the start of the split
long newStart = index.findNextPosition(start);
if (newStart == -1 || newStart >= end) {
// just skip this since it will be handled by another split
continue;
}
start = newStart;
}
long newEnd = index.findNextPosition(end);
if (newEnd != -1) {
end = newEnd;
} else {
//didn't find the next position
//we have hit the end of the file
end = fs.getFileStatus(file).getLen();
}
result.add(new FileSplit(file, start, end - start, fileSplit
.getLocations()));
}
return result;
}
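// A worked example of the realignment above, with hypothetical offsets: if
// the index records compressed-block starts at {0, 700, 1400} and the raw
// FileSplit covers [500, 1000), the split is moved to start at 700 (the next
// block start at or after 500) and to end at 1400, so every mapper begins
// decompressing exactly at an lzop block boundary.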
/**
* Read the index of the lzo file.
*
* @param file
* Read the index of this file.
* @param fs
* The index file is on this file system.
* @throws IOException
*/
private LzoIndex readIndex(Path file, FileSystem fs) throws IOException {
FSDataInputStream indexIn = null;
try {
Path indexFile = new Path(file.toString() + LZO_INDEX_SUFFIX);
if (!fs.exists(indexFile)) {
// return empty index, fall back to the unsplittable mode
return new LzoIndex();
}
long indexLen = fs.getFileStatus(indexFile).getLen();
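// the index file is just a sequence of 8-byte big-endian longs (as written
// by DataOutputStream.writeLong), one per compressed block, each holding
// the block's byte offset in the lzo file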
int blocks = (int) (indexLen / 8);
LzoIndex index = new LzoIndex(blocks);
indexIn = fs.open(indexFile);
for (int i = 0; i < blocks; i++) {
index.set(i, indexIn.readLong());
}
return index;
} finally {
if (indexIn != null) {
indexIn.close();
}
}
}
/**
* Index an lzo file to allow the input format to split them into separate map
* jobs.
*
* @param fs
* File system that contains the file.
* @param lzoFile
* the lzo file to index.
* @throws IOException
*/
public static void createIndex(FileSystem fs, Path lzoFile)
throws IOException {
Configuration conf = fs.getConf();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(lzoFile);
((Configurable) codec).setConf(conf);
InputStream lzoIs = null;
FSDataOutputStream os = null;
Path outputFile = new Path(lzoFile.toString()
+ LzoTextInputFormat.LZO_INDEX_SUFFIX);
Path tmpOutputFile = outputFile.suffix(".tmp");
try {
FSDataInputStream is = fs.open(lzoFile);
os = fs.create(tmpOutputFile);
LzopDecompressor decompressor = (LzopDecompressor) codec
.createDecompressor();
// for reading the header
lzoIs = codec.createInputStream(is, decompressor);
int numChecksums = decompressor.getChecksumsCount();
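// each compressed block in an lzop stream is followed by numChecksums
// 4-byte checksum fields; how many depends on the flags read from the header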
while (true) {
// uncompressed block size; a zero here marks the end of the lzop stream
int uncompressedBlockSize = is.readInt();
if (uncompressedBlockSize == 0) {
break;
} else if (uncompressedBlockSize < 0) {
throw new EOFException();
}
int compressedBlockSize = is.readInt();
if (compressedBlockSize <= 0) {
throw new IOException("Could not read compressed block size");
}
long pos = is.getPos();
// write the pos of the block start
os.writeLong(pos - 8);
// seek to the start of the next block, skip any checksums
is.seek(pos + compressedBlockSize + (4 * numChecksums));
}
} finally {
if (lzoIs != null) {
lzoIs.close();
}
if (os != null) {
os.close();
}
}
fs.rename(tmpOutputFile, outputFile);
}
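// Example invocation, with a hypothetical path and a Configuration named
// conf:
// LzoTextInputFormat.createIndex(FileSystem.get(conf), new Path("/data/big.lzo"));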
/**
* Represents the lzo index.
*/
static class LzoIndex {
private long[] blockPositions;
LzoIndex() {
}
LzoIndex(int blocks) {
blockPositions = new long[blocks];
}
/**
* Set the position for the block.
*
* @param blockNumber
* Block to set pos for.
* @param pos
* Position.
*/
public void set(int blockNumber, long pos) {
blockPositions[blockNumber] = pos;
}
/**
* Find the next lzo block start from the given position.
*
* @param pos
* The position to start looking from.
* @return Either the start position of the block or -1 if it couldn't be
* found.
*/
public long findNextPosition(long pos) {
int block = Arrays.binarySearch(blockPositions, pos);
if (block >= 0) {
// direct hit on a block start position
return blockPositions[block];
} else {
block = Math.abs(block) - 1;
if (block > blockPositions.length - 1) {
return -1;
}
return blockPositions[block];
}
}
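// Worked example with hypothetical offsets: for blockPositions = {0, 700, 1400},
// findNextPosition(300) == 700, findNextPosition(700) == 700, and
// findNextPosition(1500) == -1 (no block starts at or after 1500).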
public boolean isEmpty() {
return blockPositions == null || blockPositions.length == 0;
}
}
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
TaskAttemptContext taskAttempt) throws IOException, InterruptedException {
return new LzoLineRecordReader();
}
}