org.apache.pig.bzip2r.Bzip2TextInputFormat Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.bzip2r;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat;
import org.apache.tools.bzip2r.CBZip2InputStream;
/**
 * Input format whose record reader decompresses its input through
 * {@link CBZip2InputStream} and returns it line by line.
 */
@SuppressWarnings("unchecked")
public class Bzip2TextInputFormat extends PigFileInputFormat {

    /**
     * Treats keys as offset in file and value as line. Since the input file is
     * compressed, the offset for a particular line is not well-defined. This
     * implementation returns the starting position of a compressed block as the
     * key for every line in that block.
     */
    private static class BZip2LineRecordReader
            extends RecordReader<LongWritable, Text> {

        /** Position at which this reader starts producing lines. */
        private long start;

        /** Split start plus split length. */
        private long end;

        /** Position reported by the stream after the most recent line. */
        private long pos;

        /** Decompressing stream the lines are read from. */
        private CBZip2InputStream in;

        /** Collects the bytes of the line currently being read. */
        private final ByteArrayOutputStream buffer =
                new ByteArrayOutputStream(256);

        // Set when a Carriage Return ('\r') was read and the read that followed
        // it did not yield a Line Feed ('\n'). The result of that extra read is
        // kept in readAheadValue and consumed by the next readLine() call.
        private boolean readAheadPending = false;

        // Result of the extra read described above: a byte value in 0..255, or
        // -1 if the stream ended right after the Carriage Return. It is kept as
        // an int on purpose: narrowing it to a byte would make a real 0xFF byte
        // indistinguishable from the end-of-stream marker.
        private int readAheadValue;

        /**
         * Provide a bridge to get the bytes from the ByteArrayOutputStream
         * without creating a new byte array.
         */
        private static class TextStuffer extends OutputStream {

            /** Text instance that receives the bytes written to this stream. */
            public Text target;

            /** Single-byte writes are not supported by this bridge. */
            @Override
            public void write(int b) {
                throw new UnsupportedOperationException("write(byte) not supported");
            }

            /** Replaces the content of {@link #target} with the given range. */
            @Override
            public void write(byte[] data, int offset, int len) throws IOException {
                target.clear();
                target.set(data, offset, len);
            }
        }

        private final TextStuffer bridge = new TextStuffer();
        private final LongWritable key = new LongWritable();
        private final Text value = new Text();

        /**
         * Opens the file of the given split, seeks to the split start and, for
         * any split that does not begin at offset 0, discards the first line.
         *
         * @param job configuration used to resolve the file system
         * @param split the split to read
         * @throws IOException if the file cannot be opened, positioned or read
         */
        public BZip2LineRecordReader(Configuration job, FileSplit split)
                throws IOException {
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();

            // open the file and seek to the start of the split
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(file);
            boolean ready = false;
            try {
                fileIn.seek(start);
                // NOTE(review): 9 is presumably the bzip2 block size and "end"
                // the position at which the stream stops - confirm against
                // CBZip2InputStream.
                in = new CBZip2InputStream(fileIn, 9, end);
                if (start != 0) {
                    // skip first line and re-establish "start".
                    readLine(in, null);
                    start = in.getPos();
                }
                pos = in.getPos();
                ready = true;
            } finally {
                if (!ready) {
                    // Construction failed: release the file handle, but let the
                    // original exception propagate.
                    try {
                        fileIn.close();
                    } catch (IOException ignored) {
                        // the failure that brought us here is the one to report
                    }
                }
            }
        }

        /**
         * Returns a new, empty key object.
         * NOTE(review): not an override; presumably kept from the older
         * record reader API - confirm before removing.
         */
        public LongWritable createKey() {
            return new LongWritable();
        }

        /**
         * Returns a new, empty value object.
         * NOTE(review): not an override; presumably kept from the older
         * record reader API - confirm before removing.
         */
        public Text createValue() {
            return new Text();
        }

        /*
         * LineRecordReader.readLine() is deprecated in Hadoop 0.17. So it is
         * added here locally.
         */
        /**
         * Reads one line, terminated by "\n", "\r\n" or a lone "\r", and writes
         * its content (without the terminator) to {@code out}.
         *
         * @param in stream to read from
         * @param out receives the line content; may be null to discard the line
         * @return the number of bytes consumed for this line, terminator
         *         included; 0 if the stream was already exhausted
         * @throws IOException if reading or writing fails
         */
        private long readLine(InputStream in, OutputStream out)
                throws IOException {
            long bytes = 0;
            while (true) {
                final int b;
                if (readAheadPending) {
                    // A previous call read one value past a lone Carriage
                    // Return; use it now instead of reading the stream again.
                    b = readAheadValue;
                    readAheadPending = false;
                } else {
                    b = in.read();
                }
                if (b == -1) {
                    break;
                }
                bytes++;
                if (b == '\n') {
                    break;
                }
                if (b == '\r') {
                    int following = in.read();
                    if (following == '\n') {
                        bytes++;
                    } else {
                        // Either a byte that belongs to the next line or the
                        // end-of-stream marker; keep it for the next call.
                        readAheadPending = true;
                        readAheadValue = following;
                    }
                    break;
                }
                if (out != null) {
                    out.write(b);
                }
            }
            return bytes;
        }

        /**
         * Read a line.
         *
         * @param key set to the position recorded before the line is read
         * @param value set to the content of the line
         * @return false if the position is past the end of the split or no more
         *         bytes could be read, true otherwise
         * @throws IOException if reading fails
         */
        public boolean next(LongWritable key, Text value) throws IOException {
            if (pos > end) {
                return false;
            }
            key.set(pos); // key is position
            buffer.reset();
            long bytesRead = readLine(in, buffer);
            if (bytesRead == 0) {
                return false;
            }
            pos = in.getPos();
            // If a byte of the next line was read ahead after a lone carriage
            // return, give it back. At end of stream nothing was read ahead,
            // so there is nothing to give back.
            if (readAheadPending && readAheadValue != -1) {
                pos--;
            }
            bridge.target = value;
            buffer.writeTo(bridge);
            return true;
        }

        /**
         * Get the progress within the split
         *
         * @return 0 for an empty range, otherwise the fraction of the range
         *         between start and end covered by the current position,
         *         capped at 1
         */
        @Override
        public float getProgress() {
            if (start == end) {
                return 0.0f;
            }
            return Math.min(1.0f, (pos - start) / (float) (end - start));
        }

        /** Returns the position recorded after the most recent line. */
        public long getPos() throws IOException {
            return pos;
        }

        /** Closes the decompressing stream. */
        @Override
        public void close() throws IOException {
            in.close();
        }

        /** Returns the key object that {@link #nextKeyValue()} fills in. */
        @Override
        public LongWritable getCurrentKey() throws IOException,
                InterruptedException {
            return key;
        }

        /** Returns the value object that {@link #nextKeyValue()} fills in. */
        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        /** Does nothing; all setup is done by the constructor. */
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // no op
        }

        /** Reads the next line into the reader's own key and value objects. */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return next(key, value);
        }
    }

    /**
     * Creates a line reader for the given split.
     *
     * @param split the split to read; must be a {@link FileSplit}
     * @param context supplies the configuration used to open the file
     * @return a reader whose keys are positions and whose values are lines
     */
    @Override
    public RecordReader createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new BZip2LineRecordReader(context.getConfiguration(),
                (FileSplit) split);
    }
}