// ml.shifu.guagua.hadoop.io.GuaguaLineRecordReader (source listing header: Maven / Gradle / Ivy)
/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.hadoop.io;
import java.io.IOException;
import ml.shifu.guagua.GuaguaConstants;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.io.GuaguaRecordReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.util.LineReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Copy from LineRecordReader to avoid using Mapreduce-related interfaces.
*/
public class GuaguaLineRecordReader implements
GuaguaRecordReader, GuaguaWritableAdapter> {
private static final Logger LOG = LoggerFactory.getLogger(GuaguaLineRecordReader.class);
private CompressionCodecFactory compressionCodecs = null;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private GuaguaWritableAdapter key = null;
private GuaguaWritableAdapter value = null;
private byte[] recordDelimiterBytes;
private Configuration conf;
public GuaguaLineRecordReader() {
this.conf = new Configuration();
}
public GuaguaLineRecordReader(byte[] recordDelimiter) {
this.conf = new Configuration();
this.recordDelimiterBytes = recordDelimiter;
}
public GuaguaLineRecordReader(GuaguaFileSplit split) throws IOException {
this(new Configuration(), split);
}
public GuaguaLineRecordReader(Configuration conf, GuaguaFileSplit split) throws IOException {
this.conf = conf;
initialize(split);
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.mapreduce.RecordReader#initialize(ml.shifu.guagua.io.GuaguaFileSplit)
*/
@Override
public void initialize(GuaguaFileSplit genericSplit) throws IOException {
this.maxLineLength = Integer.MAX_VALUE;
start = genericSplit.getOffset();
end = start + genericSplit.getLength();
final Path file = new Path(genericSplit.getPath());
compressionCodecs = new CompressionCodecFactory(this.conf);
final CompressionCodec codec = compressionCodecs.getCodec(file);
// open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(this.conf);
FSDataInputStream fileIn = fs.open(file);
boolean skipFirstLine = false;
if(codec != null) {
if(null == this.recordDelimiterBytes) {
in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
} else {
in = new LineReader(codec.createInputStream(fileIn), GuaguaConstants.DEFAULT_IO_BUFFER_SIZE,
this.recordDelimiterBytes);
}
end = Long.MAX_VALUE;
} else {
if(start != 0) {
skipFirstLine = true;
--start;
fileIn.seek(start);
}
if(null == this.recordDelimiterBytes) {
in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE);
} else {
in = new LineReader(fileIn, GuaguaConstants.DEFAULT_IO_BUFFER_SIZE, this.recordDelimiterBytes);
}
}
if(skipFirstLine) { // skip first line and re-establish "start".
start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
}
this.pos = start;
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.mapreduce.RecordReader#nextKeyValue()
*/
@Override
public boolean nextKeyValue() throws IOException {
if(key == null) {
key = new GuaguaWritableAdapter(new LongWritable());
}
key.getWritable().set(pos);
if(value == null) {
value = new GuaguaWritableAdapter(new Text());
}
int newSize = 0;
while(pos < end) {
newSize = in.readLine(value.getWritable(), maxLineLength,
Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
if(newSize == 0) {
break;
}
pos += newSize;
if(newSize < maxLineLength) {
break;
}
// line too long. try again
LOG.info("Skipped line of size {} at pos {}", newSize, (pos - newSize));
}
if(newSize == 0) {
key = null;
value = null;
return false;
} else {
return true;
}
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.mapreduce.RecordReader#getCurrentKey()
*/
@Override
public GuaguaWritableAdapter getCurrentKey() {
return key;
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.mapreduce.RecordReader#getCurrentValue()
*/
@Override
public GuaguaWritableAdapter getCurrentValue() {
return value;
}
/**
* Get the progress within the split
*/
public float getProgress() {
if(start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float) (end - start));
}
}
/*
* (non-Javadoc)
*
* @see ml.shifu.guagua.mapreduce.RecordReader#close()
*/
@Override
public synchronized void close() throws IOException {
if(in != null) {
in.close();
}
}
}