// org.apache.hadoop.hive.ql.io.orc.OrcFileStripeMergeRecordReader
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
public class OrcFileStripeMergeRecordReader implements
RecordReader {
private final Reader reader;
private final Path path;
protected Iterator iter;
protected List stripeStatistics;
private int stripeIdx;
private long start;
private long end;
private boolean skipFile;
public OrcFileStripeMergeRecordReader(Configuration conf, FileSplit split) throws IOException {
path = split.getPath();
start = split.getStart();
end = start + split.getLength();
FileSystem fs = path.getFileSystem(conf);
this.reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).filesystem(fs));
this.iter = reader.getStripes().iterator();
this.stripeIdx = 0;
this.stripeStatistics = ((ReaderImpl) reader).getOrcProtoStripeStatistics();
}
public Class getKeyClass() {
return OrcFileKeyWrapper.class;
}
public Class getValueClass() {
return OrcFileValueWrapper.class;
}
public OrcFileKeyWrapper createKey() {
return new OrcFileKeyWrapper();
}
public OrcFileValueWrapper createValue() {
return new OrcFileValueWrapper();
}
@Override
public boolean next(OrcFileKeyWrapper key, OrcFileValueWrapper value) throws IOException {
if (skipFile) {
return false;
}
return nextStripe(key, value);
}
protected boolean nextStripe(OrcFileKeyWrapper keyWrapper, OrcFileValueWrapper valueWrapper)
throws IOException {
// missing stripe stats (old format). If numRows is 0 then its an empty file and no statistics
// is present. We have to differentiate no stats (empty file) vs missing stats (old format).
if ((stripeStatistics == null || stripeStatistics.isEmpty()) && reader.getNumberOfRows() > 0) {
keyWrapper.setInputPath(path);
keyWrapper.setIsIncompatFile(true);
skipFile = true;
return true;
}
while (iter.hasNext()) {
StripeInformation si = iter.next();
// if stripe offset is outside the split boundary then ignore the current
// stripe as it will be handled by some other mapper.
if (si.getOffset() >= start && si.getOffset() < end) {
valueWrapper.setStripeStatistics(stripeStatistics.get(stripeIdx++));
valueWrapper.setStripeInformation(si);
if (!iter.hasNext()) {
valueWrapper.setLastStripeInFile(true);
valueWrapper.setUserMetadata(((ReaderImpl) reader).getOrcProtoUserMetadata());
}
keyWrapper.setInputPath(path);
keyWrapper.setCompression(reader.getCompression());
keyWrapper.setCompressBufferSize(reader.getCompressionSize());
keyWrapper.setVersion(reader.getFileVersion());
keyWrapper.setRowIndexStride(reader.getRowIndexStride());
keyWrapper.setTypes(reader.getTypes());
} else {
stripeIdx++;
continue;
}
return true;
}
return false;
}
/**
* Default progress will be based on number of files processed.
* @return 0.0 to 1.0 of the input byte range
*/
public float getProgress() throws IOException {
return 0.0f;
}
public long getPos() throws IOException {
return 0;
}
protected void seek(long pos) throws IOException {
}
public long getStart() {
return 0;
}
public void close() throws IOException {
}
}