/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
/**
* An {@link org.apache.hadoop.mapred.InputFormat} for plain files with
* {@link Deserializer} records.
*/
@Deprecated
public class FlatFileInputFormat<T> extends
FileInputFormat<Void, FlatFileInputFormat.RowContainer<T>> {
/**
* A work-around until HADOOP-1230 is fixed.
*
* Allows boolean next(k, v) to fill in the value by reference while still
* letting the deserializer create a new object (i.e., row) on every call to
* next. A usage sketch follows the class.
*/
public static class RowContainer<T> {
T row;
}
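// Illustrative usage sketch: Hadoop's next(k, v) API fills its arguments by
// reference, so callers reuse one RowContainer while the deserializer is free
// to swap in a fresh row object on each call. The reader variable and the
// MyRecord row type below are hypothetical.
//
//   FlatFileInputFormat.RowContainer<MyRecord> value =
//       new FlatFileInputFormat.RowContainer<MyRecord>();
//   while (reader.next(null, value)) {
//     MyRecord row = value.row; // may reference a brand-new object after next()
//     // ... process row ...
//   }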
/**
* An implementation of SerializationContext is responsible for looking up
* the Serialization implementation for the given RecordReader, potentially
* based on the Configuration or some other mechanism.
*
* The SerializationFactory does not provide this functionality since:
* 1. it requires Serialization implementations to be specified in the
* Configuration a-priori (although this is the same as setting a
* SerializationContext), and
* 2. it does not look up the actual subclass being deserialized; e.g., for
* Serializable it has no way of configuring the actual Java class being
* serialized/deserialized.
*
* A hard-wired sample context is sketched after this interface.
*/
public static interface SerializationContext<S> extends Configurable {
/**
* Returns a {@link Serialization} object for objects of type S.
*
* @return a serialization object for this context
*/
Serialization<S> getSerialization() throws IOException;
/**
* Produces the specific class to deserialize.
*/
Class<? extends S> getRealClass() throws IOException;
}
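// A minimal hard-wired SerializationContext sketch (an illustrative
// assumption): it ignores the Configuration entirely and always pairs
// Hadoop's stock WritableSerialization with Text rows.
//
//   public static class TextSerializationContext
//       implements SerializationContext<org.apache.hadoop.io.Writable> {
//     private Configuration conf;
//     public void setConf(Configuration conf) { this.conf = conf; }
//     public Configuration getConf() { return conf; }
//     public Serialization<org.apache.hadoop.io.Writable> getSerialization() {
//       return new org.apache.hadoop.io.serializer.WritableSerialization();
//     }
//     public Class<? extends org.apache.hadoop.io.Writable> getRealClass() {
//       return org.apache.hadoop.io.Text.class;
//     }
//   }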
/**
* The JobConf key for the Serialization implementation.
*/
public static final String SerializationImplKey = "mapred.input.serialization.implKey";
/**
* An implementation of {@link SerializationContext} that reads the
* Serialization class and specific subclass to be deserialized from the
* JobConf.
*
* A sample JobConf wiring is sketched after this class.
*/
public static class SerializationContextFromConf<S> implements
FlatFileInputFormat.SerializationContext<S> {
/**
* The JobConf key for the class that is being deserialized.
*/
public static final String SerializationSubclassKey = "mapred.input.serialization.subclassKey";
/**
* Implements {@link Configurable} so it can use the configuration to find
* the right classes. Note: {@link ReflectionUtils} will automatically call
* setConf with the right configuration.
*/
private Configuration conf;
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public Configuration getConf() {
return conf;
}
/**
* @return the actual class being deserialized.
* @throws IOException never actually thrown by this implementation
*/
@Override
public Class<S> getRealClass() throws IOException {
return (Class<S>) conf.getClass(SerializationSubclassKey, null,
Object.class);
}
/**
* Looks up and instantiates the Serialization Object
*
* Important to note here that we are not relying on the Hadoop
* SerializationFactory part of the Serialization framework. This is because
* in the case of Non-Writable Objects, we cannot make any assumptions about
* the uniformity of the serialization class APIs - i.e., there may not be a
* "write" method call and a subclass may need to implement its own
* Serialization classes. The SerializationFactory currently returns the
* first (de)serializer that is compatible with the class to be
* deserialized; in this context, that assumption isn't necessarily true.
*
* @return the serialization object for this context
* @throws IOException never actually thrown by this implementation
*/
@Override
public Serialization<S> getSerialization() throws IOException {
Class<Serialization<S>> tClass = (Class<Serialization<S>>) conf.getClass(
SerializationImplKey, null, Serialization.class);
return tClass == null ? null : (Serialization<S>) ReflectionUtils
.newInstance(tClass, conf);
}
}
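// A hedged JobConf wiring sketch for the default SerializationContextFromConf;
// the WritableSerialization/Text pairing is an illustrative assumption, the
// two keys are the ones defined in this file.
//
//   JobConf job = new JobConf();
//   job.setClass(FlatFileInputFormat.SerializationImplKey,
//       org.apache.hadoop.io.serializer.WritableSerialization.class,
//       org.apache.hadoop.io.serializer.Serialization.class);
//   job.setClass(
//       FlatFileInputFormat.SerializationContextFromConf.SerializationSubclassKey,
//       org.apache.hadoop.io.Text.class, org.apache.hadoop.io.Writable.class);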
/**
* A {@link RecordReader} for plain files with {@link Deserializer} records.
*
* Reads one row at a time of type R. R is intended to be a base class of
* something such as: Record, Writable, Text, ...
*
* A sketch of a conforming Deserializer follows this class.
*/
@Deprecated
public class FlatFileRecordReader<R> implements
RecordReader<Void, FlatFileInputFormat.RowContainer<R>> {
/**
* The stream in use: fsin if the input is not compressed, otherwise dcin.
*/
private final DataInputStream in;
/**
* The decompressing stream, or null if the input is not compressed.
*/
private final InputStream dcin;
/**
* The underlying stream.
*/
private final FSDataInputStream fsin;
/**
* For calculating progress.
*/
private final long end;
/**
* The constructed deserializer.
*/
private final Deserializer<R> deserializer;
/**
* Once EOF is reached, stop calling the deserializer.
*/
private boolean isEOF;
/**
* The JobConf which contains information needed to instantiate the correct
* Deserializer.
*/
private final Configuration conf;
/**
* The actual class of the rows we are deserializing, not just the base
* class.
*/
private final Class<R> realRowClass;
/**
* Constructs the underlying stream (potentially decompressed) and creates
* the deserializer.
*
* @param conf
* the jobconf
* @param split
* the split for this file
*/
public FlatFileRecordReader(Configuration conf, FileSplit split) throws IOException {
final Path path = split.getPath();
FileSystem fileSys = path.getFileSystem(conf);
CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(
conf);
final CompressionCodec codec = compressionCodecs.getCodec(path);
this.conf = conf;
fsin = fileSys.open(path);
if (codec != null) {
dcin = codec.createInputStream(fsin);
in = new DataInputStream(dcin);
} else {
dcin = null;
in = fsin;
}
isEOF = false;
end = split.getLength();
// Instantiate a SerializationContext which this will use to lookup the
// Serialization class and the actual class being deserialized
SerializationContext<R> sinfo;
Class<? extends SerializationContext<R>> sinfoClass =
(Class<? extends SerializationContext<R>>) conf.getClass(
SerializationContextImplKey, SerializationContextFromConf.class);
sinfo = ReflectionUtils.newInstance(sinfoClass, conf);
// Get the Serialization object and the class being deserialized
Serialization<R> serialization = sinfo.getSerialization();
realRowClass = (Class<R>) sinfo.getRealClass();
deserializer = serialization.getDeserializer(realRowClass);
deserializer.open(in);
}
/**
* The JobConf key of the SerializationContext to use.
*/
public static final String SerializationContextImplKey =
"mapred.input.serialization.context_impl";
/**
* @return null
*/
@Override
public Void createKey() {
return null;
}
/**
* @return a new R instance.
*/
@Override
public RowContainer<R> createValue() {
RowContainer<R> r = new RowContainer<R>();
r.row = ReflectionUtils.newInstance(realRowClass, conf);
return r;
}
/**
* Reads the next row into the given value container.
*
* @param key
* - void as these files have a value only
* @param value
* - the row container which is always re-used, but the internal
* value may be set to a new Object
* @return whether the key and value were read. True if they were and false
* if EOF
* @exception IOException
* from the deserializer
*/
@Override
public synchronized boolean next(Void key, RowContainer<R> value) throws IOException {
if (isEOF || in.available() == 0) {
isEOF = true;
return false;
}
// the deserializer is responsible for actually reading each record from
// the stream
try {
value.row = deserializer.deserialize(value.row);
if (value.row == null) {
isEOF = true;
return false;
}
return true;
} catch (EOFException e) {
isEOF = true;
return false;
}
}
@Override
public synchronized float getProgress() throws IOException {
// this assumes no splitting
if (end == 0) {
return 0.0f;
} else {
// gives progress over uncompressed stream
// assumes deserializer is not buffering itself
return Math.min(1.0f, fsin.getPos() / (float) (end));
}
}
@Override
public synchronized long getPos() throws IOException {
// assumes deserializer is not buffering itself
// position over uncompressed stream. not sure what
// effect this has on stats about job
return fsin.getPos();
}
@Override
public synchronized void close() throws IOException {
// assuming that this closes the underlying streams
deserializer.close();
}
}
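// For reference, a minimal sketch of the org.apache.hadoop.io.serializer
// Deserializer contract this reader drives: open(...) receives the (possibly
// decompressed) stream, deserialize(...) may reuse or replace the passed-in
// row, and an EOFException signals end of input to next(...). The
// LineDeserializer class below is a hypothetical example, not part of Hive.
//
//   public class LineDeserializer
//       implements Deserializer<org.apache.hadoop.io.Text> {
//     private java.io.BufferedReader reader;
//     public void open(InputStream in) throws IOException {
//       reader = new java.io.BufferedReader(new java.io.InputStreamReader(in));
//     }
//     public org.apache.hadoop.io.Text deserialize(org.apache.hadoop.io.Text t)
//         throws IOException {
//       String line = reader.readLine();
//       if (line == null) {
//         throw new EOFException(); // caught by next(), which then returns false
//       }
//       if (t == null) {
//         t = new org.apache.hadoop.io.Text();
//       }
//       t.set(line);
//       return t;
//     }
//     public void close() throws IOException {
//       reader.close();
//     }
//   }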
@Override
protected boolean isSplitable(FileSystem fs, Path filename) {
return false;
}
@Override
public RecordReader<Void, FlatFileInputFormat.RowContainer<T>> getRecordReader(
InputSplit split, JobConf job, Reporter reporter) throws IOException {
reporter.setStatus(split.toString());
return new FlatFileRecordReader<T>(job, (FileSplit) split);
}
}
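// End-to-end driver sketch (hedged: input path, context class, and the job's
// mapper/output settings are hypothetical and omitted). A custom context can
// replace the default SerializationContextFromConf via the
// SerializationContextImplKey defined on FlatFileRecordReader:
//
//   JobConf job = new JobConf();
//   job.setInputFormat(FlatFileInputFormat.class);
//   FileInputFormat.addInputPath(job, new Path("/example/input"));
//   job.setClass(
//       FlatFileInputFormat.FlatFileRecordReader.SerializationContextImplKey,
//       TextSerializationContext.class, // see the sketch after the interface
//       FlatFileInputFormat.SerializationContext.class);
//   // ... set mapper, output format, etc., then:
//   org.apache.hadoop.mapred.JobClient.runJob(job);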