
// org.deeplearning4j.hadoop.datasetiterator.BaseHdfsDataSetIterator Maven / Gradle / Ivy
// The newest version!
package org.deeplearning4j.hadoop.datasetiterator;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.deeplearning4j.datasets.iterator.DataSetIterator;
/**
 * Baseline support for a dataset iterator that iterates over data stored in HDFS.
 *
 * <p>This class only supplies file-level helpers (listing the files under the root
 * directory, opening a file, reading a file into a string). The iteration methods
 * declared by {@link DataSetIterator} are left to concrete subclasses.
 *
 * @author Adam Gibson
 */
public abstract class BaseHdfsDataSetIterator implements DataSetIterator {

    private static final long serialVersionUID = 2299460082862304030L;

    /** Root directory (typically a full {@code hdfs://} URI) that {@link #filesInDir()} lists. */
    private String hdfsUriRootDir;

    /**
     * Hadoop configuration used to resolve file systems. Created lazily by
     * {@link #configuration()}. Marked {@code transient} because Hadoop's
     * {@code Configuration} does not implement {@code java.io.Serializable};
     * NOTE(review): this assumes DataSetIterator is Serializable (suggested by the
     * serialVersionUID above) - confirm.
     */
    private transient Configuration conf;

    /**
     * Constructs a data set iterator with the hdfs uri root directory.
     * This assumes that individual files are what are being read from wrt
     * helper methods present.
     *
     * hasNext() and associated data specific methods can be configured
     * for the specific implementation.
     *
     * @param hdfsUriRootDir the root directory whose files this iterator works on
     */
    public BaseHdfsDataSetIterator(String hdfsUriRootDir) {
        this.hdfsUriRootDir = hdfsUriRootDir;
    }

    /**
     * Reads a file from hdfs in to a string.
     * Note that the whole file is buffered in memory, so this is not suitable
     * for large files.
     *
     * @param path the path to read from
     * @return the contents of the file, decoded as UTF-8
     * @throws Exception if the file cannot be opened or read
     */
    public String readStringFromPath(String path) throws Exception {
        return readStringFromPath(new Path(path));
    }

    /**
     * Reads a file from hdfs in to a string.
     * Note that the whole file is buffered in memory, so this is not suitable
     * for large files.
     *
     * @param path the path to read from
     * @return the contents of the file, decoded as UTF-8
     * @throws Exception if the file cannot be opened or read
     */
    public String readStringFromPath(Path path) throws Exception {
        InputStream is = openInputStream(path);
        try {
            StringWriter writer = new StringWriter();
            IOUtils.copy(is, writer, "UTF-8");
            return writer.toString();
        } finally {
            // Close even when the copy fails so the stream is never leaked.
            is.close();
        }
    }

    /**
     * Opens an input stream for the given path.
     *
     * @param path the path to open an input stream for
     * @return the opened input stream; the caller is responsible for closing it
     * @throws Exception if the path does not exist, is a directory, or cannot be opened
     */
    public InputStream openInputStream(String path) throws Exception {
        return openInputStream(new Path(path));
    }

    /**
     * Opens an input stream for the given path.
     *
     * <p>The file system itself is deliberately not closed here: the instance is
     * obtained from Hadoop's shared cache, and the returned stream depends on it.
     *
     * @param path the path to open
     * @return the input stream for the path; the caller is responsible for closing it
     * @throws FileNotFoundException if the path does not exist
     * @throws IllegalArgumentException if the path is a directory
     * @throws Exception if any other error occurs while opening the file
     */
    public InputStream openInputStream(Path path) throws Exception {
        FileSystem fs = fileSystemFor(path);
        if (!fs.exists(path)) {
            throw new FileNotFoundException("File does not exist");
        }
        if (fs.isDirectory(path)) {
            throw new IllegalArgumentException("Not a file");
        }
        return fs.open(path);
    }

    /**
     * Lists all of the files under the hdfsUriRootDir directory, descending
     * recursively into sub-directories.
     *
     * @return the list of file paths found under the directory
     * @throws Exception if the directory cannot be listed
     */
    public List<Path> filesInDir() throws Exception {
        Path root = new Path(hdfsUriRootDir);
        FileSystem fs = fileSystemFor(root);
        List<Path> paths = new ArrayList<Path>();
        RemoteIterator<LocatedFileStatus> iter = fs.listFiles(root, true);
        while (iter.hasNext()) {
            paths.add(iter.next().getPath());
        }
        // Do NOT close fs: it is the shared, cached instance, and closing it would
        // break every later call (and any other code) using the same file system.
        return paths;
    }

    /** Returns the Hadoop configuration, creating a default one on first use. */
    private Configuration configuration() {
        if (conf == null) {
            conf = new Configuration();
        }
        return conf;
    }

    /**
     * Resolves the file system that owns the given path, so fully qualified URIs
     * work even when they differ from the configured default file system.
     */
    private FileSystem fileSystemFor(Path path) throws Exception {
        return path.getFileSystem(configuration());
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy