// NOTE(review): removed Maven-repository web-page boilerplate ("Go to download",
// "Show all versions ...") that was accidentally captured along with this source file.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.io;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import org.apache.commons.io.input.ReaderInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.runtime.transform.TfUtils;
import org.apache.sysml.runtime.util.LocalFileUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
public class IOUtilFunctions
{
private static final Log LOG = LogFactory.getLog(UtilFunctions.class.getName());
private static final char CSV_QUOTE_CHAR = '"';
public static FileSystem getFileSystem(String fname) throws IOException {
return getFileSystem(new Path(fname),
ConfigurationManager.getCachedJobConf());
}
public static FileSystem getFileSystem(Path fname) throws IOException {
return getFileSystem(fname,
ConfigurationManager.getCachedJobConf());
}
public static FileSystem getFileSystem(Path fname, Configuration conf) throws IOException {
return FileSystem.get(fname.toUri(), conf);
}
public static boolean isSameFileScheme(Path path1, Path path2) {
if( path1 == null || path2 == null || path1.toUri() == null || path2.toUri() == null)
return false;
String scheme1 = path1.toUri().getScheme();
String scheme2 = path2.toUri().getScheme();
return (scheme1 == null && scheme2 == null)
|| (scheme1 != null && scheme1.equals(scheme2));
}
public static boolean isObjectStoreFileScheme(Path path) {
if( path == null || path.toUri() == null || path.toUri().getScheme() == null )
return false;
String scheme = path.toUri().getScheme();
//capture multiple alternatives s3, s3n, s3a, swift, swift2d
return scheme.startsWith("s3") || scheme.startsWith("swift");
}
public static String getPartFileName(int pos) {
return String.format("0-m-%05d", pos);
}
public static void closeSilently( Closeable io ) {
try {
if( io != null )
io.close();
}
catch (Exception ex) {
LOG.error("Failed to close IO resource.", ex);
}
}
public static void closeSilently( RecordReader rr )
{
try {
if( rr != null )
rr.close();
}
catch (Exception ex) {
LOG.error("Failed to close record reader.", ex);
}
}
public static double parseDoubleParallel( String str )
{
//return FloatingDecimal.parseDouble(str);
return Double.parseDouble(str);
}
public static void checkAndRaiseErrorCSVEmptyField(String row, boolean fill, boolean emptyFound)
throws IOException
{
if ( !fill && emptyFound) {
throw new IOException("Empty fields found in delimited file. "
+ "Use \"fill\" option to read delimited files with empty fields:" + ((row!=null)?row:""));
}
}
public static void checkAndRaiseErrorCSVNumColumns(String fname, String line, String[] parts, long ncol)
throws IOException
{
int realncol = parts.length;
if( realncol != ncol ) {
throw new IOException("Invalid number of columns (" + realncol + ", expected=" + ncol + ") "
+ "found in delimited file (" + fname + ") for line: " + line);
}
}
/**
* Splits a string by a specified delimiter into all tokens, including empty.
* NOTE: This method is meant as a faster drop-in replacement of the regular
* string split.
*
* @param str string to split
* @param delim delimiter
* @return string array
*/
public static String[] split(String str, String delim)
{
//split by whole separator required for multi-character delimiters, preserve
//all tokens required for empty cells and in order to keep cell alignment
return StringUtils.splitByWholeSeparatorPreserveAllTokens(str, delim);
}
/**
* Splits a string by a specified delimiter into all tokens, including empty
* while respecting the rules for quotes and escapes defined in RFC4180,
* with robustness for various special cases.
*
* @param str string to split
* @param delim delimiter
* @return string array of tokens
*/
public static String[] splitCSV(String str, String delim)
{
// check for empty input
if( str == null || str.isEmpty() )
return new String[]{""};
// scan string and create individual tokens
ArrayList tokens = new ArrayList();
int from = 0, to = 0;
int len = str.length();
int dlen = delim.length();
while( from < len ) { // for all tokens
if( str.charAt(from) == CSV_QUOTE_CHAR
&& str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) {
to = str.indexOf(CSV_QUOTE_CHAR, from+1);
// handle escaped inner quotes, e.g. "aa""a"
while( to+1 < len && str.charAt(to+1)==CSV_QUOTE_CHAR )
to = str.indexOf(CSV_QUOTE_CHAR, to+2); // to + ""
to += 1; // last "
// handle remaining non-quoted characters "aa"a
if( to= 0) ? to : len;
tokens.add(str.substring(from, to));
from = to + delim.length();
}
// handle empty string at end
if( from == len )
tokens.add("");
// return tokens
return tokens.toArray(new String[0]);
}
/**
* Splits a string by a specified delimiter into all tokens, including empty
* while respecting the rules for quotes and escapes defined in RFC4180,
* with robustness for various special cases.
*
* @param str string to split
* @param delim delimiter
* @param tokens array for tokens, length needs to match the number of tokens
* @return string array of tokens
*/
public static String[] splitCSV(String str, String delim, String[] tokens)
{
// check for empty input
if( str == null || str.isEmpty() )
return new String[]{""};
// scan string and create individual tokens
int from = 0, to = 0;
int len = str.length();
int dlen = delim.length();
int pos = 0;
while( from < len ) { // for all tokens
if( str.charAt(from) == CSV_QUOTE_CHAR
&& str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) {
to = str.indexOf(CSV_QUOTE_CHAR, from+1);
// handle escaped inner quotes, e.g. "aa""a"
while( to+1 < len && str.charAt(to+1)==CSV_QUOTE_CHAR )
to = str.indexOf(CSV_QUOTE_CHAR, to+2); // to + ""
to += 1; // last "
// handle remaining non-quoted characters "aa"a
if( to= 0) ? to : len;
tokens[pos++] = str.substring(from, to);
from = to + delim.length();
}
// handle empty string at end
if( from == len )
tokens[pos] = "";
// return tokens
return tokens;
}
/**
* Counts the number of tokens defined by the given delimiter, respecting
* the rules for quotes and escapes defined in RFC4180,
* with robustness for various special cases.
*
* @param str string to split
* @param delim delimiter
* @return number of tokens split by the given delimiter
*/
public static int countTokensCSV(String str, String delim)
{
// check for empty input
if( str == null || str.isEmpty() )
return 1;
// scan string and compute num tokens
int numTokens = 0;
int from = 0, to = 0;
int len = str.length();
int dlen = delim.length();
while( from < len ) { // for all tokens
if( str.charAt(from) == CSV_QUOTE_CHAR
&& str.indexOf(CSV_QUOTE_CHAR, from+1) > 0 ) {
to = str.indexOf(CSV_QUOTE_CHAR, from+1);
// handle escaped inner quotes, e.g. "aa""a"
while( to+1 < len && str.charAt(to+1)==CSV_QUOTE_CHAR )
to = str.indexOf(CSV_QUOTE_CHAR, to+2); // to + ""
to += 1; // last "
// handle remaining non-quoted characters "aa"a
if( to= 0) ? to : len;
from = to + delim.length();
numTokens++;
}
// handle empty string at end
if( from == len )
numTokens++;
// return number of tokens
return numTokens;
}
/**
* Returns the number of non-zero entries but avoids the expensive
* string to double parsing. This function is guaranteed to never
* underestimate.
*
* @param cols string array
* @return number of non-zeros
*/
public static int countNnz(String[] cols) {
return countNnz(cols, 0, cols.length);
}
/**
* Returns the number of non-zero entries but avoids the expensive
* string to double parsing. This function is guaranteed to never
* underestimate.
*
* @param cols string array
* @param pos starting array index
* @param len ending array index
* @return number of non-zeros
*/
public static int countNnz(String[] cols, int pos, int len) {
int lnnz = 0;
for( int i=pos; i=0x0001 && c<=0x007F) ? 1 :
(c >= 0x0800) ? 3 : 2;
}
return size;
}
public static InputStream toInputStream(String input) throws IOException {
if( input == null )
return null;
return new ReaderInputStream(new StringReader(input), "UTF-8");
}
public static String toString(InputStream input) throws IOException {
if( input == null )
return null;
try {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
byte[] buff = new byte[LocalFileUtils.BUFFER_SIZE];
for( int len=0; (len=input.read(buff))!=-1; )
bos.write(buff, 0, len);
return bos.toString("UTF-8");
}
finally {
IOUtilFunctions.closeSilently(input);
}
}
public static InputSplit[] sortInputSplits(InputSplit[] splits) {
if (splits[0] instanceof FileSplit) {
// The splits do not always arrive in order by file name.
// Sort the splits lexicographically by path so that the header will
// be in the first split.
// Note that we're assuming that the splits come in order by offset
Arrays.sort(splits, new Comparator() {
@Override
public int compare(InputSplit o1, InputSplit o2) {
Path p1 = ((FileSplit) o1).getPath();
Path p2 = ((FileSplit) o2).getPath();
return p1.toString().compareTo(p2.toString());
}
});
}
return splits;
}
/**
* Counts the number of columns in a given collection of csv file splits. This primitive aborts
* if a row with more than 0 columns is found and hence is robust against empty file splits etc.
*
* @param splits input splits
* @param informat input format
* @param job job configruation
* @param delim delimiter
* @return the number of columns in the collection of csv file splits
* @throws IOException if IOException occurs
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public static int countNumColumnsCSV(InputSplit[] splits, InputFormat informat, JobConf job, String delim )
throws IOException
{
LongWritable key = new LongWritable();
Text value = new Text();
int ncol = -1;
for( int i=0; i reader =
informat.getRecordReader(splits[i], job, Reporter.NULL);
try {
if( reader.next(key, value) ) {
boolean hasValue = true;
if( value.toString().startsWith(TfUtils.TXMTD_MVPREFIX) )
hasValue = reader.next(key, value);
if( value.toString().startsWith(TfUtils.TXMTD_NDPREFIX) )
hasValue = reader.next(key, value);
String row = value.toString().trim();
if( hasValue && !row.isEmpty() ) {
ncol = IOUtilFunctions.countTokensCSV(row, delim);
}
}
}
finally {
closeSilently(reader);
}
}
return ncol;
}
public static Path[] getSequenceFilePaths( FileSystem fs, Path file )
throws IOException
{
Path[] ret = null;
//Note on object stores: Since the object store file system implementations
//only emulate a file system, the directory of a multi-part file does not
//exist physically and hence the isDirectory call returns false. Furthermore,
//listStatus call returns all files with the given directory as prefix, which
//includes the mtd file which needs to be ignored accordingly.
if( fs.isDirectory(file)
|| IOUtilFunctions.isObjectStoreFileScheme(file) )
{
LinkedList tmp = new LinkedList();
FileStatus[] dStatus = fs.listStatus(file);
for( FileStatus fdStatus : dStatus )
if( !fdStatus.getPath().getName().startsWith("_") //skip internal files
&& !fdStatus.getPath().toString().equals(file.toString()+".mtd") ) //mtd file
tmp.add(fdStatus.getPath());
ret = tmp.toArray(new Path[0]);
}
else {
ret = new Path[]{ file };
}
return ret;
}
/**
* Delete the CRC files from the local file system associated with a
* particular file and its metadata file.
*
* @param fs
* the file system
* @param path
* the path to a file
* @throws IOException
* thrown if error occurred attempting to delete crc files
*/
public static void deleteCrcFilesFromLocalFileSystem(FileSystem fs, Path path) throws IOException {
if (fs instanceof LocalFileSystem) {
Path fnameCrc = new Path(path.getParent(), "." + path.getName() + ".crc");
fs.delete(fnameCrc, false);
Path fnameMtdCrc = new Path(path.getParent(), "." + path.getName() + ".mtd.crc");
fs.delete(fnameMtdCrc, false);
}
}
}