org.apache.sysml.runtime.io.WriterTextCSV Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.io;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.SparseBlock;
import org.apache.sysml.runtime.util.MapReduceTool;
public class WriterTextCSV extends MatrixWriter
{
//blocksize for string concatenation in order to prevent write OOM
//(can be set to very large value to disable blocking)
public static final int BLOCKSIZE_J = 32; //32 cells (typically ~512B, should be less than write buffer of 1KB)
//CSV format properties (delimiter, header flag, sparse output) applied by all write paths;
//assigned once in the constructor and never mutated afterwards in the visible code
protected CSVFileFormatProperties _props = null;
/**
 * Creates a CSV writer configured with the given format properties
 * (delimiter, header flag, sparse output).
 *
 * @param props CSV format properties used for all subsequent writes
 */
public WriterTextCSV( CSVFileFormatProperties props ) {
	this._props = props;
}
/**
 * Writes the given matrix block to HDFS in CSV format, replacing any
 * pre-existing file at the target location.
 *
 * @param src   in-memory matrix block to write
 * @param fname target file name on HDFS
 * @param rlen  expected number of rows (metadata)
 * @param clen  expected number of columns (metadata)
 * @param brlen block row size (unused by the CSV format)
 * @param bclen block column size (unused by the CSV format)
 * @param nnz   number of non-zeros (unused by the CSV format)
 * @throws IOException if dimensions mismatch the metadata or the write fails
 * @throws DMLRuntimeException on runtime errors in the write path
 */
@Override
public final void writeMatrixToHDFS(MatrixBlock src, String fname, long rlen, long clen, int brlen, int bclen, long nnz)
	throws IOException, DMLRuntimeException
{
	//fail fast if the in-memory block disagrees with the metadata dimensions
	if( src.getNumRows() != rlen || src.getNumColumns() != clen ) {
		throw new IOException("Matrix dimensions mismatch with metadata: "
			+ src.getNumRows() + "x" + src.getNumColumns()
			+ " vs " + rlen + "x" + clen + ".");
	}

	//resolve the target file system from the cached Hadoop configuration
	JobConf jobConf = new JobConf(ConfigurationManager.getCachedJobConf());
	Path outputPath = new Path( fname );
	FileSystem fileSystem = IOUtilFunctions.getFileSystem(outputPath, jobConf);

	//overwrite semantics: remove any previous output before writing
	MapReduceTool.deleteFileIfExistOnHDFS( fname );

	//delegate the actual (sequential or, in subclasses, parallel) CSV write
	writeCSVMatrixToHDFS(outputPath, jobConf, fileSystem, src, _props);

	IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fileSystem, outputPath);
}
/**
 * Writes an empty matrix of the given dimensions to HDFS in CSV format.
 *
 * @param fname target file name on HDFS
 * @param rlen  number of rows
 * @param clen  number of columns
 * @param brlen block row size (unused by the CSV format)
 * @param bclen block column size (unused by the CSV format)
 * @throws IOException on write failures
 * @throws DMLRuntimeException on runtime errors in the write path
 */
@Override
public final void writeEmptyMatrixToHDFS(String fname, long rlen, long clen, int brlen, int bclen)
	throws IOException, DMLRuntimeException
{
	JobConf jobConf = new JobConf(ConfigurationManager.getCachedJobConf());
	Path outputPath = new Path( fname );
	FileSystem fileSystem = IOUtilFunctions.getFileSystem(outputPath, jobConf);

	//NOTE(review): an rlen x 1 empty sparse block is written regardless of clen;
	//presumably the CSV writer emits empty rows only for this case -- confirm against the reader
	MatrixBlock emptyBlock = new MatrixBlock((int)rlen, 1, true);
	writeCSVMatrixToHDFS(outputPath, jobConf, fileSystem, emptyBlock, _props);

	IOUtilFunctions.deleteCrcFilesFromLocalFileSystem(fileSystem, outputPath);
}
/**
 * Writes the entire matrix as a single CSV file in one sequential pass;
 * subclasses may override this to provide a parallel implementation.
 *
 * @param path     target path on the file system
 * @param job      Hadoop job configuration
 * @param fs       target file system
 * @param src      matrix block to write
 * @param csvprops CSV format properties (may be null; defaults applied downstream)
 * @throws IOException on write failures
 */
protected void writeCSVMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src, CSVFileFormatProperties csvprops)
	throws IOException
{
	//sequential write over the full row range [0, numRows)
	int numRows = (int) src.getNumRows();
	writeCSVMatrixToFile(path, job, fs, src, 0, numRows, csvprops);
}
// NOTE(review): this region is damaged by text extraction. Everything between
// '<' and '>' characters was stripped, so generic type parameters are gone and
// a large span of writeCSVMatrixToFile's body (the header/value emission loops
// and its closing braces) is missing. The code below line "for( int bj=0; bj ..."
// appears to splice directly into the TAIL of a different method (seemingly a
// header-merge routine over part files -- note the unseen locals srcFilePath,
// destFilePath, conf). Restore this file from the original Apache SystemML
// sources before attempting to compile; comments below annotate only what the
// visible fragments demonstrably do.
//
// writeCSVMatrixToFile: sequentially writes rows [rl, ru) of src as CSV to path.
protected final void writeCSVMatrixToFile( Path path, JobConf job, FileSystem fs, MatrixBlock src, int rl, int ru, CSVFileFormatProperties props )
throws IOException
{
boolean sparse = src.isInSparseFormat();
int clen = src.getNumColumns();
//create buffered writer
BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path,true)));
try
{
//for obj reuse and preventing repeated buffer re-allocations
StringBuilder sb = new StringBuilder();
//fall back to default CSV properties if none were supplied
props = (props==null)? new CSVFileFormatProperties() : props;
String delim = props.getDelim();
boolean csvsparse = props.isSparse();
// Write header line, if needed
if( props.hasHeader() && rl==0 )
{
//write row chunk-wise to prevent OOM on large number of columns
// !!! extraction damage starts here: the loop header, the remainder of this
// !!! method, and the signature of the following method are missing.
for( int bj=0; bj files=new ArrayList();
//collect all non-hidden part files of the source directory
for(FileStatus stat: fs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
files.add(stat.getPath());
//sort lexicographically so files.get(0) is the first part file
Collections.sort(files);
// first part file path
Path firstpart = files.get(0);
// create a temp file, and add header and contents of first part
Path tmp = new Path(firstpart.toString() + ".tmp");
OutputStream out = fs.create(tmp, true);
//sb holds the previously assembled header line at this point
out.write(sb.toString().getBytes());
sb.setLength(0);
// copy rest of the data from firstpart
InputStream in = null;
try {
in = fs.open(firstpart);
//copyBytes with close=true closes both streams; the finally below is a
//defensive double-close via closeSilently
IOUtils.copyBytes(in, out, conf, true);
} finally {
IOUtilFunctions.closeSilently(in);
IOUtilFunctions.closeSilently(out);
}
// rename tmp to firstpart
fs.delete(firstpart, true);
fs.rename(tmp, firstpart);
// rename srcfile to destFile
fs.delete(destFilePath, true);
fs.createNewFile(destFilePath); // force the creation of directory structure
fs.delete(destFilePath, true); // delete the file, but preserve the directory structure
fs.rename(srcFilePath, destFilePath); // move the data
} else if (fs.isFile(srcFilePath)) {
//single-file source: prepend the header, then append the source contents
// create destination file
OutputStream out = fs.create(destFilePath, true);
// write header
out.write(sb.toString().getBytes());
sb.setLength(0);
// copy the data from srcFile
InputStream in = null;
try {
in = fs.open(srcFilePath);
IOUtils.copyBytes(in, out, conf, true);
}
finally {
IOUtilFunctions.closeSilently(in);
IOUtilFunctions.closeSilently(out);
}
} else {
throw new IOException(srcFilePath.toString() + ": No such file or directory");
}
}
}