org.apache.sysml.runtime.io.ReaderTextCellParallel Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.util.FastStringTokenizer;
import org.apache.sysml.runtime.util.MapReduceTool;
/**
 * Parallel version of ReaderTextCell.java. To summarize, we create read tasks per split
 * and use a fixed-size thread pool to execute these tasks. If the target matrix is dense,
 * the inserts are done lock-free. If the matrix is sparse, we use a buffer to collect
 * unordered input cells, lock the target sparse matrix once, and append all buffered values.
*
* Note MatrixMarket:
 * 1) For matrix market files each read task probes for comments until it finds data because
 * for very small tasks or large comments, any split might encounter % or %%. Hence,
 * the parallel reader does not do the validity check for them.
* 2) In extreme scenarios, the last comment might be in one split, and the following meta data
* in the subsequent split. This would create incorrect results or errors. However, this
* scenario is extremely unlikely (num threads > num lines if 1 comment line) and hence ignored
* similar to our parallel MR setting (but there we have a 128MB guarantee).
 * 3) However, we use MIN_FILESIZE_MM (8KB) to give guarantees for the common case of small headers
 * in order to avoid the issue described in (2).
*
*/
public class ReaderTextCellParallel extends MatrixReader
{
private static final long MIN_FILESIZE_MM = 8L * 1024; //8KB
private boolean _isMMFile = false;
private int _numThreads = 1;
/**
 * Creates a parallel text-cell reader for the given input format.
 *
 * @param info input format descriptor; matrix-market inputs get special
 *             header/comment handling during the parallel read
 */
public ReaderTextCellParallel(InputInfo info) {
	//remember whether the input is in matrix market format
	_isMMFile = info == InputInfo.MatrixMarketInputInfo;
	//degree of parallelism from the optimizer configuration
	_numThreads = OptimizerUtils.getParallelTextReadParallelism();
}
/**
 * Reads an entire text-cell (or matrix-market) matrix from HDFS into a
 * single in-memory {@link MatrixBlock}, using a parallel split-wise read.
 *
 * @param fname  hdfs file name of the input matrix
 * @param rlen   number of rows
 * @param clen   number of columns
 * @param brlen  number of rows per block (unused for single-block output)
 * @param bclen  number of columns per block (unused for single-block output)
 * @param estnnz estimated number of non-zeros (drives sparse/dense choice)
 * @return the fully read matrix block
 * @throws IOException         if hdfs access or the parallel read fails
 * @throws DMLRuntimeException if output allocation fails
 */
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
	throws IOException, DMLRuntimeException
{
	//setup hdfs access for the given file
	JobConf conf = new JobConf(ConfigurationManager.getCachedJobConf());
	Path path = new Path(fname);
	FileSystem fs = FileSystem.get(conf);

	//fail early on missing or empty input files
	checkValidInputFile(fs, path);

	//allocate the single target block (format chosen via estimated nnz)
	MatrixBlock mb = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

	//multi-threaded read of all input splits into the target block
	readTextCellMatrixFromHDFS(path, conf, mb, rlen, clen, brlen, bclen, _isMMFile);

	//finalize representation: sparse rows arrive unordered and must be
	//sorted; dense blocks need their nnz count recomputed
	if( mb.isInSparseFormat() )
		mb.sortSparseRows();
	else
		mb.recomputeNonZeros();
	mb.examSparsity();

	return mb;
}
/**
 * Core parallel read: creates one {@link ReadTask} per input split and runs
 * them on a fixed-size thread pool, writing all cells into {@code dest}.
 * For matrix-market inputs, very small files (below MIN_FILESIZE_MM) are
 * forced to a single split to keep the header within one task.
 *
 * @param path         hdfs path of the input file
 * @param job          hadoop job configuration (input path is added here)
 * @param dest         pre-allocated target matrix block, shared by all tasks
 * @param rlen         number of rows
 * @param clen         number of columns
 * @param brlen        number of rows per block (unused here)
 * @param bclen        number of columns per block (unused here)
 * @param matrixMarket true if the input is in matrix market format
 * @throws IOException if the read is interrupted or any read task failed
 */
private void readTextCellMatrixFromHDFS( Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
	throws IOException
{
	int par = _numThreads;

	FileInputFormat.addInputPath(job, path);
	TextInputFormat informat = new TextInputFormat();
	informat.configure(job);

	//check for min file size for matrix market (adjust num splits if necessary)
	//to keep small headers within a single split, see class javadoc note (3)
	if( _isMMFile ) {
		long len = MapReduceTool.getFilesizeOnHDFS(path);
		par = ( len < MIN_FILESIZE_MM ) ? 1 : par;
	}

	ExecutorService pool = Executors.newFixedThreadPool(par);
	InputSplit[] splits = informat.getSplits(job, par);
	try {
		//create read tasks for all splits (typed list restores the generics
		//required by the typed iteration below)
		ArrayList<ReadTask> tasks = new ArrayList<ReadTask>();
		for( InputSplit split : splits )
			tasks.add(new ReadTask(split, informat, job, dest, rlen, clen, matrixMarket));

		//wait until all tasks have been executed
		pool.invokeAll(tasks);

		//early error notify in case not all tasks successful
		for( ReadTask rt : tasks )
			if( !rt.getReturnCode() )
				throw new IOException("Read task for text input failed: " + rt.getErrMsg());
	}
	catch(InterruptedException e) {
		//restore interrupt status before translating to a checked IOException
		Thread.currentThread().interrupt();
		throw new IOException("Threadpool issue, while parallel read.", e);
	}
	finally {
		//always release worker threads, even if invokeAll or a task failed
		pool.shutdown();
	}
}
/**
*
*
*/
public static class ReadTask implements Callable