// org.apache.sysml.runtime.transform.TfUtils — from the Apache SystemML ("systemml")
// artifact (Declarative Machine Learning); source obtained via a Maven repository listing.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.transform;
import java.io.EOFException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Reader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.parser.DataExpression;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.io.MatrixReader;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.runtime.util.UtilFunctions;
@SuppressWarnings("deprecation")
public class TfUtils implements Serializable{
private static final long serialVersionUID = 526252850872633125L;
protected enum ColumnTypes {
	SCALE,
	NOMINAL,
	ORDINAL,
	DUMMYCODED,
	INVALID;

	/**
	 * Maps this column type to its numeric meta data ID.
	 * Note: dummycoded columns intentionally share the SCALE id (1);
	 * semantically they should have a type of their own.
	 */
	protected byte toID() {
		if( this == SCALE || this == DUMMYCODED )
			return 1;
		if( this == NOMINAL )
			return 2;
		if( this == ORDINAL )
			return 3;
		throw new RuntimeException("Invalid Column Type: " + this);
	}
}
//transform methods (keys used in the transform specification)
public static final String TXMETHOD_IMPUTE = "impute";
public static final String TXMETHOD_RECODE = "recode";
public static final String TXMETHOD_BIN = "bin";
public static final String TXMETHOD_DUMMYCODE = "dummycode";
public static final String TXMETHOD_SCALE = "scale";
public static final String TXMETHOD_OMIT = "omit";
public static final String TXMETHOD_MVRCD = "mvrcd";
//transform meta data constants (frame-based transform)
public static final String TXMTD_MVPREFIX = "#Meta"+Lop.DATATYPE_PREFIX+"MV";
public static final String TXMTD_NDPREFIX = "#Meta"+Lop.DATATYPE_PREFIX+"ND";
//transform meta data constants (old file-based transform)
public static final String TXMTD_SEP = ",";
public static final String TXMTD_COLTYPES = "coltypes.csv";
public static final String TXMTD_COLNAMES = "column.names";
public static final String TXMTD_DC_COLNAMES = "dummycoded.column.names";
public static final String TXMTD_RCD_MAP_SUFFIX = ".map";
public static final String TXMTD_RCD_DISTINCT_SUFFIX = ".ndistinct";
public static final String TXMTD_BIN_FILE_SUFFIX = ".bin";
public static final String TXMTD_MV_FILE_SUFFIX = ".impute";
//JSON field names within the transform specification
public static final String JSON_ATTRS = "attributes";
public static final String JSON_MTHD = "methods";
public static final String JSON_CONSTS = "constants";
public static final String JSON_NBINS = "numbins";
//file suffixes/names of intermediate transform meta data
protected static final String MODE_FILE_SUFFIX = ".mode";
protected static final String SCALE_FILE_SUFFIX = ".scale";
protected static final String DCD_FILE_NAME = "dummyCodeMaps.csv";
protected static final String DCD_NAME_SEP = "_";
//transformation agents (one per transform method)
private OmitAgent _oa = null;
private MVImputeAgent _mia = null;
private RecodeAgent _ra = null;
private BinAgent _ba = null;
private DummycodeAgent _da = null;
//row/column counters, maintained while processing input rows
private long _numRecordsInPartFile; // Total number of records in the data file
private long _numValidRecords; // (_numRecordsInPartFile - #of omitted records)
private long _numTransformedRows; // Number of rows after applying transformations
private long _numTransformedColumns; // Number of columns after applying transformations
//parse configuration (header, delimiter, NA strings, column names)
private String _headerLine = null;
private boolean _hasHeader;
private Pattern _delim = null; // pre-compiled literal delimiter pattern
private String _delimString = null;
private String[] _NAstrings = null;
private String[] _outputColumnNames = null;
private int _numInputCols = -1;
//paths of transform meta data, offsets, temporary, and output locations
private String _tfMtdDir = null;
private String _spec = null;
private String _offsetFile = null;
private String _tmpDir = null;
private String _outputPath = null;
/**
 * Minimal initialization: only NA strings, the raw spec, and the omit
 * agent are set up (e.g., for omit-only processing in MR tasks).
 */
public TfUtils(JobConf job, boolean minimal)
	throws IOException, JSONException
{
	// cache the job conf for later lookups when running distributed
	if( !InfrastructureAnalyzer.isLocalMode(job) ) {
		ConfigurationManager.setCachedJobConf(job);
	}

	String rawSpec = job.get(MRJobConfiguration.TF_SPEC);
	_NAstrings = TfUtils.parseNAStrings(job);
	_spec = rawSpec;
	_oa = new OmitAgent(new JSONObject(rawSpec), null, -1);
}
// called from GenTFMtdMapper, ApplyTf (Hadoop)
/**
 * Full initialization from the job conf; reads all transform parameters
 * (header, delimiter, NA strings, spec, paths) from the configuration.
 */
public TfUtils(JobConf job)
	throws IOException, JSONException
{
	if( !InfrastructureAnalyzer.isLocalMode(job) ) {
		ConfigurationManager.setCachedJobConf(job);
	}

	// read transform configuration from the job conf
	String header = job.get(MRJobConfiguration.TF_HEADER);
	boolean withHeader = Boolean.parseBoolean(job.get(MRJobConfiguration.TF_HAS_HEADER));
	String delimStr = job.get(MRJobConfiguration.TF_DELIM);
	String[] naValues = TfUtils.parseNAStrings(job);
	long inputCols = UtilFunctions.parseToLong( job.get(MRJobConfiguration.TF_NUM_COLS) ); // #cols input data
	JSONObject parsedSpec = new JSONObject(job.get(MRJobConfiguration.TF_SPEC));
	String offsets = job.get(MRJobConfiguration.TF_OFFSETS_FILE);
	String tmpLoc = job.get(MRJobConfiguration.TF_TMP_LOC);
	String outPath = FileOutputFormat.getOutputPath(job).toString();

	init(header, withHeader, delimStr, naValues, parsedSpec, inputCols, offsets, tmpLoc, outPath);
}
// called from GenTfMtdReducer
/**
 * Full initialization from the job conf, additionally remembering the
 * transform meta data directory.
 */
public TfUtils(JobConf job, String tfMtdDir) throws IOException, JSONException
{
	this(job);
	_tfMtdDir = tfMtdDir;
}
// called from GenTFMtdReducer and ApplyTf (Spark)
/**
 * Direct initialization with explicit transform parameters, used when no
 * JobConf is available (GenTFMtdReducer and the Spark ApplyTf path).
 * Note: the output path is left unset (null) on this code path.
 */
public TfUtils(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec, long ncol, String tfMtdDir, String offsetFile, String tmpPath) throws IOException, JSONException {
	init (headerLine, hasHeader, delim, naStrings, spec, ncol, offsetFile, tmpPath, null);
	_tfMtdDir = tfMtdDir;
}
//called from cp frame transformapply
/**
 * Minimal initialization for CP frame transformapply: only the number of
 * input columns and the transformation agents are set up.
 */
public TfUtils(JSONObject spec, long inNcol)
	throws IOException, JSONException
{
	//TODO recodemaps handover
	_numInputCols = (int)inNcol;
	createAgents(spec, new String[]{});
}
/**
 * Checks that the given input file exists and is non-empty.
 * (Rewritten with explicit braces: the original relied on brace-less
 * nested if/else, which is fragile w.r.t. dangling-else mistakes.)
 *
 * @param fs   file system to probe
 * @param path file to validate
 * @param err  if true, throw on invalid input instead of returning false
 * @return true if the file exists and is non-empty, false otherwise (when err is false)
 * @throws IOException  if err is set and the file does not exist
 *                      (EOFException if it exists but is empty)
 */
protected static boolean checkValidInputFile(FileSystem fs, Path path, boolean err)
	throws IOException
{
	// check non-existing file
	if (!fs.exists(path)) {
		if ( err )
			throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");
		return false;
	}

	// check for empty file
	if (MapReduceTool.isFileEmpty(fs, path.toString())) {
		if ( err )
			throw new EOFException("Empty input file " + path.toString() + ".");
		return false;
	}

	return true;
}
/**
 * Returns the fully qualified name of the map input file currently processed.
 */
public static String getPartFileName(JobConf job) throws IOException {
	FileSystem fs = FileSystem.get(job);
	Path inputFile = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
	return inputFile.makeQualified(fs).toString();
}
/**
 * Indicates whether the current part file is the one carrying the header,
 * i.e., whether it is the designated "smallest" file of the input.
 * (Fixes idiom issues: redundant toString() on a String and an
 * if/else that returned boolean literals.)
 *
 * @param job job configuration with TF_SMALLEST_FILE set
 * @return true iff the current map input file is the designated header file
 * @throws IOException if the file system cannot be accessed
 */
public static boolean isPartFileWithHeader(JobConf job) throws IOException {
	FileSystem fs = FileSystem.get(job);
	Path smallestFilePath = new Path(job.get(MRJobConfiguration.TF_SMALLEST_FILE)).makeQualified(fs);
	return getPartFileName(job).equals(smallestFilePath.toString());
}
/**
 * Prepares NA strings so that they can be sent to workers via the JobConf.
 * A sentinel "dummy" string is appended so that a trailing empty string
 * survives the delimiter-separated round trip.
 *
 * @param na delimiter-separated list of NA strings
 * @return the list with the sentinel "dummy" entry appended
 */
public static String prepNAStrings(String na) {
	return na + DataExpression.DELIM_NA_STRING_SEP + "dummy";
}
/**
 * Splits a delimiter-separated list of NA strings into its entries.
 * NOTE(review): the sentinel entry appended by prepNAStrings is currently
 * NOT stripped from the returned array — confirm this is intended.
 *
 * @param na delimiter-separated NA strings, may be null
 * @return the individual NA strings, or null if the input was null
 */
public static String[] parseNAStrings(String na)
{
	if ( na == null )
		return null;
	Pattern sep = Pattern.compile(Pattern.quote(DataExpression.DELIM_NA_STRING_SEP));
	return sep.split(na, -1);
}
/**
 * Parses the NA strings stored in the given job conf; returns null
 * if no NA strings are configured.
 */
public static String[] parseNAStrings(JobConf job)
{
	return parseNAStrings(job.get(MRJobConfiguration.TF_NA_STRINGS));
}
/**
 * Creates the individual transformation agents from the parsed spec.
 * NOTE(review): the MV impute agent receives null column names (only the
 * NA strings appear relevant for imputation) — confirm intentional.
 */
private void createAgents(JSONObject spec, String[] naStrings)
	throws IOException, JSONException
{
	_oa = new OmitAgent(spec, _outputColumnNames, _numInputCols);
	_mia = new MVImputeAgent(spec, null, naStrings, _numInputCols);
	_ra = new RecodeAgent(spec, _outputColumnNames, _numInputCols);
	_ba = new BinAgent(spec, _outputColumnNames, _numInputCols);
	_da = new DummycodeAgent(spec, _outputColumnNames, _numInputCols);
}
/**
 * Injects externally created transformation agents (replaces any agents
 * created during initialization).
 */
public void setupAgents(OmitAgent oa, MVImputeAgent mia, RecodeAgent ra, BinAgent ba, DummycodeAgent da) {
	_oa = oa;
	_mia = mia;
	_ra = ra;
	_ba = ba;
	_da = da;
}
/**
 * Splits the header line into output column names and unquotes each entry.
 */
private void parseColumnNames() {
	String[] cols = _delim.split(_headerLine, -1);
	for( int j=0; j<cols.length; j++ )
		cols[j] = UtilFunctions.unquote(cols[j]);
	_outputColumnNames = cols;
}
/**
 * Common initialization of all transform state: resets the row/column
 * counters, stores the parse configuration (header, delimiter, NA strings,
 * paths), parses the output column names, and creates the agents.
 */
private void init(String headerLine, boolean hasHeader, String delim, String[] naStrings, JSONObject spec, long numCols, String offsetFile, String tmpPath, String outputPath) throws IOException, JSONException
{
	_numRecordsInPartFile = 0;
	_numValidRecords = 0;
	_numTransformedRows = 0;
	_numTransformedColumns = 0;
	//TODO: fix hard-wired header propagation to meta data column names
	_headerLine = headerLine;
	_hasHeader = hasHeader;
	_delimString = delim;
	// pre-compile the literal (quoted) delimiter pattern for repeated splits
	_delim = Pattern.compile(Pattern.quote(delim));
	_NAstrings = naStrings;
	_numInputCols = (int)numCols;
	_offsetFile = offsetFile;
	_tmpDir = tmpPath;
	_outputPath = outputPath;
	// column names must be parsed before the agents are created (agents use them)
	parseColumnNames();
	createAgents(spec, naStrings);
}
// row counters over the current part file
public void incrValid() { _numValidRecords++; }
public long getValid() { return _numValidRecords; }
public long getTotal() { return _numRecordsInPartFile; }
public long getNumTransformedRows() { return _numTransformedRows; }
public long getNumTransformedColumns() { return _numTransformedColumns; }
// parse configuration and path accessors
public String getHeader() { return _headerLine; }
public boolean hasHeader() { return _hasHeader; }
public String getDelimString() { return _delimString; }
public Pattern getDelim() { return _delim; }
public String[] getNAStrings() { return _NAstrings; }
public long getNumCols() { return _numInputCols; }
public String getSpec() { return _spec; }
public String getTfMtdDir() { return _tfMtdDir; }
public String getOffsetFile() { return _offsetFile; }
public String getTmpDir() { return _tmpDir; }
public String getOutputPath() { return _outputPath; }
// returns the output column name for a 1-based column ID
public String getName(int colID) { return _outputColumnNames[colID-1]; }
public void setValid(long n) { _numValidRecords = n;}
public void incrTotal() { _numRecordsInPartFile++; }
public void setTotal(long n) { _numRecordsInPartFile = n;}
// transformation agent accessors
public OmitAgent getOmitAgent() { return _oa; }
public MVImputeAgent getMVImputeAgent(){ return _mia;}
public RecodeAgent getRecodeAgent() { return _ra; }
public BinAgent getBinAgent() { return _ba; }
public DummycodeAgent getDummycodeAgent() { return _da; }
/**
 * Checks whether the given word is one of the configured NA (missing value) strings.
 *
 * @param NAstrings array of NA strings, may be null (treated as "no NA strings")
 * @param w the word to test (must not be null when NAstrings is non-null and non-empty)
 * @return true iff w equals one of the NA strings
 */
public static boolean isNA(String[] NAstrings, String w) {
	if( NAstrings == null )
		return false;
	for( int i=0; i<NAstrings.length; i++ ) {
		if( w.equals(NAstrings[i]) )
			return true;
	}
	return false;
}
// tokenizes a Hadoop Text line via getWords(String)
public String[] getWords(Text line) {
	return getWords(line.toString());
}
// tokenizes a (trimmed) raw input line using the configured delimiter
public String[] getWords(String line) {
	String trimmed = line.trim();
	return getDelim().split(trimmed, -1);
}
/**
 * Processes a single input row to collect transformation meta data
 * (imputation, recode, and binning statistics). Omitted rows are counted
 * in the total but contribute no statistics.
 *
 * @param line one raw input row
 * @return the tokenized row
 * @throws IOException if an agent fails while preparing meta data
 */
public String[] prepareTfMtd(String line) throws IOException {
	String[] words = getWords(line);
	boolean omitted = getOmitAgent().omit(words, this);
	if( !omitted ) {
		getMVImputeAgent().prepare(words);
		getRecodeAgent().prepare(words, this);
		getBinAgent().prepare(words, this);
		incrValid();
	}
	incrTotal();
	return words;
}
/**
 * Loads the transformation meta data from HDFS using the cached job conf.
 */
public void loadTfMetadata() throws IOException
{
	JobConf job = ConfigurationManager.getCachedJobConf();
	loadTfMetadata(job, false);
}
/**
 * Loads the transformation meta data for all agents, either from the local
 * file system (e.g., Hadoop distributed cache) or from HDFS.
 *
 * @param job job configuration
 * @param fromLocalFS true to read from the local file system
 * @throws IOException if the meta data cannot be read
 */
public void loadTfMetadata(JobConf job, boolean fromLocalFS) throws IOException
{
	FileSystem fs;
	Path metaDir;
	if( fromLocalFS ) {
		// metadata must be read from the local file system
		// (e.g., distributed cache in the case of Hadoop)
		metaDir = (DistributedCache.getLocalCacheFiles(job))[0];
		fs = FileSystem.getLocal(job);
	}
	else {
		fs = FileSystem.get(job);
		metaDir = new Path(getTfMtdDir());
	}

	// load transformation metadata per agent
	getMVImputeAgent().loadTxMtd(job, fs, metaDir, this);
	getRecodeAgent().loadTxMtd(job, fs, metaDir, this);
	getBinAgent().loadTxMtd(job, fs, metaDir, this);

	// wire recode maps and bin definitions into the dummycode agent,
	// as recoded and binned columns are typically dummycoded
	getDummycodeAgent().setRecodeMaps( getRecodeAgent().getRecodeMaps() );
	getDummycodeAgent().setNumBins(getBinAgent().getColList(), getBinAgent().getNumBins());
	getDummycodeAgent().loadTxMtd(job, fs, metaDir, this);
}
/**
 * Derives the post-dummycoding header, generates the dummycode maps and
 * column types, and writes the before/after header files.
 *
 * @return the dummycoded header line
 * @throws IOException if the header files cannot be written
 */
public String processHeaderLine() throws IOException
{
	//TODO: fix hard-wired header propagation to meta data column names
	FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());

	String dcdHeader = getDummycodeAgent().constructDummycodedHeader(getHeader(), getDelim());
	getDummycodeAgent().genDcdMapsAndColTypes(fs, getTmpDir(), (int) getNumCols(), this);

	// write header information (before and after transformation) to the temporary
	// path; these files are copied into txMtdPath once the ApplyTf job completes
	DataTransform.generateHeaderFiles(fs, getTmpDir(), getHeader(), dcdHeader);

	return dcdHeader;
}
// returns true iff the given row should be omitted
// (false when no omit agent is configured)
public boolean omit(String[] words) {
	OmitAgent oa = getOmitAgent();
	return (oa != null) && oa.omit(words, this);
}
/**
 * Applies the transformation meta data to a single tokenized row:
 * missing-value imputation, recoding, binning, and dummycoding, in that
 * order. Also increments the transformed-row counter.
 *
 * @param words tokenized input row
 * @return the transformed row
 */
public String[] apply( String[] words ) {
	String[] res = getMVImputeAgent().apply(words);
	res = getRecodeAgent().apply(res);
	res = getBinAgent().apply(res);
	res = getDummycodeAgent().apply(res);
	_numTransformedRows++;
	return res;
}
public void check(String []words) throws DMLRuntimeException
{
boolean checkEmptyString = ( getNAStrings() != null );
if ( checkEmptyString )
{
final String msg = "When na.strings are provided, empty string \"\" is considered as a missing value, and it must be imputed appropriately. Encountered an unhandled empty string in column ID: ";
for(int i=0; i