org.apache.sysml.runtime.transform.DataTransform Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.transform;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import scala.Tuple2;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.lops.CSVReBlock;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.parser.ParameterizedBuiltinFunctionExpression;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.instructions.InstructionParser;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.instructions.cp.ParameterizedBuiltinCPInstruction;
import org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction;
import org.apache.sysml.runtime.instructions.spark.ParameterizedBuiltinSPInstruction;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils;
import org.apache.sysml.runtime.matrix.CSVReblockMR;
import org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn;
import org.apache.sysml.runtime.matrix.JobReturn;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.transform.TransformationAgent.TX_METHOD;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.sysml.utils.JSONHelper;
public class DataTransform {
/**
 * Reads the first line of the given input file and derives the header line
 * from it.
 *
 * If the CSV properties report that the data has a header, the first line is
 * returned unchanged. Otherwise the first line is only used to count the
 * columns, and a header with default column names (V1, V2, ...) joined by
 * the configured delimiter is generated.
 *
 * @param fs file system used to open the file
 * @param prop CSV properties supplying the header flag and the delimiter
 * @param smallestFile path of the file whose first line is read
 * @return the header line, either read from the file or generated
 * @throws IOException if the file cannot be read, or if it contains no line at all
 */
private static String readHeaderLine(FileSystem fs, CSVFileFormatProperties prop, String smallestFile) throws IOException {
    String line = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(smallestFile))));
    try {
        line = br.readLine();
    }
    finally {
        // release the underlying stream even when readLine() fails
        br.close();
    }

    // an empty file has neither a header nor a row to count columns from;
    // fail here with a clear message instead of a NullPointerException later
    if (line == null)
        throw new IOException("Unable to determine the header: input file " + smallestFile + " is empty.");

    if (prop.hasHeader())
        return line;

    // construct header with default column names, V1, V2, etc.
    int ncol = Pattern.compile( Pattern.quote(prop.getDelim()) ).split(line, -1).length;
    StringBuilder sb = new StringBuilder();
    sb.append("V1");
    for (int i = 2; i <= ncol; i++)
        sb.append(prop.getDelim()).append("V").append(i);
    return sb.toString();
}
/**
 * Constructs a mapping between column names and their corresponding
 * 1-based column IDs.
 *
 * The header line is split on the (literal) delimiter from the CSV
 * properties; every name is trimmed and passed through
 * UtilFunctions.unquote() before it is used as a key. If the same name
 * occurs more than once, the ID of the last occurrence is kept.
 *
 * @param fs file system (not used by this method)
 * @param prop CSV properties supplying the delimiter
 * @param headerLine delimiter-separated column names
 * @param smallestFile input file path (not used by this method)
 * @return map from column name to 1-based column position
 * @throws IllegalArgumentException declared for callers; not thrown explicitly here
 * @throws IOException declared for callers; not thrown explicitly here
 */
private static HashMap<String, Integer> processColumnNames(FileSystem fs, CSVFileFormatProperties prop, String headerLine, String smallestFile) throws IllegalArgumentException, IOException {
    HashMap<String, Integer> colNames = new HashMap<String, Integer>();

    // quote the delimiter so that it is matched literally, not as a regex
    Pattern compiledDelim = Pattern.compile(Pattern.quote(prop.getDelim()));
    // limit -1 keeps trailing empty names, so positions stay aligned with the data
    String[] names = compiledDelim.split(headerLine, -1);

    for (int i = 0; i < names.length; i++)
        colNames.put(UtilFunctions.unquote(names[i].trim()), i + 1);

    return colNames;
}
/**
 * Rearranges list, mthd and (when given) cst in place so that position i
 * ends up holding the element that was originally at position indices[i].
 * The permutation is applied cycle by cycle; every visited entry of indices
 * is overwritten with its own position, so indices holds the identity
 * permutation when the method returns.
 *
 * @param list column IDs to permute
 * @param mthd per-column method codes, permuted alongside list
 * @param cst optional per-column values, permuted alongside list; may be null
 * @param indices permutation to apply; modified by this method
 */
private static void inplacePermute(int[] list, byte[] mthd, Object[] cst, Integer[] indices)
{
    final boolean hasCst = (cst != null);

    for (int start = 0; start < list.length; start++)
    {
        // remember the element that opens the cycle; it is written last
        final int headId = list[start];
        final byte headMethod = mthd[start];
        final Object headCst = hasCst ? cst[start] : null;

        int pos = start;
        int src = indices[pos];
        indices[pos] = pos; // mark as done

        // pull elements forward along the cycle until it closes at 'start'
        while (src != start) {
            list[pos] = list[src];
            mthd[pos] = mthd[src];
            if (hasCst)
                cst[pos] = cst[src];

            pos = src;
            src = indices[pos];
            indices[pos] = pos;
        }

        list[pos] = headId;
        mthd[pos] = headMethod;
        if (hasCst)
            cst[pos] = headCst;
    }
}
/**
 * Convert input transformation specification file with column names into a
 * specification with corresponding column Ids. This file is sent to all the
 * relevant MR jobs.
 *
 * Visible steps: the JSON specification is parsed from specFileWithNames;
 * per-transformation column lists (omit, impute, recode, bin, dummycode,
 * scale) are validated against each other; dummycoded columns that are
 * neither recoded nor binned are appended to the recode list; the resulting
 * specification is written as JSON to a newly constructed temporary file.
 *
 * NOTE(review): parts of this method body appear to have been lost when this
 * text was extracted (see the markers at the "Omit", "Recoding" and
 * "Dummycoding" sections). The code that fills the column lists from the
 * input spec, and any use of inputPath, smallestFile, colNames and prop, is
 * not visible here -- restore from the original source before editing.
 *
 * @param fs file system used to read the input spec and write the output spec
 * @param inputPath not referenced in the visible part of the body
 * @param smallestFile not referenced in the visible part of the body
 * @param colNames presumably the column name to column ID mapping -- TODO confirm
 * @param prop not referenced in the visible part of the body
 * @param specFileWithNames path of the JSON specification to read
 * @return path of the temporary file holding the specification with column IDs
 * @throws IllegalArgumentException if a column is assigned an invalid combination of transformations
 * @throws IOException
 * @throws JSONException
 */
private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile, HashMap colNames, CSVFileFormatProperties prop, String specFileWithNames) throws IllegalArgumentException, IOException, JSONException {
// load input spec file with Names
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFileWithNames))));
JSONObject inputSpec = JSONHelper.parse(br);
br.close();
// string constants for the input spec; their uses are mostly in the part
// of the body that is not visible here
final String NAME = "name";
final String ID = "id";
final String METHOD = "method";
final String VALUE = "value";
final String MV_METHOD_MEAN = "global_mean";
final String MV_METHOD_MODE = "global_mode";
final String MV_METHOD_CONSTANT = "constant";
final String BIN_METHOD_WIDTH = "equi-width";
final String BIN_METHOD_HEIGHT = "equi-height";
final String SCALE_METHOD_Z = "z-score";
final String SCALE_METHOD_M = "mean-subtraction";
final String JSON_BYPOS = "ids";
String stmp = null;
JSONObject entry = null;
byte btmp = 0;
final int[] mvList;
int[] rcdList, dcdList, omitList;
final int[] binList;
final int[] scaleList;
byte[] mvMethods = null, binMethods=null, scaleMethods=null;
Object[] numBins = null;
Object[] mvConstants = null;
// true only when the spec contains the "ids" key with a Boolean true value
boolean byPositions = (inputSpec.containsKey(JSON_BYPOS) && ((Boolean)inputSpec.get(JSON_BYPOS)).booleanValue() == true);
// --------------------------------------------------------------------------
// Omit
if( inputSpec.containsKey(TX_METHOD.OMIT.toString()) ) {
JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.OMIT.toString());
omitList = new int[arrtmp.size()];
// NOTE(review): source text is truncated on the next line; the rest of the
// omit section and the start of the missing-value imputation section
// (up to the comparator that orders indices by mvList) are missing.
for(int i=0; i() {
@Override
public int compare(Integer o1, Integer o2) {
return (mvList[o1]-mvList[o2]);
}
});
// rearrange mvList, mvMethods, and mvConstants according to permutation idx
inplacePermute(mvList, mvMethods, mvConstants, idx);
}
else
mvList = null;
// --------------------------------------------------------------------------
// Recoding
if( inputSpec.containsKey(TX_METHOD.RECODE.toString()) ) {
JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.RECODE.toString());
rcdList = new int[arrtmp.size()];
// NOTE(review): source text is truncated on the next line; the rest of the
// recode section and the start of the binning section (up to the
// comparator that orders indices by binList) are missing.
for(int i=0; i() {
@Override
public int compare(Integer o1, Integer o2) {
return (binList[o1]-binList[o2]);
}
});
// rearrange binList and binMethods according to permutation idx
inplacePermute(binList, binMethods, numBins, idx);
}
else
binList = null;
// --------------------------------------------------------------------------
// Dummycoding
if( inputSpec.containsKey(TX_METHOD.DUMMYCODE.toString()) ) {
JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.DUMMYCODE.toString());
dcdList = new int[arrtmp.size()];
// NOTE(review): source text is truncated on the next line; the rest of the
// dummycode section and the start of the scaling section (up to the
// comparator that orders indices by scaleList) are missing.
for(int i=0; i() {
@Override
public int compare(Integer o1, Integer o2) {
return (scaleList[o1]-scaleList[o2]);
}
});
// rearrange scaleList and scaleMethods according to permutation idx
inplacePermute(scaleList, scaleMethods, null, idx);
}
else
scaleList = null;
// --------------------------------------------------------------------------
// check for column IDs that are imputed with mode, but not recoded
// These columns have to be handled separately, because the computation of mode
// requires the computation of distinct values (i.e., recode maps)
// NOTE(review): method code 2 presumably denotes "global_mode" -- confirm
// against the (missing) code that fills mvMethods. The binarySearch calls
// below assume rcdList/omitList/dcdList/binList are sorted; the sorting of
// rcdList, omitList and dcdList is not visible in this text.
ArrayList tmpList = new ArrayList();
if(mvList != null)
for(int i=0; i < mvList.length; i++) {
int colID = mvList[i];
if(mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0) )
tmpList.add(colID);
}
int[] mvrcdList = null;
if ( tmpList.size() > 0 ) {
mvrcdList = new int[tmpList.size()];
for(int i=0; i < tmpList.size(); i++)
mvrcdList[i] = tmpList.get(i);
}
// Perform Validity Checks
/*
OMIT MVI RCD BIN DCD SCL
OMIT - x * * * *
MVI x - * * * *
RCD * * - x * x
BIN * * x - * x
DCD * * * * - x
SCL * * x x x -
*/
if(mvList != null)
for(int i=0; i < mvList.length; i++)
{
int colID = mvList[i];
if ( omitList != null && Arrays.binarySearch(omitList, colID) >= 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be both omitted and imputed.");
// NOTE(review): method code 1 is treated as marking a numeric column
// (see the error messages below); presumably "global_mean" -- confirm
if(mvMethods[i] == 1)
{
if ( rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be recoded.");
if ( dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0 )
// throw an error only if the column is not binned
if ( binList == null || Arrays.binarySearch(binList, colID) < 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A numeric column can not be dummycoded.");
}
}
// scaled columns must not be recoded, binned, or dummycoded
if(scaleList != null)
for(int i=0; i < scaleList.length; i++)
{
int colID = scaleList[i];
if ( rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and scaled.");
if ( binList != null && Arrays.binarySearch(binList, colID) >= 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be binned and scaled.");
if ( dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be dummycoded and scaled.");
}
// recoded columns must not be binned
if(rcdList != null)
for(int i=0; i < rcdList.length; i++)
{
int colID = rcdList[i];
if ( binList != null && Arrays.binarySearch(binList, colID) >= 0 )
throw new IllegalArgumentException("Invalid transformations on column ID " + colID + ". A column can not be recoded and binned.");
}
// Check if dummycoded columns are either recoded or binned.
// If not, add them to recode list.
ArrayList addToRcd = new ArrayList();
if(dcdList != null)
for(int i=0; i < dcdList.length; i++)
{
int colID = dcdList[i];
boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0);
boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0);
// If colID is neither recoded nor binned, then, add it to rcdList.
if ( !isRecoded && !isBinned )
addToRcd.add(colID);
}
if ( addToRcd.size() > 0 )
{
// append the additional columns after the existing recode IDs, then re-sort
int[] newRcdList = null;
if ( rcdList != null)
newRcdList = Arrays.copyOf(rcdList, rcdList.length + addToRcd.size());
else
newRcdList = new int[addToRcd.size()];
int i = (rcdList != null ? rcdList.length : 0);
for(int idx=0; i < newRcdList.length; i++, idx++)
newRcdList[i] = addToRcd.get(idx);
Arrays.sort(newRcdList);
rcdList = newRcdList;
}
// -----------------------------------------------------------------------------
// Prepare output spec
// (one JSON object per transformation that has a non-null column list)
JSONObject outputSpec = new JSONObject();
if (omitList != null)
{
JSONObject rcdSpec = new JSONObject();
rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(omitList));
outputSpec.put(TX_METHOD.OMIT.toString(), rcdSpec);
}
if (mvList != null)
{
JSONObject mvSpec = new JSONObject();
mvSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvList));
mvSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(mvMethods));
mvSpec.put(TransformationAgent.JSON_CONSTS, toJSONArray(mvConstants));
outputSpec.put(TX_METHOD.IMPUTE.toString(), mvSpec);
}
if (rcdList != null)
{
JSONObject rcdSpec = new JSONObject();
rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(rcdList));
outputSpec.put(TX_METHOD.RECODE.toString(), rcdSpec);
}
if (binList != null)
{
JSONObject binSpec = new JSONObject();
binSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(binList));
binSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(binMethods));
binSpec.put(TransformationAgent.JSON_NBINS, toJSONArray(numBins));
outputSpec.put(TX_METHOD.BIN.toString(), binSpec);
}
if (dcdList != null)
{
JSONObject dcdSpec = new JSONObject();
dcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(dcdList));
outputSpec.put(TX_METHOD.DUMMYCODE.toString(), dcdSpec);
}
if (scaleList != null)
{
JSONObject scaleSpec = new JSONObject();
scaleSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(scaleList));
scaleSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(scaleMethods));
outputSpec.put(TX_METHOD.SCALE.toString(), scaleSpec);
}
if (mvrcdList != null)
{
JSONObject mvrcd = new JSONObject();
mvrcd.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvrcdList));
outputSpec.put(TX_METHOD.MVRCD.toString(), mvrcd);
}
// write out the spec with IDs
// (fs.create is called with overwrite=true on the newly constructed temp path)
String specFileWithIDs = MRJobConfiguration.constructTempOutputFilename();
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(specFileWithIDs),true)));
out.write(outputSpec.toString());
out.close();
return specFileWithIDs;
}
/**
 * Copies the given int values, in order, into a new JSONArray.
 *
 * @param list values to copy; must not be null
 * @return JSON array with one entry per element of list
 */
private static JSONArray toJSONArray(int[] list)
{
    final JSONArray json = new JSONArray(list.length);
    for (int value : list)
        json.add(value);
    return json;
}
/**
 * Copies the given byte values, in order, into a new JSONArray.
 *
 * @param list values to copy; must not be null
 * @return JSON array with one entry per element of list
 */
private static JSONArray toJSONArray(byte[] list)
{
    final JSONArray json = new JSONArray(list.length);
    for (byte value : list)
        json.add(value);
    return json;
}
/**
 * Copies the given objects, in order, into a new JSONArray.
 *
 * @param list values to copy; must not be null
 * @return JSON array with one entry per element of list
 */
private static JSONArray toJSONArray(Object[] list)
{
    final JSONArray json = new JSONArray(list.length);
    for (Object value : list)
        json.add(value);
    return json;
}
/** Message of the DMLRuntimeException raised when a transform job reports zero output rows. */
private static final String ERROR_MSG_ZERO_ROWS = "Number of rows in the transformed output (potentially, after ommitting the ones with missing values) is zero. Cannot proceed.";
/**
 * Private class to hold the relevant input parameters to transform operation.
 *
 * Exactly one of specFile (create new transformation metadata) or
 * applyTxPath (apply existing metadata) is expected to be set; isApply
 * records which one.
 */
private static class TransformOperands {
    String inputPath=null, txMtdPath=null, applyTxPath=null, specFile=null, outNamesFile=null;
    boolean isApply=false;
    CSVFileFormatProperties inputCSVProperties = null;

    /**
     * Builds the operands from an instruction string.
     *
     * After splitting on Instruction.OPERAND_DELIM, part 3 is the metadata
     * path, part 5 the apply flag, and part 4 either the path of existing
     * metadata (apply) or the specification file (otherwise). Part 6 is
     * taken as the output names file only when there are exactly 8 parts.
     *
     * @param inst transform instruction string
     * @param inputMatrix input data; supplies file name and CSV properties
     */
    TransformOperands(String inst, MatrixObject inputMatrix) {
        String[] instParts = inst.split(Instruction.OPERAND_DELIM);

        inputPath = inputMatrix.getFileName();
        txMtdPath = instParts[3];

        isApply = Boolean.parseBoolean(instParts[5]);
        if ( isApply ) {
            applyTxPath = instParts[4];
        }
        else {
            specFile = instParts[4];
        }

        if (instParts.length == 8)
            outNamesFile = instParts[6];

        inputCSVProperties = (CSVFileFormatProperties)inputMatrix.getFileFormatProperties();
    }

    /**
     * Builds the operands from the parameter map of a CP instruction. A
     * present specification parameter takes precedence over the apply
     * parameter; if neither is present, isApply stays false and both
     * specFile and applyTxPath stay null.
     *
     * @param inst parameterized builtin CP instruction
     * @param inputMatrix input data; supplies file name and CSV properties
     */
    TransformOperands(ParameterizedBuiltinCPInstruction inst, MatrixObject inputMatrix) {
        // typed map: the values are assigned to String fields below
        HashMap<String, String> params = inst.getParameterMap();

        inputPath = inputMatrix.getFileName();
        txMtdPath = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXMTD);

        if ( params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXSPEC) != null ) {
            isApply = false;
            specFile = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXSPEC);
            applyTxPath = null;
        }
        else if ( params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_APPLYMTD) != null ) {
            isApply = true;
            specFile = null;
            applyTxPath = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_APPLYMTD);
        }

        if ( params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_OUTNAMES) != null)
            outNamesFile = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_OUTNAMES);

        inputCSVProperties = (CSVFileFormatProperties)inputMatrix.getFileFormatProperties();
    }

    /**
     * Builds the operands from the parameter map of a Spark instruction.
     * isApply is true exactly when the apply parameter is present.
     *
     * @param inst parameterized builtin Spark instruction
     * @param inputMatrix input data; supplies file name and CSV properties
     */
    TransformOperands(ParameterizedBuiltinSPInstruction inst, MatrixObject inputMatrix) {
        // typed map: the values are assigned to String fields below
        HashMap<String, String> params = inst.getParams();

        inputPath = inputMatrix.getFileName();
        txMtdPath = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXMTD);

        specFile = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_TXSPEC);
        applyTxPath = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_APPLYMTD);
        isApply = (applyTxPath != null);
        outNamesFile = params.get(ParameterizedBuiltinFunctionExpression.TF_FN_PARAM_OUTNAMES); // can be null

        inputCSVProperties = (CSVFileFormatProperties)inputMatrix.getFileFormatProperties();
    }
}
/**
 * Moves the transformation metadata files (output header, dummycode header,
 * column types and, if present, the dummycode file) from a temporary
 * directory into the metadata directory txMtdPath.
 *
 * These files are generated by a single mapper while the transformations
 * are applied, whereas txMtdPath is read by other mappers at the same time.
 * Writing them to a temporary location first avoids MR task failures due to
 * changing timestamps on txMtdPath.
 *
 * @param fs file system used for the existence checks and directory creation
 * @param tmpPath temporary directory holding the generated files
 * @param txMtdPath destination metadata directory
 * @throws IllegalArgumentException
 * @throws IOException
 */
private static void moveFilesFromTmp(FileSystem fs, String tmpPath, String txMtdPath) throws IllegalArgumentException, IOException
{
    // the three files that are always moved
    final String headerSrc = tmpPath + "/" + TransformationAgent.OUT_HEADER;
    final String headerDst = txMtdPath + "/" + TransformationAgent.OUT_HEADER;
    MapReduceTool.renameFileOnHDFS(headerSrc, headerDst);

    final String dcdHeaderSrc = tmpPath + "/" + TransformationAgent.OUT_DCD_HEADER;
    final String dcdHeaderDst = txMtdPath + "/" + TransformationAgent.OUT_DCD_HEADER;
    MapReduceTool.renameFileOnHDFS(dcdHeaderSrc, dcdHeaderDst);

    final String colTypesSrc = tmpPath + "/" + TransformationAgent.COLTYPES_FILE_NAME;
    final String colTypesDst = txMtdPath + "/" + TransformationAgent.COLTYPES_FILE_NAME;
    MapReduceTool.renameFileOnHDFS(colTypesSrc, colTypesDst);

    // the dummycode file exists only for some jobs
    final String dcdSrc = tmpPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME;
    if ( !fs.exists(new Path(dcdSrc)) )
        return;

    final Path dcdDir = new Path(txMtdPath + "/Dummycode/");
    if ( !fs.exists(dcdDir) )
        fs.mkdirs(dcdDir);

    final String dcdDst = txMtdPath + "/Dummycode/" + TransformationAgent.DCD_FILE_NAME;
    MapReduceTool.renameFileOnHDFS(dcdSrc, dcdDst);
}
/**
 * Helper function to determine the number of columns after applying
 * transformations. Note that dummycoding changes the number of columns.
 *
 * The count starts at the number of names in the header. For every
 * dummycoded column listed in tfMtdPath/spec.json, it grows by (k-1), where
 * k is read from the column's bin metadata file if that file is valid, and
 * otherwise from the column's recode "distinct count" file.
 *
 * @param fs file system used to read the metadata files
 * @param header delimiter-separated column names
 * @param delim column delimiter (matched literally)
 * @param tfMtdPath transformation metadata directory
 * @return number of columns in the transformed data
 * @throws IllegalArgumentException
 * @throws IOException if a metadata file cannot be read
 * @throws DMLRuntimeException if neither bin nor recode metadata exists for a dummycoded column
 * @throws JSONException if spec.json cannot be parsed
 */
private static int getNumColumnsTf(FileSystem fs, String header, String delim, String tfMtdPath) throws IllegalArgumentException, IOException, DMLRuntimeException, JSONException {
    String[] columnNames = Pattern.compile(Pattern.quote(delim)).split(header, -1);
    int ret = columnNames.length;

    JSONObject spec = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(tfMtdPath + "/spec.json"))));
    try {
        spec = JSONHelper.parse(br);
    }
    finally {
        // close the stream even if parsing fails
        br.close();
    }

    // fetch relevant attribute lists
    if ( !spec.containsKey(TX_METHOD.DUMMYCODE.toString()) )
        return ret;

    JSONArray dcdList = (JSONArray) ((JSONObject)spec.get(TX_METHOD.DUMMYCODE.toString())).get(TransformationAgent.JSON_ATTRS);

    // look for numBins among binned columns
    for(Object o : dcdList)
    {
        int id = UtilFunctions.toInt(o);

        Path binpath = new Path( tfMtdPath + "/Bin/" + UtilFunctions.unquote(columnNames[id-1]) + TransformationAgent.BIN_FILE_SUFFIX);
        Path rcdpath = new Path( tfMtdPath + "/Recode/" + UtilFunctions.unquote(columnNames[id-1]) + TransformationAgent.NDISTINCT_FILE_SUFFIX);

        if ( TfUtils.checkValidInputFile(fs, binpath, false ) )
        {
            br = new BufferedReader(new InputStreamReader(fs.open(binpath)));
            try {
                // the number of bins is the field at index 4 of the first line
                int nbins = UtilFunctions.parseToInt(br.readLine().split(TransformationAgent.TXMTD_SEP)[4]);
                ret += (nbins-1);
            }
            finally {
                br.close();
            }
        }
        else if ( TfUtils.checkValidInputFile(fs, rcdpath, false ) )
        {
            br = new BufferedReader(new InputStreamReader(fs.open(rcdpath)));
            try {
                // the first line holds the number of distinct values
                int ndistinct = UtilFunctions.parseToInt(br.readLine());
                ret += (ndistinct-1);
            }
            finally {
                br.close();
            }
        }
        else
            throw new DMLRuntimeException("Relevant transformation metadata for column (id=" + id + ", name=" + columnNames[id-1] + ") is not found.");
    }

    return ret;
}
/**
 * Main method to create and/or apply transformation metadata using MapReduce.
 *
 * The first instruction in shuffleInst is parsed into the transform
 * operands. When a specification file is given, transformation metadata is
 * generated first and then applied; when existing metadata is to be applied,
 * it is copied to the target metadata path and applied. Outputs are written
 * in CSV and/or binary block format depending on the format properties of
 * outputMatrices; only the first output of each format is written. If a CSV
 * result exists it is returned, otherwise the binary block result.
 *
 * @param jobinst MR job instruction (not referenced in this method)
 * @param inputMatrices inputs; only the first entry is used
 * @param shuffleInst instruction string; the first instruction is the transform, the second is passed on to the binary block jobs
 * @param otherInst passed through to the binary block apply job
 * @param resultIndices (not referenced in this method)
 * @param outputMatrices outputs, split by file format (CSV vs. everything else)
 * @param numReducers (not referenced in this method)
 * @param replication replication setting passed to the MR jobs
 * @return job result of the CSV output if present, else of the binary block output, else null
 * @throws Exception if any of the underlying jobs or file operations fails, or the output has zero rows
 */
public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputMatrices, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputMatrices, int numReducers, int replication) throws Exception {

    String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM);

    // Parse transform instruction (the first instruction) to obtain relevant fields
    TransformOperands oprnds = new TransformOperands(insts[0], inputMatrices[0]);

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    // find the first file in alphabetical ordering of partfiles in directory inputPath
    String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);

    // find column names
    String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
    HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
    String outHeader = getOutputHeader(fs, headerLine, oprnds);
    int numColumns = colNamesToIds.size();

    int numColumnsTf = 0;
    long numRowsTf = 0;

    // typed lists: the stored positions are used as array indices below
    ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
    ArrayList<Integer> bboutputs = new ArrayList<Integer>();

    // divide output objects based on output format (CSV or BinaryBlock)
    for(int i=0; i < outputMatrices.length; i++)
    {
        if(outputMatrices[i].getFileFormatProperties() != null
                && outputMatrices[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
            csvoutputs.add(i);
        else
            bboutputs.add(i);
    }
    boolean isCSV = (csvoutputs.size() > 0);
    boolean isBB = (bboutputs.size() > 0);
    String tmpPath = MRJobConfiguration.constructTempOutputFilename();

    JobReturn retCSV = null, retBB = null;

    if (!oprnds.isApply) {
        // build specification file with column IDs instead of column names
        String specFileWithIDs = processSpecFile(fs, oprnds.inputPath,
                smallestFile, colNamesToIds,
                oprnds.inputCSVProperties,
                oprnds.specFile);
        colNamesToIds = null; // enable GC on colNamesToIds

        // Build transformation metadata, including recode maps, bin definitions, etc.
        // Also, generate part offsets file (counters file), which is to be used in csv-reblock
        String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
        numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath,
                oprnds.txMtdPath, specFileWithIDs,
                smallestFile, partOffsetsFile,
                oprnds.inputCSVProperties, numColumns,
                replication, outHeader);

        if ( numRowsTf == 0 )
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

        // store the specFileWithIDs as transformation metadata
        MapReduceTool.copyFileOnHDFS(specFileWithIDs, oprnds.txMtdPath + "/" + "spec.json");

        numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);

        // Apply transformation metadata, and perform actual transformation
        if(isCSV)
            retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specFileWithIDs,
                    oprnds.txMtdPath, tmpPath,
                    outputMatrices[csvoutputs.get(0)].getFileName(),
                    partOffsetsFile, oprnds.inputCSVProperties, numColumns,
                    replication, outHeader);

        if(isBB)
        {
            DMLConfig conf = ConfigurationManager.getConfig();
            int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
            CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);

            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[]{oprnds.inputPath},
                    new InputInfo[]{InputInfo.CSVInputInfo},
                    new int[]{blockSize}, new int[]{blockSize},
                    rblk.toString(), replication, new String[]{smallestFile},
                    true, oprnds.inputCSVProperties.getNAStrings(), specFileWithIDs);
            if ( ret1.rlens[0] == 0 )
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

            retBB = ApplyTfBBMR.runJob(oprnds.inputPath,
                    insts[1], otherInst,
                    specFileWithIDs, oprnds.txMtdPath,
                    tmpPath, outputMatrices[bboutputs.get(0)].getFileName(),
                    ret1.counterFile.toString(), oprnds.inputCSVProperties,
                    numRowsTf, numColumns, numColumnsTf,
                    replication, outHeader);
        }

        MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
    }
    else {
        colNamesToIds = null; // enable GC on colNamesToIds

        // copy given transform metadata (applyTxPath) to specified location (txMtdPath)
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
        MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);

        // path to specification file
        String specFileWithIDs = oprnds.txMtdPath + "/" + "spec.json";
        numColumnsTf = getNumColumnsTf(fs, outHeader,
                oprnds.inputCSVProperties.getDelim(),
                oprnds.txMtdPath);

        if (isCSV)
        {
            DMLConfig conf = ConfigurationManager.getConfig();
            int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
            CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);

            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[]{oprnds.inputPath},
                    new InputInfo[]{InputInfo.CSVInputInfo},
                    new int[]{blockSize}, new int[]{blockSize},
                    rblk.toString(), replication, new String[]{smallestFile},
                    true, oprnds.inputCSVProperties.getNAStrings(), specFileWithIDs);
            numRowsTf = ret1.rlens[0];

            if ( ret1.rlens[0] == 0 )
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

            // Apply transformation metadata, and perform actual transformation
            retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specFileWithIDs,
                    oprnds.applyTxPath, tmpPath,
                    outputMatrices[csvoutputs.get(0)].getFileName(),
                    ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns,
                    replication, outHeader);
        }

        if(isBB)
        {
            // compute part offsets file
            CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(insts[1]);
            CSVReblockInstruction newrblk = (CSVReblockInstruction) rblk.clone((byte)0);
            AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[]{oprnds.inputPath},
                    new InputInfo[]{InputInfo.CSVInputInfo},
                    new int[]{newrblk.brlen}, new int[]{newrblk.bclen},
                    newrblk.toString(), replication, new String[]{smallestFile},
                    true, oprnds.inputCSVProperties.getNAStrings(), specFileWithIDs);
            numRowsTf = ret1.rlens[0];

            if ( ret1.rlens[0] == 0 )
                throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

            // apply transformation metadata, as well as reblock the resulting data
            retBB = ApplyTfBBMR.runJob(oprnds.inputPath,
                    insts[1], otherInst, specFileWithIDs,
                    oprnds.txMtdPath, tmpPath,
                    outputMatrices[bboutputs.get(0)].getFileName(),
                    ret1.counterFile.toString(),
                    oprnds.inputCSVProperties,
                    ret1.rlens[0], ret1.clens[0], numColumnsTf,
                    replication, outHeader);
        }
    }

    // copy auxiliary data (old and new header lines) from temporary location to txMtdPath
    moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);

    // generate matrix metadata file for outputs
    if ( retCSV != null )
    {
        retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);

        CSVFileFormatProperties prop = new CSVFileFormatProperties(
                false,
                oprnds.inputCSVProperties.getDelim(), // use the same delimiter as the input
                false, Double.NaN, null);

        MapReduceTool.writeMetaDataFile (outputMatrices[csvoutputs.get(0)].getFileName()+".mtd",
                ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0),
                OutputInfo.CSVOutputInfo, prop);
        return retCSV;
    }

    if ( retBB != null )
    {
        retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);

        MapReduceTool.writeMetaDataFile (outputMatrices[bboutputs.get(0)].getFileName()+".mtd",
                ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo);
        return retBB;
    }

    return null;
}
/**
 * Assembles the textual form of an MR CSV-reblock instruction (operands
 * "0" and "1", both typed as DOUBLE matrices, the given block size for rows
 * and columns, followed by the header flag, delimiter, fill flag and fill
 * value of the CSV properties) and parses it into an instruction object.
 *
 * @param prop CSV properties encoded into the instruction
 * @param blockSize block size used for both block dimensions
 * @return the parsed CSV reblock instruction
 */
private static CSVReblockInstruction prepDummyReblockInstruction(CSVFileFormatProperties prop, int blockSize) {
    final StringBuilder inst = new StringBuilder();

    // execution type and opcode
    inst.append( ExecType.MR ).append( Lop.OPERAND_DELIMITOR )
        .append( CSVReBlock.OPCODE ).append( Lop.OPERAND_DELIMITOR );

    // the two operands, each annotated as a DOUBLE matrix
    for (String operand : new String[] { "0", "1" }) {
        inst.append( operand )
            .append( Lop.DATATYPE_PREFIX ).append( DataType.MATRIX )
            .append( Lop.VALUETYPE_PREFIX ).append( ValueType.DOUBLE )
            .append( Lop.OPERAND_DELIMITOR );
    }

    // block dimensions
    inst.append( blockSize ).append( Lop.OPERAND_DELIMITOR )
        .append( blockSize ).append( Lop.OPERAND_DELIMITOR );

    // CSV properties
    inst.append( prop.hasHeader() ).append( Lop.OPERAND_DELIMITOR )
        .append( prop.getDelim() ).append( Lop.OPERAND_DELIMITOR )
        .append( prop.isFill() ).append( Lop.OPERAND_DELIMITOR )
        .append( prop.getFillValue() );

    return (CSVReblockInstruction) CSVReblockInstruction.parseInstruction(inst.toString());
}
/**
 * Determines the header line of the transformed output.
 *
 * When existing metadata is applied, the header is the first line of the
 * output header file under applyTxPath. Otherwise it is the first line of
 * the output names file if one was given, and the input header line if not.
 *
 * @param fs file system used to read the header or names file
 * @param headerLine header line of the input data
 * @param oprnds transform operands (isApply, applyTxPath, outNamesFile)
 * @return header line for the output; null if the file that is read is empty
 * @throws IOException if the header or names file cannot be read
 */
private static String getOutputHeader(FileSystem fs, String headerLine, TransformOperands oprnds) throws IOException
{
    // no file to consult: the output keeps the input header
    if ( !oprnds.isApply && oprnds.outNamesFile == null )
        return headerLine;

    Path headerSource = oprnds.isApply
            ? new Path(oprnds.applyTxPath + "/" + TransformationAgent.OUT_HEADER)
            : new Path(oprnds.outNamesFile);

    BufferedReader br = new BufferedReader(new InputStreamReader( fs.open(headerSource) ));
    try {
        return br.readLine();
    }
    finally {
        // release the stream even when readLine() fails
        br.close();
    }
}
/**
 * Entry point for creating and/or applying transformation metadata
 * in-memory, on a single node, starting from a CP instruction. The operands
 * are taken from the instruction's parameters and the first input matrix;
 * the work is delegated to the operand-based overload.
 *
 * @param inst parameterized builtin CP instruction carrying the transform parameters
 * @param inputMatrices inputs; the first entry supplies the input path and CSV properties
 * @param outputMatrices outputs of the transformation
 * @return result of the operand-based cpDataTransform overload
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws JSONException
 * @throws IllegalArgumentException
 */
public static JobReturn cpDataTransform(ParameterizedBuiltinCPInstruction inst, MatrixObject[] inputMatrices, MatrixObject[] outputMatrices) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    return cpDataTransform(new TransformOperands(inst, inputMatrices[0]), inputMatrices, outputMatrices);
}
/**
 * Variant of cpDataTransform that starts from an instruction string. The
 * string is split on Instruction.INSTRUCTION_DELIM and the first
 * instruction is parsed into the transform operands; the work is delegated
 * to the operand-based overload.
 *
 * @param inst instruction string whose first instruction is the transform
 * @param inputMatrices inputs; the first entry supplies the input path and CSV properties
 * @param outputMatrices outputs of the transformation
 * @return result of the operand-based cpDataTransform overload
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws IllegalArgumentException
 * @throws JSONException
 */
public static JobReturn cpDataTransform(String inst, MatrixObject[] inputMatrices, MatrixObject[] outputMatrices) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    // only the first instruction describes the transform
    final String transformInst = inst.split(Instruction.INSTRUCTION_DELIM)[0];
    final TransformOperands operands = new TransformOperands(transformInst, inputMatrices[0]);
    return cpDataTransform(operands, inputMatrices, outputMatrices);
}
/**
 * Creates and/or applies transformation metadata for already-parsed operands.
 * Reads the header of the first part file of the input, resolves the column
 * names and the output header, prepares the specification file (either built
 * from <code>oprnds.specFile</code> or copied from <code>oprnds.applyTxPath</code>),
 * and finally invokes <code>performTransform</code>.
 *
 * @param oprnds parsed transform operands
 * @param inputMatrices input matrix objects (not read by this method)
 * @param outputMatrices output matrix objects; their file formats determine the
 *        CSV/binary-block flags, and the first entry is handed to performTransform
 * @return the JobReturn produced by performTransform
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws JSONException
 * @throws IllegalArgumentException
 */
public static JobReturn cpDataTransform(TransformOperands oprnds, MatrixObject[] inputMatrices, MatrixObject[] outputMatrices) throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
	JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
	FileSystem fs = FileSystem.get(job);

	// first file in alphabetical ordering of part files in directory inputPath
	String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);

	// column names and output header
	String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
	HashMap colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
	String outHeader = getOutputHeader(fs, headerLine, oprnds);

	// classify the outputs by format: CSV vs. everything else (binary block)
	boolean isCSV = false;
	boolean isBB = false;
	for( MatrixObject out : outputMatrices )
	{
		boolean csvFormat = out.getFileFormatProperties() != null && out.getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV;
		if( csvFormat )
			isCSV = true;
		else
			isBB = true;
	}

	String specFileWithIDs;
	if( oprnds.isApply ) {
		// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
		MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
		MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
		// path to specification file
		specFileWithIDs = oprnds.txMtdPath + "/" + "spec.json";
	}
	else {
		// build specification file with column IDs instead of column names
		specFileWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.specFile);
		MapReduceTool.copyFileOnHDFS(specFileWithIDs, oprnds.txMtdPath + "/" + "spec.json");
	}

	// both modes end in the same transform call; only the spec file preparation differs
	return performTransform(job, fs, oprnds.inputPath, colNamesToIds.size(), oprnds.inputCSVProperties, specFileWithIDs, oprnds.txMtdPath, oprnds.isApply, outputMatrices[0], outHeader, isBB, isCSV );
}
/**
 * Helper function to collect the files that make up the given input.
 * If <code>input</code> is not a directory, the result holds just that path.
 * Otherwise it holds the paths of all directory entries accepted by
 * <code>CSVReblockMR.hiddenFileFilter</code>, sorted via
 * <code>Collections.sort</code>.
 *
 * @param input path of the input file or directory
 * @param fs file system used to inspect the path
 * @return list of input file paths
 * @throws FileNotFoundException
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private static ArrayList collectInputFiles(String input, FileSystem fs) throws FileNotFoundException, IOException
{
	Path inputPath = new Path(input);
	ArrayList partFiles = new ArrayList();

	// single file: nothing to list or sort
	if( !fs.isDirectory(inputPath) ) {
		partFiles.add(inputPath);
		return partFiles;
	}

	FileStatus[] entries = fs.listStatus(inputPath, CSVReblockMR.hiddenFileFilter);
	for( int i = 0; i < entries.length; i++ )
		partFiles.add(entries[i].getPath());
	Collections.sort(partFiles);
	return partFiles;
}
private static int[] countNumRows(ArrayList files, CSVFileFormatProperties prop, FileSystem fs, TfUtils agents) throws IOException
{
int[] rows = new int[2];
int numRows=0, numRowsTf=0;
OmitAgent oa = agents.getOmitAgent();
if(!oa.isApplicable())
{
for(int fileNo=0; fileNo files = collectInputFiles(inputPath, fs);
// ---------------------------------
// Construct transformation metadata
// ---------------------------------
String line = null;
String[] words = null;
int numColumnsTf=0;
BufferedReader br = null;
if (!isApply) {
for(int fileNo=0; fileNo colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
int numColumns = colNamesToIds.size();
String outHeader = getOutputHeader(fs, headerLine, oprnds);
String tmpPath = MRJobConfiguration.constructTempOutputFilename();
// Construct RDD for input data
@SuppressWarnings("unchecked")
JavaPairRDD inputData = (JavaPairRDD) sec.getRDDHandleForMatrixObject(inputMatrices[0], InputInfo.CSVInputInfo);
JavaRDD> csvLines = JavaPairRDD.toRDD(inputData).toJavaRDD();
long numRowsTf=0, numColumnsTf=0;
JavaPairRDD tfPairRDD = null;
if (!oprnds.isApply) {
// build specification file with column IDs instead of column names
String specFileWithIDs = processSpecFile(fs, oprnds.inputPath,
smallestFile, colNamesToIds,
oprnds.inputCSVProperties,
oprnds.specFile);
colNamesToIds = null; // enable GC on colNamesToIds
// Build transformation metadata, including recode maps, bin definitions, etc.
// Also, generate part offsets file (counters file), which is to be used in csv-reblock (if needed)
String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
numRowsTf = GenTfMtdSPARK.runSparkJob(sec, csvLines, oprnds.txMtdPath,
specFileWithIDs,partOffsetsFile,
oprnds.inputCSVProperties, numColumns,
outHeader);
// store the specFileWithIDs as transformation metadata
MapReduceTool.copyFileOnHDFS(specFileWithIDs, oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines,
oprnds.txMtdPath, specFileWithIDs, tmpPath,
oprnds.inputCSVProperties, numColumns, outHeader);
MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
}
else {
colNamesToIds = null; // enable GC on colNamesToIds
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// path to specification file
String specFileWithIDs = oprnds.txMtdPath + "/" + "spec.json";
numColumnsTf = getNumColumnsTf(fs, outHeader,
oprnds.inputCSVProperties.getDelim(),
oprnds.txMtdPath);
// Apply transformation metadata, and perform actual transformation
tfPairRDD = ApplyTfCSVSPARK.runSparkJob(sec, csvLines,
oprnds.txMtdPath, specFileWithIDs, tmpPath,
oprnds.inputCSVProperties, numColumns, outHeader);
}
// copy auxiliary data (old and new header lines) from temporary location to txMtdPath
moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
// convert to csv output format (serialized longwritable/text)
JavaPairRDD outtfPairRDD =
RDDConverterUtils.stringToSerializableText(tfPairRDD);
if ( outtfPairRDD != null )
{
MatrixObject outMO = outputMatrices[0];
String outVar = outMO.getVarName();
outMO.setRDDHandle(new RDDObject(outtfPairRDD, outVar));
sec.addLineageRDD(outVar, inst.getParams().get("target"));
//update output statistics (required for correctness)
MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(outVar);
mcOut.setDimension(numRowsTf, numColumnsTf);
mcOut.setNonZeros(-1);
}
}
}