// NOTE: Maven-repository page artifacts ("Go to download", artifact listing text)
// that preceded the license header were removed; they were not part of the source.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.transform;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import scala.Tuple2;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
public class ApplyTfCSVSPARK {
/**
* Apply transformation metadata and generate the result in CSV format, as a
* JavaRDD of Strings.
*/
public static JavaPairRDD runSparkJob(
SparkExecutionContext sec, JavaRDD> inputRDD,
String tfMtdPath, String specFile,
String tmpPath, CSVFileFormatProperties prop,
int numCols, String headerLine
) throws IOException, ClassNotFoundException, InterruptedException, IllegalArgumentException, JSONException {
// Load transformation metadata and broadcast it
JobConf job = new JobConf();
FileSystem fs = FileSystem.get(job);
String[] naStrings = TfUtils.parseNAStrings(prop.getNAStrings());
JSONObject spec = TfUtils.readSpec(fs, specFile);
TfUtils _tfmapper = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), naStrings, spec, numCols, tfMtdPath, null, tmpPath);
_tfmapper.loadTfMetadata();
Broadcast bcast_tf = sec.getSparkContext().broadcast(_tfmapper);
/*
* Construct transformation metadata (map-side) -- the logic is similar
* to GTFMTDMapper
*
* Note: The result of mapPartitionsWithIndex is cached so that the
* transformed data is not redundantly computed multiple times
*/
JavaPairRDD applyRDD = inputRDD
.mapPartitionsWithIndex( new ApplyTfCSVMap(bcast_tf), true)
.mapToPair(
new PairFunction(){
private static final long serialVersionUID = 3868143093999082931L;
@Override
public Tuple2 call(String t) throws Exception {
return new Tuple2(new Long(1), t);
}
}
).cache();
/*
* An action to force execution of apply()
*
* We need to trigger the execution of this RDD so as to ensure the
* creation of a few metadata files (headers, dummycoded information,
* etc.), which are referenced in the caller function.
*/
applyRDD.count();
return applyRDD;
}
public static class ApplyTfCSVMap implements Function2>, Iterator> {
private static final long serialVersionUID = 1496686437276906911L;
TfUtils _tfmapper = null;
ApplyTfCSVMap(boolean hasHeader, String delim, String naStrings, String specFile, String tmpPath, String tfMtdPath, long numCols, String headerLine, Broadcast tf) throws IllegalArgumentException, IOException, JSONException {
_tfmapper = tf.getValue();
}
ApplyTfCSVMap(Broadcast tf) throws IllegalArgumentException, IOException, JSONException {
_tfmapper = tf.getValue();
}
@Override
public Iterator call(Integer partitionID,
Iterator> csvLines) throws Exception {
boolean first = true;
Tuple2 rec = null;
ArrayList outLines = new ArrayList();
while(csvLines.hasNext()) {
rec = csvLines.next();
if (first && partitionID == 0) {
first = false;
_tfmapper.processHeaderLine();
if (_tfmapper.hasHeader() ) {
//outLines.add(dcdHeader); // if the header needs to be preserved in the output file
continue;
}
}
// parse the input line and apply transformation
String[] words = _tfmapper.getWords(rec._2());
if(!_tfmapper.omit(words))
{
try {
words = _tfmapper.apply(words);
String outStr = _tfmapper.checkAndPrepOutputString(words);
outLines.add(outStr);
}
catch(DMLRuntimeException e) {
throw new RuntimeException(e.getMessage() + ": " + rec._2().toString());
}
}
}
return outLines.iterator();
}
}
}