Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it as often as you want.
/*
* * Copyright 2016 Skymind,Inc.
* *
* * Licensed under the Apache License, Version 2.0 (the "License");
* * you may not use this file except in compliance with the License.
* * You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS,
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* * See the License for the specific language governing permissions and
* * limitations under the License.
*/
package org.deeplearning4j.spark.datavec;
import org.apache.spark.api.java.function.Function;
import org.datavec.api.io.WritableConverter;
import org.datavec.api.writable.Writable;
import org.datavec.common.data.NDArrayWritable;
import org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.DataSetPreProcessor;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.indexing.NDArrayIndex;
import org.nd4j.linalg.util.FeatureUtil;
import scala.Tuple2;
import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
/**Maps {@code Tuple2<List<List<Writable>>,List<List<Writable>>>} objects (the output of TWO datavec-spark
* sequence record reader functions) to DataSet objects for Spark training.
* Analogous to {@link SequenceRecordReaderDataSetIterator}, but in the context of Spark.
* Supports loading data from TWO sources only; hence supports many-to-one and one-to-many situations.
* See {@link DataVecSequenceDataSetFunction} for the single file version.
* @author Alex Black
*/
public class DataVecSequencePairDataSetFunction implements Function>,List>>,DataSet>, Serializable {
/**
 * Alignment mode for dealing with input/labels of differing lengths (for example, one-to-many and
 * many-to-one type situations). For example, a sequence might have 10 time steps in total but only a
 * single label at the end, as in sequence classification.
 */
public enum AlignmentMode {
    /** Default. Assume that the label and input time series are of equal length. */
    EQUAL_LENGTH,

    /**
     * Align the label/input time series at the first time step, and zero pad either the labels or
     * the input at the end (whichever is shorter gets padded).
     */
    ALIGN_START,

    /**
     * Align the label/input time series at the last time step, zero padding either the input or the
     * labels as required.
     */
    ALIGN_END
}
/** True: label values are copied into the label array unmodified. False: the first value of each label
 * time step is read as a class index and converted to a one-hot vector. */
private final boolean regression;
/** Length of the one-hot label vector; only read when {@link #regression} is false. */
private final int numPossibleLabels;
/** How feature and label sequences are combined when their lengths differ. */
private final AlignmentMode alignmentMode;
/** Optional data set preprocessor (may be null). */
private final DataSetPreProcessor preProcessor;
/** Optional converter (may be null). When set, it is applied to each feature value and, in regression
 * mode, to each label value before the value is read as a double. */
private final WritableConverter converter;
/**
 * Creates a function for equal-length feature/label sequences whose labels are used without conversion
 * (i.e., regression, or labels that are already in one-hot representation).
 * No data set preprocessor and no writable converter are used.
 */
public DataVecSequencePairDataSetFunction(){
    // numPossibleLabels is irrelevant in regression mode, hence the -1 placeholder
    this(-1, true, AlignmentMode.EQUAL_LENGTH, null, null);
}
/**
 * Creates a function for equal-length feature/label sequences, with no data set preprocessor and no
 * writable converter.
 *
 * @param numPossibleLabels Number of classes for classification (not used if regression = true)
 * @param regression        False for classification, true for regression
 * @see #DataVecSequencePairDataSetFunction(int, boolean, AlignmentMode, DataSetPreProcessor, WritableConverter)
 */
public DataVecSequencePairDataSetFunction(int numPossibleLabels, boolean regression){
    this(numPossibleLabels, regression, AlignmentMode.EQUAL_LENGTH, null, null);
}
/**
 * Constructor for data with a specified alignment mode; no data set preprocessor or writable converter
 * is used (both are passed on as null).
 *
 * @param numPossibleLabels Number of classes for classification (not used if regression = true)
 * @param regression        False for classification, true for regression
 * @param alignmentMode     Alignment mode for data. See {@link DataVecSequencePairDataSetFunction.AlignmentMode}
 * @see #DataVecSequencePairDataSetFunction(int, boolean, AlignmentMode, DataSetPreProcessor, WritableConverter)
 */
public DataVecSequencePairDataSetFunction(int numPossibleLabels, boolean regression, AlignmentMode alignmentMode){
this(numPossibleLabels, regression, alignmentMode, null, null);
}
/**
 * Fully specified constructor: stores the supplied configuration without modification.
 *
 * @param numPossibleLabels Number of classes for classification (not used if regression = true)
 * @param regression        False for classification, true for regression
 * @param alignmentMode     Alignment mode for data. See {@link DataVecSequencePairDataSetFunction.AlignmentMode}
 * @param preProcessor      DataSetPreprocessor (may be null)
 * @param converter         WritableConverter (may be null)
 */
public DataVecSequencePairDataSetFunction(int numPossibleLabels, boolean regression,
        AlignmentMode alignmentMode, DataSetPreProcessor preProcessor,
        WritableConverter converter){
    // Optional collaborators; either may be null
    this.converter = converter;
    this.preProcessor = preProcessor;

    // Label handling and sequence alignment settings
    this.alignmentMode = alignmentMode;
    this.regression = regression;
    this.numPossibleLabels = numPossibleLabels;
}
@Override
public DataSet call(Tuple2>,List>> input) throws Exception {
List> featuresSeq = input._1();
List> labelsSeq = input._2();
int featuresLength = featuresSeq.size();
int labelsLength = labelsSeq.size();
Iterator> fIter = featuresSeq.iterator();
Iterator> lIter = labelsSeq.iterator();
INDArray inputArr = null;
INDArray outputArr = null;
int[] idx = new int[3];
int i = 0;
while(fIter.hasNext()){
List step = fIter.next();
if (i == 0) {
int[] inShape = new int[]{1,step.size(),featuresLength};
inputArr = Nd4j.create(inShape);
}
Iterator timeStepIter = step.iterator();
int f = 0;
idx[1] = 0;
while (timeStepIter.hasNext()) {
Writable current = timeStepIter.next();
if(converter != null) current = converter.convert(current);
try {
inputArr.putScalar(idx, current.toDouble());
} catch (UnsupportedOperationException e) {
// This isn't a scalar, so check if we got an array already
if (current instanceof NDArrayWritable) {
inputArr.get(NDArrayIndex.point(idx[0]), NDArrayIndex.all(), NDArrayIndex.point(idx[2]))
.putRow(0, ((NDArrayWritable)current).get());
} else {
throw e;
}
}
idx[1] = ++f;
}
idx[2] = ++i;
}
idx = new int[3];
i = 0;
while(lIter.hasNext()){
List step = lIter.next();
if (i == 0) {
int[] outShape = new int[]{1,(regression ? step.size() : numPossibleLabels),labelsLength};
outputArr = Nd4j.create(outShape);
}
Iterator timeStepIter = step.iterator();
int f = 0;
idx[1] = 0;
if(regression){
//Load all values without modification
while (timeStepIter.hasNext()) {
Writable current = timeStepIter.next();
if(converter != null) current = converter.convert(current);
outputArr.putScalar(idx, current.toDouble());
idx[1] = ++f;
}
} else {
//Expect a single value (index) -> convert to one-hot vector
Writable value = timeStepIter.next();
int labelClassIdx = value.toInt();
INDArray line = FeatureUtil.toOutcomeVector(labelClassIdx, numPossibleLabels);
outputArr.tensorAlongDimension(i, 1).assign(line); //1d from [1,nOut,timeSeriesLength] -> tensor i along dimension 1 is at time i
}
idx[2] = ++i;
}
DataSet ds;
if(alignmentMode == AlignmentMode.EQUAL_LENGTH || featuresLength == labelsLength){
ds = new DataSet(inputArr,outputArr);
} else if(alignmentMode == AlignmentMode.ALIGN_END){
if(featuresLength > labelsLength ){
//Input longer, pad output
INDArray newOutput = Nd4j.create(1,outputArr.size(1),featuresLength);
newOutput.get(NDArrayIndex.point(0),NDArrayIndex.all(), NDArrayIndex.interval(featuresLength-labelsLength,featuresLength))
.assign(outputArr);
//Need an output mask array, but not an input mask array
INDArray outputMask = Nd4j.create(1,featuresLength);
for( int j=featuresLength-labelsLength; j labelsLength ){
//Input longer, pad output
INDArray newOutput = Nd4j.create(1,outputArr.size(1),featuresLength);
newOutput.get(NDArrayIndex.point(0),NDArrayIndex.all(), NDArrayIndex.interval(0,labelsLength)).assign(outputArr);
//Need an output mask array, but not an input mask array
INDArray outputMask = Nd4j.create(1,featuresLength);
for( int j=0; j