// NOTE(review): removed code-reseller boilerplate that preceded the package declaration
// (it made the file uncompilable). This source originates from the open-source
// Alibaba Alink project, which is freely available under the Apache License 2.0.
package com.alibaba.alink.operator.common.linear;
import com.alibaba.alink.common.MLEnvironment;
import com.alibaba.alink.common.MLEnvironmentFactory;
import com.alibaba.alink.common.linalg.DenseVector;
import com.alibaba.alink.common.linalg.SparseVector;
import com.alibaba.alink.common.linalg.Vector;
import com.alibaba.alink.common.linalg.VectorUtil;
import com.alibaba.alink.common.model.ModelParamName;
import com.alibaba.alink.common.utils.TableUtil;
import com.alibaba.alink.operator.batch.BatchOperator;
import com.alibaba.alink.operator.common.linear.unarylossfunc.*;
import com.alibaba.alink.operator.common.optim.Lbfgs;
import com.alibaba.alink.operator.common.optim.OptimMethod;
import com.alibaba.alink.operator.common.optim.OptimizerFactory;
import com.alibaba.alink.operator.common.optim.Owlqn;
import com.alibaba.alink.operator.common.optim.objfunc.OptimObjFunc;
import com.alibaba.alink.operator.common.statistics.StatisticsHelper;
import com.alibaba.alink.operator.common.statistics.basicstatistic.BaseVectorSummary;
import com.alibaba.alink.operator.common.statistics.basicstatistic.SparseVectorSummary;
import com.alibaba.alink.params.regression.LassoRegTrainParams;
import com.alibaba.alink.params.regression.LinearSvrTrainParams;
import com.alibaba.alink.params.regression.RidgeRegTrainParams;
import com.alibaba.alink.params.shared.linear.HasL1;
import com.alibaba.alink.params.shared.linear.LinearTrainParams;
import org.apache.flink.api.common.functions.*;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.ml.api.misc.param.Params;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class of linear model training. Linear binary classification and linear regression
 * algorithms should inherit this class. Subclasses then only need to implement the code of
 * the loss function and the regularization item.
 *
 * @param <T> the concrete operator type (self-bounded); e.g. the SVM, linear regression or LR parameter.
 */
public abstract class BaseLinearModelTrainBatchOp> extends BatchOperator {
private String modelName;
private LinearModelType linearModelType;
private static final int NUM_FEATURE_THRESHOLD = 10000;
private static final String META = "meta";
private static final String MEAN_VAR = "meanVar";
private static final String VECTOR_SIZE = "vectorSize";
private static final String LABEL_VALUES = "labelValues";
/**
* @param params parameters needed by training process.
* @param modelType model type: LR, SVR, SVM, Ridge ...
* @param modelName name of model.
*/
public BaseLinearModelTrainBatchOp(Params params, LinearModelType modelType, String modelName) {
super(params);
this.modelName = modelName;
this.linearModelType = modelType;
}
@Override
public T linkFrom(BatchOperator... inputs) {
BatchOperator in = checkAndGetFirst(inputs);
// Get parameters of this algorithm.
Params params = getParams();
// Get type of processing: regression or not
boolean isRegProc = getIsRegProc(params, linearModelType, modelName);
// Get label info : including label values and label type.
Tuple2, TypeInformation> labelInfo = getLabelInfo(in, params, isRegProc);
// Transform data to Tuple3 format.//weight, label, feature vector.
DataSet> initData = transform(in, params, labelInfo.f0, isRegProc);
// Get statistics variables : including vector size, mean and variance of train data.
Tuple2, DataSet>
statInfo = getStatInfo(initData, params.get(LinearTrainParams.STANDARDIZATION));
// Do standardization and interception to train data.
DataSet> trainData = preProcess(initData, params, statInfo.f1);
// Solve the optimization problem.
DataSet> coefVectorSet = optimize(params, statInfo.f0,
trainData, linearModelType, MLEnvironmentFactory.get(getMLEnvironmentId()));
// Prepare the meta info of linear model.
DataSet meta = labelInfo.f0
.mapPartition(new CreateMeta(modelName, linearModelType, isRegProc, params))
.setParallelism(1);
// Build linear model rows, the format to be output.
DataSet modelRows;
String[] featureColTypes = getFeatureTypes(in, params.get(LinearTrainParams.FEATURE_COLS));
modelRows = coefVectorSet
.mapPartition(new BuildModelFromCoefs(labelInfo.f1,
params.get(LinearTrainParams.FEATURE_COLS),
params.get(LinearTrainParams.STANDARDIZATION),
params.get(LinearTrainParams.WITH_INTERCEPT), featureColTypes))
.withBroadcastSet(meta, META)
.withBroadcastSet(statInfo.f1, MEAN_VAR)
.setParallelism(1);
// Convert the model rows to table.
this.setOutput(modelRows, new LinearModelDataConverter(labelInfo.f1).getModelSchema());
return (T)this;
}
/**
* @param trainData train data.
* @param standardization do standardization or not.
* @return return one element. 1. vector size. 2. mean and variance of train data for standardization
*/
private Tuple2, DataSet> getStatInfo(
DataSet> trainData,
final boolean standardization) {
if (standardization) {
DataSet summary = StatisticsHelper.summary(trainData.map(
new MapFunction, Vector>() {
@Override
public Vector map(Tuple3 value) throws Exception {
return value.f2;
}
}).withForwardedFields());
DataSet coefficientDim = summary.map(new MapFunction() {
@Override
public Integer map(BaseVectorSummary value) throws Exception {
return value.vectorSize();
}
});
DataSet meanVar = summary.map(new MapFunction() {
@Override
public DenseVector[] map(BaseVectorSummary value) {
if (value instanceof SparseVectorSummary) {
// If train data format is sparse vector, use maxAbs as variance and set mean zero,
// then, the standardization operation will turn into a scale operation.
// Because if do standardization to sparse vector, vector will be convert to be a dense one.
DenseVector max = ((SparseVector)value.max()).toDenseVector();
DenseVector min = ((SparseVector)value.min()).toDenseVector();
for (int i = 0; i < max.size(); ++i) {
max.set(i, Math.max(Math.abs(max.get(i)), Math.abs(min.get(i))));
min.set(i, 0.0);
}
return new DenseVector[] {min, max};
} else {
return new DenseVector[] {(DenseVector)value.mean(),
(DenseVector)value.standardDeviation()};
}
}
});
return Tuple2.of(coefficientDim, meanVar);
} else {
// If not do standardization, the we use mapReduce to get vector Dim. Mean and var set zero vector.
DataSet coefficientDim = trainData.mapPartition(
new MapPartitionFunction, Integer>() {
@Override
public void mapPartition(Iterable> values, Collector out)
throws Exception {
int ret = -1;
for (Tuple3 val : values) {
if (val.f2 instanceof DenseVector) {
ret = ((DenseVector)val.f2).getData().length;
break;
} else {
int[] ids = ((SparseVector)val.f2).getIndices();
for (int id : ids) {
ret = Math.max(ret, id + 1);
}
}
}
out.collect(ret);
}
}).reduceGroup(new GroupReduceFunction() {
@Override
public void reduce(Iterable values, Collector out) {
int ret = -1;
for (int vSize : values) {
ret = Math.max(ret, vSize);
}
out.collect(ret);
}
});
DataSet meanVar = coefficientDim.map(new MapFunction() {
@Override
public DenseVector[] map(Integer value) {
return new DenseVector[] {new DenseVector(0), new DenseVector(0)};
}
});
return Tuple2.of(coefficientDim, meanVar);
}
}
/**
 * Orders the label values in dictionary (lexicographic) order.
 * Only classification problems need this process.
 *
 * @param unorderedLabelRows unordered label rows
 * @return the label values, sorted in dictionary order
 */
private static Object[] orderLabels(Iterable