
org.datavec.api.transform.reduce.Reducer Maven / Gradle / Ivy

There is a newer version: 1.0.0-M2.1
/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

package org.datavec.api.transform.reduce;

import com.clearspring.analytics.util.Preconditions;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import org.datavec.api.transform.ColumnType;
import org.datavec.api.transform.ReduceOp;
import org.datavec.api.transform.condition.Condition;
import org.datavec.api.transform.condition.column.TrivialColumnCondition;
import org.datavec.api.transform.metadata.*;
import org.datavec.api.transform.ops.*;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.Writable;
import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties;
import org.nd4j.shade.jackson.annotation.JsonProperty;

import java.io.Serializable;
import java.util.*;

/**
 * A Reducer is used to take a set of examples and reduce them.
 * The idea: suppose you have a large number of columns, and you want to combine/reduce the values in each column.
 * Reducer allows you to specify different reductions for different columns: min, max, sum, mean etc.
 * See {@link Builder} and {@link ReduceOp} for the full list.
 * <p>
 * Note: this supports executing multiple reductions per column: simply call the relevant xColumns() methods of the
 * Builder repeatedly on the same column, or use {@link Reducer.Builder#multipleOpColmumns(List, String...)}.
 * <p>
 * Uses are:
 * (1) Reducing examples by a key
 * (2) Reduction operations in time series (windowing ops, etc)
 *
 * @author Alex Black
 */
@Data
@JsonIgnoreProperties({"schema", "keyColumnsSet"})
@EqualsAndHashCode(exclude = {"schema", "keyColumnsSet"})
public class Reducer implements IAssociativeReducer {

    private Schema schema;
    private final List<String> keyColumns;
    private final Set<String> keyColumnsSet;
    private final ReduceOp defaultOp;
    private final Map<String, List<ReduceOp>> opMap;
    private Map<String, ConditionalReduction> conditionalReductions;
    private Map<String, AggregableColumnReduction> customReductions;
    private Set<String> ignoreInvalidInColumns;

    private Reducer(Builder builder) {
        this((builder.keyColumns == null ? null : Arrays.asList(builder.keyColumns)), builder.defaultOp, builder.opMap,
                        builder.customReductions, builder.conditionalReductions, builder.ignoreInvalidInColumns);
    }

    public Reducer(@JsonProperty("keyColumns") List<String> keyColumns, @JsonProperty("defaultOp") ReduceOp defaultOp,
                    @JsonProperty("opMap") Map<String, List<ReduceOp>> opMap,
                    @JsonProperty("customReductions") Map<String, AggregableColumnReduction> customReductions,
                    @JsonProperty("conditionalReductions") Map<String, ConditionalReduction> conditionalReductions,
                    @JsonProperty("ignoreInvalidInColumns") Set<String> ignoreInvalidInColumns) {
        this.keyColumns = keyColumns;
        this.keyColumnsSet = (keyColumns == null ? null : new HashSet<>(keyColumns));
        this.defaultOp = defaultOp;
        this.opMap = opMap;
        this.customReductions = customReductions;
        this.conditionalReductions = conditionalReductions;
        this.ignoreInvalidInColumns = ignoreInvalidInColumns;
    }

    @Override
    public void setInputSchema(Schema schema) {
        this.schema = schema;
        //Conditions (if any) also need the input schema:
        for (ConditionalReduction cr : conditionalReductions.values()) {
            cr.getCondition().setInputSchema(schema);
        }
    }

    @Override
    public Schema getInputSchema() {
        return schema;
    }

    @Override
    public List<String> getKeyColumns() {
        return keyColumns;
    }

    /**
     * Get the output schema, given the input schema
     */
    @Override
    public Schema transform(Schema schema) {
        int nCols = schema.numColumns();
        List<String> colNames = schema.getColumnNames();
        List<ColumnMetaData> meta = schema.getColumnMetaData();
        List<ColumnMetaData> newMeta = new ArrayList<>(nCols);

        for (int i = 0; i < nCols; i++) {
            String name = colNames.get(i);
            ColumnMetaData inMeta = meta.get(i);

            if (keyColumnsSet != null && keyColumnsSet.contains(name)) {
                //No change to key columns
                newMeta.add(inMeta);
                continue;
            }

            //First: check for a custom reduction on this column
            if (customReductions != null && customReductions.containsKey(name)) {
                AggregableColumnReduction reduction = customReductions.get(name);

                List<String> outName = reduction.getColumnsOutputName(name);
                List<ColumnMetaData> outMeta = reduction.getColumnOutputMetaData(outName, inMeta);
                newMeta.addAll(outMeta);
                continue;
            }

            //Second: check for conditional reductions on this column:
            if (conditionalReductions != null && conditionalReductions.containsKey(name)) {
                ConditionalReduction reduction = conditionalReductions.get(name);

                List<String> outNames = reduction.getOutputNames();
                List<ReduceOp> reductions = reduction.getReductions();
                for (int j = 0; j < reduction.getReductions().size(); j++) {
                    ReduceOp red = reductions.get(j);
                    String outName = outNames.get(j);
                    ColumnMetaData m = getMetaForColumn(red, name, inMeta);
                    m.setName(outName);
                    newMeta.add(m);
                }
                continue;
            }

            //Otherwise: get the specified (built-in) reduction op
            //If no reduction op is specified for that column: use the default
            List<ReduceOp> lop = opMap.containsKey(name) ? opMap.get(name) : Collections.singletonList(defaultOp);
            if (lop != null)
                for (ReduceOp op : lop) {
                    newMeta.add(getMetaForColumn(op, name, inMeta));
                }
        }

        return schema.newSchema(newMeta);
    }
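    //For example: with keyColumns("user") and sumColumns("amount"), an input schema of
    //["user" (String), "amount" (Double)] transforms to ["user", "sum(amount)"]: key columns pass
    //through unchanged, and each reduced column is renamed via getOutNameForColumn(op, name) below.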
    private static String getOutNameForColumn(ReduceOp op, String name) {
        return op.name().toLowerCase() + "(" + name + ")";
    }

    private static ColumnMetaData getMetaForColumn(ReduceOp op, String name, ColumnMetaData inMeta) {
        inMeta = inMeta.clone();
        switch (op) {
            // type-preserving operations
            case Min:
            case Max:
            case Range:
            case TakeFirst:
            case TakeLast:
                inMeta.setName(getOutNameForColumn(op, name));
                return inMeta;
            case Prod:
            case Sum:
                String outName = getOutNameForColumn(op, name);
                //Issue with prod/sum: the input meta data restrictions probably won't hold. But the data _type_ should essentially remain the same
                ColumnMetaData outMeta;
                if (inMeta instanceof IntegerMetaData)
                    outMeta = new IntegerMetaData(outName);
                else if (inMeta instanceof LongMetaData)
                    outMeta = new LongMetaData(outName);
                else if (inMeta instanceof FloatMetaData)
                    outMeta = new FloatMetaData(outName);
                else if (inMeta instanceof DoubleMetaData)
                    outMeta = new DoubleMetaData(outName);
                else {
                    //Sum/Prod doesn't really make sense to sum other column types anyway...
                    outMeta = inMeta;
                }
                outMeta.setName(outName);
                return outMeta;
            case Mean:
            case Stdev:
            case Variance:
            case PopulationVariance:
            case UncorrectedStdDev:
                return new DoubleMetaData(getOutNameForColumn(op, name));
            case Append:
            case Prepend:
                return new StringMetaData(getOutNameForColumn(op, name));
            case Count: //Always Long
            case CountUnique:
                return new LongMetaData(getOutNameForColumn(op, name), 0L, null);
            default:
                throw new UnsupportedOperationException("Unknown or not implemented op: " + op);
        }
    }
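    //Concretely: Max on an IntegerMetaData column "qty" yields IntegerMetaData named "max(qty)"
    //(type-preserving), Mean on the same column yields DoubleMetaData "mean(qty)", and Count
    //always yields a LongMetaData column, per the switch above.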
    @Override
    public IAggregableReduceOp<List<Writable>, List<Writable>> aggregableReducer() {
        //Go through each writable, and reduce according to whatever strategy is specified
        if (schema == null)
            throw new IllegalStateException("Error: Schema has not been set");

        int nCols = schema.numColumns();
        List<String> colNames = schema.getColumnNames();
        List<IAggregableReduceOp<Writable, List<Writable>>> ops = new ArrayList<>(nCols);
        boolean conditionalActive = (conditionalReductions != null && !conditionalReductions.isEmpty());
        List<Condition> conditions = new ArrayList<>(nCols);

        for (int i = 0; i < nCols; i++) {
            String colName = colNames.get(i);
            if (keyColumnsSet != null && keyColumnsSet.contains(colName)) {
                IAggregableReduceOp<Writable, Writable> first = new AggregatorImpls.AggregableFirst<>();
                ops.add(new AggregableMultiOp<>(Collections.singletonList(first)));
                if (conditionalActive)
                    conditions.add(new TrivialColumnCondition(colName));
                continue;
            }

            // is this a *custom* reduction column?
            if (customReductions != null && customReductions.containsKey(colName)) {
                AggregableColumnReduction reduction = customReductions.get(colName);
                ops.add(reduction.reduceOp());
                if (conditionalActive)
                    conditions.add(new TrivialColumnCondition(colName));
                continue;
            }

            // are we adding a global *conditional* reduction column?
            // The only practical difference with conditional reductions is that we filter the input on an all-fields condition first
            if (conditionalActive) {
                if (conditionalReductions.containsKey(colName))
                    conditions.add(conditionalReductions.get(colName).getCondition());
                else
                    conditions.add(new TrivialColumnCondition(colName));
            }

            //What type of column is this?
            ColumnType type = schema.getType(i);

            //What ops are we performing on this column?
            boolean conditionalOp = conditionalActive && conditionalReductions.containsKey(colName);
            List<ReduceOp> lop = (conditionalOp ? conditionalReductions.get(colName).getReductions()
                            : opMap.get(colName));
            if (lop == null || lop.isEmpty())
                lop = Collections.singletonList(defaultOp);

            //Execute the reduction, store the result
            ops.add(AggregableReductionUtils.reduceColumn(lop, type, ignoreInvalidInColumns.contains(colName),
                            schema.getMetaData(i)));
        }

        if (conditionalActive) {
            return new DispatchWithConditionOp<>(ops, conditions);
        } else {
            return new DispatchOp<>(ops);
        }
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("Reducer(");
        if (keyColumns != null) {
            sb.append("keyColumns=").append(keyColumns).append(",");
        }
        sb.append("defaultOp=").append(defaultOp);
        if (opMap != null) {
            sb.append(",opMap=").append(opMap);
        }
        if (customReductions != null) {
            sb.append(",customReductions=").append(customReductions);
        }
        if (conditionalReductions != null) {
            sb.append(",conditionalReductions=").append(conditionalReductions);
        }
        if (ignoreInvalidInColumns != null) {
            sb.append(",ignoreInvalidInColumns=").append(ignoreInvalidInColumns);
        }
        sb.append(")");
        return sb.toString();
    }

    public static class Builder {

        private ReduceOp defaultOp;
        private Map<String, List<ReduceOp>> opMap = new HashMap<>();
        private Map<String, AggregableColumnReduction> customReductions = new HashMap<>();
        private Map<String, ConditionalReduction> conditionalReductions = new HashMap<>();
        private Set<String> ignoreInvalidInColumns = new HashSet<>();
        private String[] keyColumns;

        /**
         * Create a Reducer builder, and set the default column reduction operation.
         * For any columns that aren't specified explicitly, they will use the default reduction operation.
         * If a column does have a reduction operation explicitly specified, then it will override
         * the default specified here.
         *
         * @param defaultOp Default reduction operation to perform
         */
        public Builder(ReduceOp defaultOp) {
            this.defaultOp = defaultOp;
        }

        /**
         * Specify the key columns. The idea here is to be able to create a (potentially compound) key
         * out of multiple columns, using the toString representation of the values in these columns
         *
         * @param keyColumns Columns that will make up the key
         * @return this Builder
         */
        public Builder keyColumns(String... keyColumns) {
            this.keyColumns = keyColumns;
            return this;
        }

        private Builder add(ReduceOp op, String[] cols) {
            for (String s : cols) {
                List<ReduceOp> ops = new ArrayList<>();
                if (opMap.containsKey(s))
                    ops.addAll(opMap.get(s));
                ops.add(op);
                opMap.put(s, ops);
            }
            return this;
        }

        private Builder addAll(List<ReduceOp> ops, String[] cols) {
            for (String s : cols) {
                List<ReduceOp> theseOps = new ArrayList<>();
                if (opMap.containsKey(s))
                    theseOps.addAll(opMap.get(s));
                theseOps.addAll(ops);
                opMap.put(s, theseOps);
            }
            return this;
        }

        public Builder multipleOpColmumns(List<ReduceOp> ops, String... columns) {
            return addAll(ops, columns);
        }
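        //For example: minColumns("x") followed by maxColumn("x") registers two reductions on "x",
        //producing the output columns "min(x)" and "max(x)". Equivalently:
        //  multipleOpColmumns(Arrays.asList(ReduceOp.Min, ReduceOp.Max), "x");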
        /**
         * Reduce the specified columns by taking the minimum value
         */
        public Builder minColumns(String... columns) {
            return add(ReduceOp.Min, columns);
        }

        /**
         * Reduce the specified columns by taking the maximum value
         */
        public Builder maxColumn(String... columns) {
            return add(ReduceOp.Max, columns);
        }

        /**
         * Reduce the specified columns by taking the sum of values
         */
        public Builder sumColumns(String... columns) {
            return add(ReduceOp.Sum, columns);
        }

        /**
         * Reduce the specified columns by taking the product of values
         */
        public Builder prodColumns(String... columns) {
            return add(ReduceOp.Prod, columns);
        }

        /**
         * Reduce the specified columns by taking the mean of the values
         */
        public Builder meanColumns(String... columns) {
            return add(ReduceOp.Mean, columns);
        }

        /**
         * Reduce the specified columns by taking the standard deviation of the values
         */
        public Builder stdevColumns(String... columns) {
            return add(ReduceOp.Stdev, columns);
        }

        /**
         * Reduce the specified columns by taking the uncorrected standard deviation of the values
         */
        public Builder uncorrectedStdevColumns(String... columns) {
            return add(ReduceOp.UncorrectedStdDev, columns);
        }

        /**
         * Reduce the specified columns by taking the variance of the values
         */
        public Builder variance(String... columns) {
            return add(ReduceOp.Variance, columns);
        }

        /**
         * Reduce the specified columns by taking the population variance of the values
         */
        public Builder populationVariance(String... columns) {
            return add(ReduceOp.PopulationVariance, columns);
        }

        /**
         * Reduce the specified columns by counting the number of values
         */
        public Builder countColumns(String... columns) {
            return add(ReduceOp.Count, columns);
        }

        /**
         * Reduce the specified columns by taking the range (max-min) of the values
         */
        public Builder rangeColumns(String... columns) {
            return add(ReduceOp.Range, columns);
        }

        /**
         * Reduce the specified columns by counting the number of unique values
         */
        public Builder countUniqueColumns(String... columns) {
            return add(ReduceOp.CountUnique, columns);
        }

        /**
         * Reduce the specified columns by taking the first value
         */
        public Builder takeFirstColumns(String... columns) {
            return add(ReduceOp.TakeFirst, columns);
        }

        /**
         * Reduce the specified columns by taking the last value
         */
        public Builder takeLastColumns(String... columns) {
            return add(ReduceOp.TakeLast, columns);
        }

        /**
         * Reduce the specified columns by concatenating all content.
         * Beware: the output may be huge!
         */
        public Builder appendColumns(String... columns) {
            return add(ReduceOp.Append, columns);
        }

        /**
         * Reduce the specified columns by concatenating all content in reverse order.
         * Beware: the output may be huge!
         */
        public Builder prependColumns(String... columns) {
            return add(ReduceOp.Prepend, columns);
        }

        /**
         * Reduce the specified column using a custom column reduction.
         *
         * @param column          Column to execute the custom reduction on
         * @param columnReduction Column reduction to execute on that column
         */
        public Builder customReduction(String column, AggregableColumnReduction columnReduction) {
            customReductions.put(column, columnReduction);
            return this;
        }

        /**
         * Conditional reduction: apply the reductions on a specified column, where the reduction occurs *only* on those
         * examples where the condition returns true. Examples where the condition does not apply (returns false) are
         * ignored/excluded.
         *
         * @param column      Name of the column to execute the conditional reduction on
         * @param outputNames Names of the output columns, after the reductions have been executed
         * @param reductions  Reductions to execute
         * @param condition   Condition to use in the reductions
         */
        public Builder conditionalReduction(String column, List<String> outputNames, List<ReduceOp> reductions,
                        Condition condition) {
            Preconditions.checkArgument(outputNames.size() == reductions.size(),
                            "Conditional reductions should provide names for every column");
            this.conditionalReductions.put(column,
                            new ConditionalReduction(column, outputNames, reductions, condition));
            return this;
        }

        /**
         * Conditional reduction: apply the reduction on a specified column, where the reduction occurs *only* on those
         * examples where the condition returns true. Examples where the condition does not apply (returns false) are
         * ignored/excluded.
         *
         * @param column     Name of the column to execute the conditional reduction on
         * @param outputName Name of the output column, after the reduction has been executed
         * @param reduction  Reduction to execute
         * @param condition  Condition to use in the reduction
         */
        public Builder conditionalReduction(String column, String outputName, ReduceOp reduction, Condition condition) {
            this.conditionalReductions.put(column, new ConditionalReduction(column,
                            Collections.singletonList(outputName), Collections.singletonList(reduction), condition));
            return this;
        }
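        //Illustrative example (someCondition is a placeholder for any Condition implementation):
        //  builder.conditionalReduction("amount", "amountWhenValid", ReduceOp.Sum, someCondition);
        //sums "amount" only over the examples for which someCondition returns true, writing the
        //result to an output column named "amountWhenValid".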
        /**
         * When doing the reduction: set the specified columns to ignore any invalid values.
         * Invalid: defined as being not valid according to the ColumnMetaData: {@link ColumnMetaData#isValid(Writable)}.
         * For numerical columns, this typically means being unable to parse the Writable. For example, Writable.toLong() failing for a Long column.
         * If the column has any restrictions (min/max values, regex for Strings etc) these will also be taken into account.
         *
         * @param columns Columns to set 'ignore invalid' for
         */
        public Builder setIgnoreInvalid(String... columns) {
            Collections.addAll(ignoreInvalidInColumns, columns);
            return this;
        }

        public Reducer build() {
            return new Reducer(this);
        }
    }

    @AllArgsConstructor
    @Data
    public static class ConditionalReduction implements Serializable {
        private final String columnName;
        private final List<String> outputNames;
        private final List<ReduceOp> reductions;
        private final Condition condition;
    }
}
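For orientation, here is a minimal usage sketch of the class above. The ReducerExample class, its sample data, and the single-key reduction loop are hypothetical; the sketch assumes only the Builder/transform/aggregableReducer APIs shown in this listing, DataVec's Schema.Builder, the Text/DoubleWritable writables, and that IAggregableReduceOp follows its usual Consumer/Supplier-style contract (accept(...) to feed examples, get() to obtain the reduced row). Grouping examples by key (e.g. via Spark or a local executor) is out of scope here.

import org.datavec.api.transform.ReduceOp;
import org.datavec.api.transform.ops.IAggregableReduceOp;
import org.datavec.api.transform.reduce.Reducer;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.DoubleWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

import java.util.Arrays;
import java.util.List;

public class ReducerExample {
    public static void main(String[] args) {
        Schema schema = new Schema.Builder()
                .addColumnString("user")
                .addColumnDouble("amount")
                .build();

        //Take the first value for any unconfigured column; sum "amount" within each key
        Reducer reducer = new Reducer.Builder(ReduceOp.TakeFirst)
                .keyColumns("user")
                .sumColumns("amount")
                .build();

        reducer.setInputSchema(schema);
        Schema outSchema = reducer.transform(schema);    //Columns: "user", "sum(amount)"

        //Reduce the examples sharing one key value ("alice"); sample data for illustration
        List<List<Writable>> examplesForUser = Arrays.asList(
                Arrays.<Writable>asList(new Text("alice"), new DoubleWritable(3.0)),
                Arrays.<Writable>asList(new Text("alice"), new DoubleWritable(4.5)));

        IAggregableReduceOp<List<Writable>, List<Writable>> agg = reducer.aggregableReducer();
        for (List<Writable> example : examplesForUser)
            agg.accept(example);
        List<Writable> reduced = agg.get();              //One row: ["alice", 7.5]
        System.out.println(outSchema.getColumnNames() + " -> " + reduced);
    }
}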




