All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.cmu.tetrad.regression.RegressionDataset Maven / Gradle / Ivy

The newest version!
///////////////////////////////////////////////////////////////////////////////
// For information as to what this class does, see the Javadoc, below.       //
// Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,       //
// 2007, 2008, 2009, 2010, 2014, 2015, 2022 by Peter Spirtes, Richard        //
// Scheines, Joseph Ramsey, and Clark Glymour.                               //
//                                                                           //
// This program is free software; you can redistribute it and/or modify      //
// it under the terms of the GNU General Public License as published by      //
// the Free Software Foundation; either version 2 of the License, or         //
// (at your option) any later version.                                       //
//                                                                           //
// This program is distributed in the hope that it will be useful,           //
// but WITHOUT ANY WARRANTY; without even the implied warranty of            //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             //
// GNU General Public License for more details.                              //
//                                                                           //
// You should have received a copy of the GNU General Public License         //
// along with this program; if not, write to the Free Software               //
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA //
///////////////////////////////////////////////////////////////////////////////

package edu.cmu.tetrad.regression;

import edu.cmu.tetrad.data.DataSet;
import edu.cmu.tetrad.graph.*;
import edu.cmu.tetrad.util.Matrix;
import edu.cmu.tetrad.util.ProbUtils;
import edu.cmu.tetrad.util.Vector;
import org.apache.commons.math3.util.FastMath;

import java.util.Arrays;
import java.util.List;

/**
 * Implements a regression model from tabular continuous data.
 *
 * @author josephramsey
 * @version $Id: $Id
 */
public class RegressionDataset implements Regression {

    /**
     * The data set.
     */
    private final Matrix data;

    /**
     * The variables.
     */
    private final List variables;

    /**
     * The significance level for determining which regressors are significant based on their p values.
     */
    private double alpha = 0.05;

    /**
     * The graph of significant regressors into the target.
     */
    private Graph graph;

    private int[] rows;
    private Vector res2;


    //============================CONSTRUCTORS==========================//

    /**
     * Constructs a linear regression model for the given tabular data set.
     *
     * @param data A rectangular data set, the relevant variables of which are continuous.
     */
    public RegressionDataset(DataSet data) {
        this.data = data.getDoubleData();
        this.variables = data.getVariables();
        setRows(new int[data.getNumRows()]);
        for (int i = 0; i < getRows().length; i++) getRows()[i] = i;
    }

    /**
     * 

Constructor for RegressionDataset.

* * @param data a {@link edu.cmu.tetrad.util.Matrix} object * @param variables a {@link java.util.List} object */ public RegressionDataset(Matrix data, List variables) { this.data = data; this.variables = variables; setRows(new int[data.getNumRows()]); for (int i = 0; i < getRows().length; i++) getRows()[i] = i; } //===========================PUBLIC METHODS========================// /** *

regress.

* * @param target an array of {@link double} objects * @param regressors an array of {@link double} objects * @return a {@link edu.cmu.tetrad.regression.RegressionResult} object */ public static RegressionResult regress(double[] target, double[][] regressors) { int n = target.length; int k = regressors.length + 1; String[] regressorNames = new String[regressors.length]; for (int i = 0; i < regressors.length; i++) { regressorNames[i] = "X" + (i + 1); } Matrix y = new Matrix(new double[][]{target}).transpose(); Matrix x = new Matrix(regressors).transpose(); Matrix xT = x.transpose(); Matrix xTx = xT.times(x); Matrix xTxInv = xTx.inverse(); Matrix xTy = xT.times(y); Matrix b = xTxInv.times(xTy); Matrix yHat = x.times(b); if (yHat.getNumColumns() == 0) yHat = y.like(); Matrix res = y.minus(yHat); // y.copy().assign(yHat, PlusMult.plusMult(-1)); Vector _yHat = yHat.getColumn(0); Vector _res = res.getColumn(0); yHat.getNumColumns(); // y.copy().assign(yHat, PlusMult.plusMult(-1)); double rss = RegressionDataset.rss(x, y, b); double se = FastMath.sqrt(rss / (n - k)); double tss = RegressionDataset.tss(y); double r2 = 1.0 - (rss / tss); Vector sqErr = new Vector(x.getNumColumns()); Vector t = new Vector(x.getNumColumns()); Vector p = new Vector(x.getNumColumns()); for (int i = 0; i < x.getNumColumns(); i++) { double _s = se * se * xTxInv.get(i, i); double _se = FastMath.sqrt(_s); double _t = b.get(i, 0) / _se; double _p = (1.0 - ProbUtils.tCdf(FastMath.abs(_t), n - k)); sqErr.set(i, _se); t.set(i, _t); p.set(i, _p); } double[] bArray = b.getNumColumns() == 0 ? new double[0] : b.getColumn(0).toArray(); double[] tArray = t.toArray(); double[] pArray = p.toArray(); double[] seArray = sqErr.toArray(); return new RegressionResult(true, regressorNames, n, bArray, tArray, pArray, seArray, r2, rss, 0.05, _res); } /** * Calculates the residual sum of squares for parameter data x, actual values y, and regression coefficients * b--i.e., for each point in the data, the predicted value for that point is calculated, and then it is subtracted * from the actual value. The sum of the squares of these difference values over all points in the data is * calculated and returned. * * @param x the array of data. * @param y the target vector. * @param b the regression coefficients. * @return the residual sum of squares. */ private static double rss(Matrix x, Matrix y, Matrix b) { double rss = 0.0; for (int i = 0; i < x.getNumRows(); i++) { double yH = 0.0; for (int j = 0; j < x.getNumColumns(); j++) { yH += b.get(j, 0) * x.get(i, j); } double d = y.get(i, 0) - yH; rss += d * d; } return rss; } private static double tss(Matrix y) { // first calculate the mean double mean = 0.0; for (int i = 0; i < y.getNumRows(); i++) { mean += y.get(i, 0); } mean /= y.getNumRows(); double ssm = 0.0; for (int i = 0; i < y.getNumRows(); i++) { double d = mean - y.get(i, 0); ssm += d * d; } return ssm; } /** * {@inheritDoc} *

* Sets the alpha level for deciding which regressors are significant based on their p values. */ public void setAlpha(double alpha) { this.alpha = alpha; } /** *

Getter for the field graph.

* * @return This graph. */ public Graph getGraph() { return this.graph; } //=======================PRIVATE METHODS================================// /** * Regresses the target on the given regressors. * * @param target The target variable. * @param regressors The regressor variables. * @return The regression plane, specifying for each regressors its coefficeint, se, t, and p values, and specifying * the same for the constant. */ public RegressionResult regress(Node target, List regressors) { int n = getRows().length; int k = regressors.size() + 1; int _target = this.variables.indexOf(target); int[] _regressors = new int[regressors.size()]; for (int i = 0; i < regressors.size(); i++) { _regressors[i] = this.variables.indexOf(regressors.get(i)); } Matrix y = this.data.getSelection(getRows(), new int[]{_target}).copy(); Matrix xSub = this.data.getSelection(getRows(), _regressors); Matrix x; if (regressors.size() > 0) { x = new Matrix(xSub.getNumRows(), xSub.getNumColumns() + 1); for (int i = 0; i < x.getNumRows(); i++) { for (int j = 0; j < x.getNumColumns(); j++) { if (j == 0) { x.set(i, 0, 1); } else { x.set(i, j, xSub.get(i, j - 1)); } } } } else { x = new Matrix(xSub.getNumRows(), xSub.getNumColumns()); for (int i = 0; i < x.getNumRows(); i++) { for (int j = 0; j < x.getNumColumns(); j++) { x.set(i, j, xSub.get(i, j)); } } } Matrix xT = x.transpose(); Matrix xTx = xT.times(x); Matrix xTxInv = xTx.inverse(); Matrix xTy = xT.times(y); Matrix b = xTxInv.times(xTy); Matrix yHat = x.times(b); if (yHat.getNumColumns() == 0) yHat = y.like(); Matrix res = y.minus(yHat); // y.copy().assign(yHat, PlusMult.plusMult(-1)); Vector _yHat = yHat.getColumn(0); Vector _res = res.getColumn(0); Matrix b2 = b.copy(); Matrix yHat2 = x.times(b2); if (yHat.getNumColumns() == 0) yHat2 = y.like(); Matrix res2 = y.minus(yHat2); // y.copy().assign(yHat, PlusMult.plusMult(-1)); this.res2 = res2.getColumn(0); double rss = RegressionDataset.rss(x, y, b); double se = FastMath.sqrt(rss / (n - k)); double tss = RegressionDataset.tss(y); double r2 = 1.0 - (rss / tss); Vector sqErr = new Vector(x.getNumColumns()); Vector t = new Vector(x.getNumColumns()); Vector p = new Vector(x.getNumColumns()); for (int i = 0; i < x.getNumColumns(); i++) { double _s = se * se * xTxInv.get(i, i); double _se = FastMath.sqrt(_s); double _t = b.get(i, 0) / _se; double _p = (1.0 - ProbUtils.tCdf(FastMath.abs(_t), n - k)); sqErr.set(i, _se); t.set(i, _t); p.set(i, _p); } this.graph = createOutputGraph(target.getName(), x, regressors, p); String[] vNames = new String[regressors.size()]; for (int i = 0; i < regressors.size(); i++) { vNames[i] = regressors.get(i).getName(); } double[] bArray = b.getNumColumns() == 0 ? new double[0] : b.getColumn(0).toArray(); double[] tArray = t.toArray(); double[] pArray = p.toArray(); double[] seArray = sqErr.toArray(); return new RegressionResult(regressors.size() == 0, vNames, n, bArray, tArray, pArray, seArray, r2, rss, this.alpha, _res); } /** *

regress.

* * @param target a {@link edu.cmu.tetrad.graph.Node} object * @param regressors a {@link edu.cmu.tetrad.graph.Node} object * @return a {@link edu.cmu.tetrad.regression.RegressionResult} object */ public RegressionResult regress(Node target, Node... regressors) { List _regressors = Arrays.asList(regressors); return regress(target, _regressors); } private Graph createOutputGraph(String target, Matrix x, List regressors, Vector p) { // Create output graph. Node targetNode = new GraphNode(target); Graph graph = new EdgeListGraph(); graph.addNode(targetNode); for (int i = 0; i < x.getNumColumns(); i++) { String variableName = (i > 0) ? regressors.get(i - 1).getName() : "const"; //Add a node and edge to the output graph for significant predictors: if (p.get(i) < this.alpha) { Node predictorNode = new GraphNode(variableName); graph.addNode(predictorNode); Edge newEdge = new Edge(predictorNode, targetNode, Endpoint.TAIL, Endpoint.ARROW); graph.addEdge(newEdge); } } return graph; } /** * The rows in the data used for the regression. */ private int[] getRows() { return this.rows; } /** *

Setter for the field rows.

* * @param rows an array of {@link int} objects */ public void setRows(int[] rows) { this.rows = rows; } /** *

getResidualsWithoutFirstRegressor.

* * @return a {@link edu.cmu.tetrad.util.Vector} object */ public Vector getResidualsWithoutFirstRegressor() { return this.res2; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy