// net.maizegenetics.analysis.modelfitter.StepwiseOLSModelFitter
//
// Part of TASSEL, a software package to evaluate trait associations, evolutionary patterns,
// and linkage disequilibrium.

package net.maizegenetics.analysis.modelfitter;

import net.maizegenetics.stats.linearmodels.CovariateModelEffect;
import net.maizegenetics.stats.linearmodels.FactorModelEffect;
import net.maizegenetics.stats.linearmodels.LinearModelUtils;
import net.maizegenetics.stats.linearmodels.ModelEffect;
import net.maizegenetics.stats.linearmodels.ModelEffectUtils;
import net.maizegenetics.stats.linearmodels.NestedCovariateModelEffect;
import net.maizegenetics.stats.linearmodels.PartitionedLinearModel;
import net.maizegenetics.stats.linearmodels.SweepFastLinearModel;
import net.maizegenetics.util.OpenBitSet;
import net.maizegenetics.util.SimpleTableReport;
import net.maizegenetics.util.TableReport;
import net.maizegenetics.analysis.association.AssociationUtils;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.phenotype.CategoricalAttribute;
import net.maizegenetics.phenotype.GenotypePhenotype;
import net.maizegenetics.phenotype.NumericAttribute;
import net.maizegenetics.phenotype.Phenotype;
import net.maizegenetics.phenotype.TaxaAttribute;
import net.maizegenetics.phenotype.Phenotype.ATTRIBUTE_TYPE;
import net.maizegenetics.phenotype.PhenotypeAttribute;
import net.maizegenetics.phenotype.PhenotypeBuilder;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.Datum;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import net.maizegenetics.matrixalgebra.Matrix.DoubleMatrix;
import net.maizegenetics.matrixalgebra.Matrix.DoubleMatrixFactory;
import net.maizegenetics.stats.linearmodels.BasicShuffler;
import net.maizegenetics.taxa.Taxon;

public class StepwiseOLSModelFitter {
	private final GenotypePhenotype myData;

	//settable parameters
	private double[] enterlimits = null;
	private double[] exitlimits = null;
	private double enterlimit = 1e-5;
	private double exitlimit = 2e-5;
	private int maxNumberOfMarkers = 1000;
	private FactorModelEffect nestingEffect;
	private int nestingFactorIndex;
	private boolean isNested;
    private boolean calculateVIF = true;
    private double VIFTolerance = 0.001;
    private VIF_TYPE VIFType = VIF_TYPE.average; // options are min, average, and max
    public enum VIF_TYPE {min, average, max};
	private ArrayList<String> nestingFactorNames;

	//global variables used by the analysis
	private ArrayList<ModelEffect> currentModel;
	private int currentPhenotypeIndex;
	private int numberOfBaseModelEffects;
	private double[] y;
	private OpenBitSet missing;
	int numberNotMissing;
	int totalNumber;
	private LinkedList<Object[]> resultRowsAnova = new LinkedList<>();
	private LinkedList<Object[]> resultRowsAnovaWithCI = new LinkedList<>();
	private LinkedList<Object[]> rowsSiteEffectTable = new LinkedList<>();
	private LinkedList<Object[]> rowsSiteEffectTableWithCI = new LinkedList<>();
	private ArrayList<Integer> excludedSNPs = new ArrayList<>();
	private MODEL_TYPE modelType = MODEL_TYPE.mbic;
	private String datasetName;
	private double globalbestbic = Double.MAX_VALUE; //Rationale: the optimal model has minimum BIC. Thus, initialize bestbic with the largest possible double java can handle.
	private double globalbestmbic = Double.MAX_VALUE;
	private double globalbestaic = Double.MAX_VALUE;
	public enum MODEL_TYPE {pvalue, bic, mbic, aic};
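	// Model selection criteria, as computed in forwardStep() and backwardStep() below
	// (n = number of non-missing observations, p = full-model df, M = number of sites,
	// q = number of two-way interaction terms, currently always zero):
	//   pvalue: compare the F-test p-value of a candidate term to enterlimit/exitlimit
	//   bic  = n*ln(SSE/n) + p*ln(n)
	//   aic  = n*ln(SSE/n) + 2*p
	//   mbic = n*ln(SSE) + (p+q)*ln(n) + 2*p*ln(M/2.2 - 1) + 2*q*ln(M*(M-1)/4.4 - 1)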
	private boolean test;
    private int[][] theUpperAndLowerBound;

    private int numberOfPermutations = 0;
    private double alpha = 0.05;
    private double[] minPvalues = null;

	private final String[] anovaReportHeader = new String[]{"Trait", "Name","Locus","Position","df","SS","MS", "F", "pr>F", "BIC", "mBIC", "AIC", "Model Rsq"};
    private final String[] anovaReportWithCIHeader = new String[]{"Trait", "Name","Locus","Position","df","SS","MS", "F", "pr>F", "BIC", "mBIC", "AIC", "Model Rsq", "SuppLeft", "SuppRight"};
    
	private final GenotypeTable myGenotype;
	private final Phenotype myPhenotype;
	List<PhenotypeAttribute> dataAttributeList;
	List<PhenotypeAttribute> factorAttributeList;
	List<PhenotypeAttribute> covariateAttributeList;

	public void setModelType(MODEL_TYPE modelType){
		this.modelType = modelType;
	}

	public StepwiseOLSModelFitter(GenotypePhenotype genoPheno, String datasetName) {
		myData = genoPheno;       
		myGenotype = myData.genotypeTable();
        myPhenotype = myData.phenotype();

		this.datasetName = datasetName;
	}
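
	// A minimal usage sketch (illustrative only, assuming a GenotypePhenotype named genoPheno
	// has already been built from matching genotype and phenotype data):
	//
	//   StepwiseOLSModelFitter fitter = new StepwiseOLSModelFitter(genoPheno, "myDataset");
	//   fitter.setModelType(StepwiseOLSModelFitter.MODEL_TYPE.bic);
	//   fitter.setEnterlimit(1e-5);
	//   fitter.setExitlimit(2e-5);
	//   fitter.runAnalysis();
	//   TableReport anova = fitter.getAnovaReport();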

	public DataSet runAnalysis() {
		//numbers of various model components
		dataAttributeList = myPhenotype.attributeListOfType(ATTRIBUTE_TYPE.data);
		factorAttributeList = myPhenotype.attributeListOfType(ATTRIBUTE_TYPE.factor);
		covariateAttributeList = myPhenotype.attributeListOfType(ATTRIBUTE_TYPE.covariate);

		int numberOfPhenotypes = dataAttributeList.size();

		//cycle through the phenotypes
		//notation: 
		//X is the design matrix without the markers, rows of X will be deleted if marker data is missing
		//Xm is the design matrix with markers
		//y is the data
		currentPhenotypeIndex = -1;
		for (PhenotypeAttribute attr : dataAttributeList) {
			NumericAttribute currentTrait = (NumericAttribute) attr;
			currentPhenotypeIndex++;
			if (enterlimits != null) enterlimit = enterlimits[currentPhenotypeIndex];
			if (exitlimits != null) exitlimit = exitlimits[currentPhenotypeIndex];
			globalbestbic = Double.MAX_VALUE; 
			globalbestmbic = Double.MAX_VALUE;
			globalbestaic = Double.MAX_VALUE;
			
			//get phenotype data
			float[] phenotypeData = currentTrait.floatValues();

			//keep track of missing rows
			missing = new OpenBitSet(currentTrait.missing());
			for (PhenotypeAttribute factor : factorAttributeList) missing.or(factor.missing());
			for (PhenotypeAttribute cov : covariateAttributeList) missing.or(cov.missing());

			//remove missing values from the arrays
			totalNumber = phenotypeData.length;
			numberNotMissing = totalNumber - (int) missing.cardinality();

			y = AssociationUtils.getNonMissingDoubles(phenotypeData, missing);
			fitModel();
			scanAndFindCI();    
		}

		return null;
	}

	public void fitModel() {
		//build the base model
		currentModel = new ArrayList<>();
		int numberOfTaxa = y.length;
		int[] mean = new int[numberOfTaxa];

		FactorModelEffect meanEffect = new FactorModelEffect(mean, false);
		meanEffect.setID("mean");
		currentModel.add(meanEffect);

		for (PhenotypeAttribute factor:factorAttributeList) {
			ArrayList<String> ids = new ArrayList<>();
			CategoricalAttribute ca = (CategoricalAttribute) factor;
			String[] factorLabels = AssociationUtils.getNonMissingValues(ca.allLabels(), missing);
			int[] levels = ModelEffectUtils.getIntegerLevels(factorLabels, ids);
			FactorModelEffect fme = new FactorModelEffect(levels, true, new Object[]{ca.name(), ids});
			if (isNested && myPhenotype.attributeIndexForName(factor.name()) == nestingFactorIndex) {
				nestingEffect = fme;
				nestingFactorNames = ids; 
			}
			currentModel.add(fme);
		}
		
		//add the covariate effects
		for (PhenotypeAttribute cov:covariateAttributeList) {
			NumericAttribute na = (NumericAttribute) cov;
			double[] covData = AssociationUtils.getNonMissingDoubles(na.floatValues(), missing);
			CovariateModelEffect cme = new CovariateModelEffect(covData, cov.name());
			currentModel.add(cme);
		}
		numberOfBaseModelEffects = currentModel.size();
		
		//run permutation test
		System.out.println("-----Number of Permutations = " + numberOfPermutations + "------------");
		if(numberOfPermutations > 0) runPermutationTestNoMissingData(); 
		System.out.println("--------------the Enter limit is " + enterlimit);
		System.out.println("--------------the Exit limit is " + exitlimit);  			

		while(forwardStep()){
			if((calculateVIF)&&(currentModel.size()> (numberOfBaseModelEffects+1))){
				ArrayList<Double> theVIFValues = removeCollinearMarkers(true);
			}
			while(backwardStep());
		}
		if((calculateVIF)&&(currentModel.size()> (numberOfBaseModelEffects+1))){
			//Begin code for testing out the model
			ArrayList<SNP> theSNPs = new ArrayList<>();
			for(int i = numberOfBaseModelEffects; i < currentModel.size();i++){
				SNP thisSNP = (SNP) currentModel.get(i).getID();
				theSNPs.add(thisSNP);
			}

			System.out.println("Here are all of the SNPs in the model prior to identifying a collinear effect: ");
			System.out.println(theSNPs.toString());
			//End code for testing out the model

			ArrayList<Double> theVIFValues = removeCollinearMarkers(true);
		}

		SweepFastLinearModel theLinearModel = new SweepFastLinearModel(currentModel, y);
		appendAnovaResults(theLinearModel);
		appendSiteEffectEstimates(theLinearModel);

	}

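	// For each marker term currently in the model (base effects excluded), this computes a
	// tolerance value (1/VIF) from the inverted correlation matrix of the marker design columns,
	// summarised per term as the minimum, average, or maximum tolerance according to VIFType.
	// When removeCollinearMarker is true and the smallest tolerance falls below VIFTolerance,
	// the offending marker is dropped from the model and its site index is added to excludedSNPs
	// so that it is not retried in later forward steps.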
	public ArrayList<Double> removeCollinearMarkers(boolean removeCollinearMarker){
		//Obtain the design matrix of the current model (everything except for the "BaseModelEffects")        
		DoubleMatrix theDesignMatrix = null;
		for(int i=numberOfBaseModelEffects; i < currentModel.size(); i++){//start at numberOfBaseModelEffects so that the intercept and other base effects are excluded
			DoubleMatrix partialDesignMatrix = currentModel.get(i).getX();
			if(i==numberOfBaseModelEffects){
				theDesignMatrix = partialDesignMatrix;
			}else{
				theDesignMatrix = theDesignMatrix.concatenate(partialDesignMatrix, false);
			}
		}

		//Scale and center the design matrix
		DoubleMatrix theCenteredDesignmatrix = centerCols(theDesignMatrix);         
		DoubleMatrix theCenteredAndScaledDesignMatrix = scaleCenteredMatrix(theCenteredDesignmatrix);
		double invSqrtSampleSizeMinusOne = 1/Math.sqrt(((double)theCenteredAndScaledDesignMatrix.numberOfRows())-1);
		theCenteredAndScaledDesignMatrix = theCenteredAndScaledDesignMatrix.scalarMult(invSqrtSampleSizeMinusOne);

		//Obtain the correlation matrix of the explanatory variables, r_{xx}
		DoubleMatrix theCorrelationMatrix = theCenteredAndScaledDesignMatrix.crossproduct(); 


		//Invert r_{xx}, and call it r_{xx}^{-1}. The diagonal elements of r_{xx}^{-1} are the VIF values
		DoubleMatrix theInvertedCorrelationMatrix = theCorrelationMatrix.inverse();
		ArrayList<Double> theVIFForEachExplanatoryVariable = new ArrayList<>();

		//For loop through the marker effects //NOTE: Turn these two for loops into separate methods
		int counter = 0;
		double theMinimumValue = Double.MAX_VALUE;
		int indexOfCollinearEffect = 0;
		ArrayList<Integer> indicesOfCollinearExplanatoryVariables = new ArrayList<>();
		ModelEffect theCollinearEffect = null;
		boolean InfiniteVIFsPresent = false;
		for(int i=numberOfBaseModelEffects; i < currentModel.size(); i++){
			int numberOfExplanatoryVariables = currentModel.get(i).getX().numberOfColumns();

			//For loop through all explanatory variables for a marker effect
			double theSumVIFValues = 0; 
			double theMinimumToleranceValueWithinX = Double.MAX_VALUE;
			double theMaximumToleranceValueWithinX = Double.NEGATIVE_INFINITY; //lowest possible starting value for a running maximum
			for(int j = 0; j < numberOfExplanatoryVariables; j++){
				//If one of the diagonal elements of theInvertedCorrelationMatrix (i.e. the VIFs)
				// is not a number, the correlation matrix is singular and the most recent term added
				// to the model has an effectively infinite VIF. The following if statement removes that term.
				double theVIF = theInvertedCorrelationMatrix.get((counter+j), (counter+j));
				if(Double.isNaN(theVIF)){
					theMinimumValue = 0;
					theCollinearEffect = currentModel.get((currentModel.size()-1));
					InfiniteVIFsPresent = true;
					break;
				}
				double theInvertedVIF = 1/theInvertedCorrelationMatrix.get((counter+j), (counter+j));
				if((j == 0) || (j == numberOfExplanatoryVariables - 1)) System.out.println("theInvertedVIF is: " + theInvertedVIF);
				if(theInvertedVIF < theMinimumToleranceValueWithinX)theMinimumToleranceValueWithinX = theInvertedVIF;
				if(theInvertedVIF > theMaximumToleranceValueWithinX)theMaximumToleranceValueWithinX = theInvertedVIF;
				theSumVIFValues += theInvertedVIF;
			}

			if(InfiniteVIFsPresent) break;
			if(VIFType == VIF_TYPE.average){
				//Take the average (1/VIF) value of the tested marker
				double theAverageVIFValue = theSumVIFValues/numberOfExplanatoryVariables;
				if(theAverageVIFValue < theMinimumValue){
					theMinimumValue = theAverageVIFValue;
					theCollinearEffect = currentModel.get(i);
					indexOfCollinearEffect = i-numberOfBaseModelEffects; //i-numberOfBaseModelEffects because java indexes things starting at zero
				}
				theVIFForEachExplanatoryVariable.add(theAverageVIFValue);
			}else if(VIFType == VIF_TYPE.min){
				if(theMinimumToleranceValueWithinX < theMinimumValue){
					theMinimumValue = theMinimumToleranceValueWithinX;
					theCollinearEffect = currentModel.get(i);
					indexOfCollinearEffect = i-numberOfBaseModelEffects;//offset so that the first marker term maps to index 0
				} 
				theVIFForEachExplanatoryVariable.add(theMinimumToleranceValueWithinX);
			}else if(VIFType == VIF_TYPE.max){
				if(theMaximumToleranceValueWithinX < theMinimumValue){
					theMinimumValue = theMaximumToleranceValueWithinX;
					theCollinearEffect = currentModel.get(i);
					indexOfCollinearEffect = i-numberOfBaseModelEffects;//offset so that the first marker term maps to index 0
				} 
				theVIFForEachExplanatoryVariable.add(theMaximumToleranceValueWithinX);

			}

			counter = counter + numberOfExplanatoryVariables;
			System.out.println("indexOfCollinearEffect: " + indexOfCollinearEffect);
			System.out.println("theVIFForEachExplanatoryVariable");
			System.out.println(theVIFForEachExplanatoryVariable.toString());

			//End for loop through the marker effects
		}
		//If this minimum tolerance falls below the tolerance threshold, remove the corresponding marker from the model
		if(removeCollinearMarker){
			if(theMinimumValue < VIFTolerance){
				ArrayList<SNP> theSNPs = new ArrayList<>();
				for(int i = numberOfBaseModelEffects; i < currentModel.size();i++){
					SNP thisSNP = (SNP) currentModel.get(i).getID();
					theSNPs.add(thisSNP);
				}
				System.out.println("Here are all of the SNPs in the model prior to identifying a collinear effect: ");
				System.out.println(theSNPs.toString());

				currentModel.remove(theCollinearEffect);
				excludedSNPs.add(((SNP)theCollinearEffect.getID()).index);

				SNP snpRemoved = (SNP) theCollinearEffect.getID(); 
				if(!InfiniteVIFsPresent)theVIFForEachExplanatoryVariable.remove(indexOfCollinearEffect);
				System.out.println("------------------"+snpRemoved +
						" is causing multicollinearity in the model. It has been removed from the model------------------------------");
			}
		}

		return theVIFForEachExplanatoryVariable;

	}    
	
	//centerCols and scaleCenteredMatrix were adapted from PrinComp, written (I think) by Peter Bradbury

	private DoubleMatrix centerCols(DoubleMatrix data) {
		int nrows = data.numberOfRows();
		int ncols = data.numberOfColumns();
		DoubleMatrix dm = data.copy();
		for (int c = 0; c < ncols; c++) {
			double colmean = dm.columnSum(c) / nrows;
			for (int r = 0; r < nrows; r++) {
				dm.set(r, c, dm.get(r, c) - colmean);
			}
		}

		return dm;
	}
	
	private DoubleMatrix scaleCenteredMatrix(DoubleMatrix data) {
		int nrows = data.numberOfRows();
		int ncols = data.numberOfColumns();
		for (int c = 0; c < ncols; c++) {
			double sumsq = 0;
			for (int r = 0; r < nrows; r++) {
				double val = data.get(r, c);
				sumsq += val * val;
			}
			double stdDev = Math.sqrt(sumsq/(nrows - 1));
			for (int r = 0; r < nrows; r++) {
				double val = data.get(r, c);
				data.set(r, c, val/stdDev);
			}
		}
		return data;
	}

    public void scanAndFindCI() {
        //rescan the model to refine position and find CI's
        //rescan requires a map

        //for each snp in the model:
        //1. add an adjacent snp to the left of the original
        //2. solve the new model
        //3. if the marginal p of the model snp is significant, stop and repeat on the right of the snp
        //4. find the max ss of any point within the CI interval (not including the original marker)
        //5. if the max ss is greater than the original marker ss, replace the original and go to 1
        //report steps as the process proceeds
        SweepFastLinearModel sflm;
        int numberOfTerms = currentModel.size();
        int firstMarkerIndex = 1;
        firstMarkerIndex += factorAttributeList.size();
        firstMarkerIndex += covariateAttributeList.size();
        
        System.out.println("firstMarkerIndex " + firstMarkerIndex);
        int[] upperbound = new int[numberOfTerms - firstMarkerIndex];
        int[] lowerbound = new int[numberOfTerms - firstMarkerIndex];
        theUpperAndLowerBound = new int[numberOfTerms - firstMarkerIndex][2];
        for (int t = firstMarkerIndex; t < numberOfTerms; t++){ //0 is the mean, 1 is populations
            //find the CI bounds
            lowerbound[t - firstMarkerIndex] = scanASide(true, t);
            System.out.println("lowerbound[t - firstMarkerIndex]");
            System.out.println(lowerbound[t - firstMarkerIndex]);
            upperbound[t - firstMarkerIndex] = scanASide(false, t);
            System.out.println("upperbound[t - firstMarkerIndex]");
            System.out.println(upperbound[t - firstMarkerIndex]);
            theUpperAndLowerBound[t-firstMarkerIndex][0] = lowerbound[t-firstMarkerIndex];
            theUpperAndLowerBound[t-firstMarkerIndex][1] = upperbound[t-firstMarkerIndex];
             
            //find the marker with highest ss in the CI
            SNP bestsnp = null;
            double bestss = 0;
            ModelEffect besteffect = null;
            ModelEffect currentme = currentModel.remove(t);
            sflm = new SweepFastLinearModel(currentModel, y);
            PartitionedLinearModel plm = new PartitionedLinearModel(currentModel, sflm);
            
            			//NEW CODE: create the appropriate marker effect
            for (int m = lowerbound[t - firstMarkerIndex]; m <= upperbound[t - firstMarkerIndex]; m++) {
                ModelEffect markerEffect = null;
                SNP testsnp = new SNP(myGenotype.siteName(m), myGenotype.chromosome(m), myGenotype.chromosomalPosition(m), m);
                
                //if the genotype table has reference probabilities use those; otherwise, if it has genotype calls, use those; allele probabilities are not implemented
                //for genotype use an additive model for now
                if (myGenotype.hasReferenceProbablity()) {
                	double[] cov = new double[numberNotMissing];
                	int count = 0;
                	float[] siteReferenceProb = myData.referenceProb(m);
                	for (int i = 0; i < totalNumber; i++) {
                		if (!missing.fastGet(i)) cov[count++] = siteReferenceProb[i];
                	}

                	if (isNested) {
                		CovariateModelEffect cme = new CovariateModelEffect(cov);
                		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
                		markerEffect.setID(testsnp);
                	} else {
                		markerEffect = new CovariateModelEffect(cov, testsnp);
                	}
 
                } else if (myGenotype.hasGenotype()) {
                	byte minor = myGenotype.minorAllele(m);
                	double[] cov = new double[numberNotMissing];
                	byte[] siteGeno = AssociationUtils.getNonMissingBytes( myData.genotypeAllTaxa(m), missing);
                	for (int i = 0; i < numberNotMissing; i++) {
                		byte[] diploidAlleles = GenotypeTableUtils.getDiploidValues(siteGeno[i]);
                		if (diploidAlleles[0] == minor) cov[i] += 0.5;
                		if (diploidAlleles[1] == minor) cov[i] += 0.5;
                	}
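                	// additive coding: each copy of the minor allele adds 0.5, so homozygous major = 0,
                	// heterozygote = 0.5, homozygous minor = 1; a missing genotype call adds nothing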
                	
                	if (isNested) {
                		CovariateModelEffect cme = new CovariateModelEffect(cov);
                		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
                		markerEffect.setID(testsnp);
                	} else {
                		markerEffect = new CovariateModelEffect(cov, testsnp);
                	}
               	
                } else {
                	throw new IllegalArgumentException("No genotypes or reference probabilities in the data set.");
                }
                
                plm.testNewModelEffect(markerEffect);
                double modelss = plm.getModelSS();
                if (modelss > bestss) {
                    bestss = modelss;
                    bestsnp = testsnp;
                    besteffect = markerEffect;
                }
            }
            
            currentModel.add(t, besteffect);
            ////
            //did the best marker change?
            boolean markerchanged = false;
            SNP currentSnp = (SNP) currentme.getID();
            if (currentSnp.index != bestsnp.index) markerchanged = true;

            //if this marker is different than the one tested, reset the CI
            if (markerchanged) {
                lowerbound[t - firstMarkerIndex] = scanASide(true, t);
                upperbound[t - firstMarkerIndex] = scanASide(false, t);
                theUpperAndLowerBound[t-firstMarkerIndex][0] = lowerbound[t-firstMarkerIndex];
                theUpperAndLowerBound[t-firstMarkerIndex][1] = upperbound[t-firstMarkerIndex];
            }
            System.out.println("upperAndLowerBound[t-firstMarkerIndex][0]");
            System.out.println(theUpperAndLowerBound[t-firstMarkerIndex][0]); 
            System.out.println("upperAndLowerBound[t-firstMarkerIndex][1]");
            System.out.println(theUpperAndLowerBound[t-firstMarkerIndex][1]);
        }
        
        //report the results including CI's
        //writeModelWithCI2File(lowerbound, upperbound); Turn this into a global matrix of upper and lower bounds
        SweepFastLinearModel theLinearModel = new SweepFastLinearModel(currentModel, y);
        appendAnovaResultsWithCI(theLinearModel);
        appendSiteEffectEstimatesWithCI(theLinearModel);

 
    }        
        
  
    
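    // Finds one side of the support interval for the model term at whichModelTerm: starting from
    // that term's SNP, step one site at a time (to the left if left is true, otherwise to the right),
    // temporarily add the neighboring site to the model, and stop when the marginal F-test p-value
    // of the original term falls below alpha or the scan runs off the chromosome or the map.
    // The site index at which the scan stops is returned as the bound.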
    private int scanASide(boolean left, int whichModelTerm) {

    	double alpha = 0.05;
    	int minIndex = 0;
    	int maxIndex = myGenotype.numberOfSites() - 1;
    	int incr;
    	if (left) {
    		incr = -1;
    	} else {
    		incr = 1;
    	}

    	SNP modelsnp = (SNP) currentModel.get(whichModelTerm).getID();
    	int markerIndex = modelsnp.index;
    	Chromosome chr = modelsnp.locus;
    	//int chrAsInt = chr.getChromosomeNumber();
    	boolean boundfound = false;
    	int testIndex = markerIndex;
    	int lastterm = currentModel.size();

    	///
    	do{
    		testIndex += incr;
    		if(testIndex < minIndex || testIndex > maxIndex){
    			testIndex -= incr;
    			boundfound = true;
    			break;
    		}
    		ModelEffect markerEffect = null;
    		SNP snp = new SNP(myGenotype.siteName(testIndex), myGenotype.chromosome(testIndex), myGenotype.chromosomalPosition(testIndex), testIndex);
    		Chromosome chrOfTestedSnp = snp.locus;
    		//int chrOfTestedSnpAsInt = chrOfTestedSnp.getChromosomeNumber();
    		if (snp == null || !chrOfTestedSnp.equals(chr)) {
    		//if (snp == null || chrOfTestedSnpAsInt != chrAsInt) {
    			testIndex -= incr;
    			boundfound = true;
    		} else {
    			//create the appropriate marker effect

    			
                //if the genotype table has reference probabilities use those; otherwise, if it has genotype calls, use those; allele probabilities are not implemented
                //for genotype use an additive model for now
                if (myGenotype.hasReferenceProbablity()) {
                	double[] cov = new double[numberNotMissing];
                	int count = 0;
                	float[] referenceProb = myData.referenceProb(testIndex);
                	for (int i = 0; i < totalNumber; i++) {
                		if (!missing.fastGet(i)) cov[count++] = referenceProb[i];
                	}

                	if (isNested) {
                		CovariateModelEffect cme = new CovariateModelEffect(cov);
                		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
                		markerEffect.setID(snp);
                	} else {
                		markerEffect = new CovariateModelEffect(cov, snp);
                	}
 
                } else if (myGenotype.hasGenotype()) {
                	byte minor = myGenotype.minorAllele(testIndex);
                	double[] cov = new double[numberNotMissing];
                	byte[] siteGeno = AssociationUtils.getNonMissingBytes(myData.genotypeAllTaxa(testIndex), missing);
                	for (int i = 0; i < numberNotMissing; i++) {
                		byte[] diploidAlleles = GenotypeTableUtils.getDiploidValues(siteGeno[i]);
                		if (diploidAlleles[0] == minor) cov[i] += 0.5;
                		if (diploidAlleles[1] == minor) cov[i] += 0.5;
                	}
                	
                	if (isNested) {
                		CovariateModelEffect cme = new CovariateModelEffect(cov);
                		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
                		markerEffect.setID(snp);
                	} else {
                		markerEffect = new CovariateModelEffect(cov, snp);
                	}
               	
                } else {
                	throw new IllegalArgumentException("No genotypes or reference probabilities in the data set.");
                }

    			///
    			currentModel.add(markerEffect);
    			SweepFastLinearModel sflm = new SweepFastLinearModel(currentModel, y);
    			double[] snpssdf = sflm.getMarginalSSdf(whichModelTerm);
    			double[] errorssdf = sflm.getResidualSSdf();
    			double F = snpssdf[0] / snpssdf[1] / errorssdf[0] * errorssdf[1];
    			double p;
    			try {
    				p = LinearModelUtils.Ftest(F, snpssdf[1], errorssdf[1]);
    			} catch(Exception e) {
    				p = 1;
    			}     

    			if(p < alpha){
    				boundfound = true;
    			}
    			currentModel.remove(lastterm);
    		}    
    	}  while (!boundfound && testIndex > minIndex && testIndex < maxIndex);      
    	////      
    	return testIndex;
    }      
        
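	// One forward selection step: every site not in excludedSNPs is tested as an additive covariate
	// (nested within the nesting factor when isNested is true), and the site that most improves the
	// chosen criterion is kept if it also beats the enter limit (pvalue mode) or the current global
	// best criterion value. Returns true if a marker was added and the model may still grow.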
	public boolean forwardStep() {
		double bestss = 0;
		double bestbic = Double.MAX_VALUE;
		double bestmbic = Double.MAX_VALUE;
		double bestaic = Double.MAX_VALUE;
		ModelEffect besteffect = null;

		SweepFastLinearModel sflm = new SweepFastLinearModel(currentModel, y);
		PartitionedLinearModel plm = new PartitionedLinearModel(currentModel, sflm);
		int numberOfSites = myGenotype.numberOfSites();

		System.out.println("We are in forwardStep()");

		double[] temperrorssdf = sflm.getResidualSSdf();
		System.out.println("The value of errorss before the loop in forwardStep() is: " + temperrorssdf[0]);
		for (int s = 0; s < numberOfSites; s++) if (!excludedSNPs.contains(s)) {
			//create the appropriate marker effect
			ModelEffect markerEffect = null;
			SNP snp = new SNP(myGenotype.siteName(s), myGenotype.chromosome(s), myGenotype.chromosomalPosition(s), s);

            //if the genotype table has reference probabilities use those; otherwise, if it has genotype calls, use those; allele probabilities are not implemented
            //for genotype use an additive model for now
            if (myGenotype.hasReferenceProbablity()) {
            	double[] cov = new double[numberNotMissing];
            	int count = 0;
            	float[] referenceProb = myData.referenceProb(s);
            	for (int i = 0; i < totalNumber; i++) {
            		if (!missing.fastGet(i)) cov[count++] = referenceProb[i];
            	}

            	if (isNested) {
            		CovariateModelEffect cme = new CovariateModelEffect(cov);
            		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
            		markerEffect.setID(snp);
            	} else {
            		markerEffect = new CovariateModelEffect(cov, snp);
            	}

            } else if (myGenotype.hasGenotype()) {
            	byte minor = myGenotype.minorAllele(s);
            	double[] cov = new double[numberNotMissing];
            	byte[] siteGeno = AssociationUtils.getNonMissingBytes(myData.genotypeAllTaxa(s), missing);
            	for (int i = 0; i < numberNotMissing; i++) {
            		byte[] diploidAlleles = GenotypeTableUtils.getDiploidValues(siteGeno[i]);
            		if (diploidAlleles[0] == minor) cov[i] += 0.5;
            		if (diploidAlleles[1] == minor) cov[i] += 0.5;
            	}
            	
            	if (isNested) {
            		CovariateModelEffect cme = new CovariateModelEffect(cov);
            		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
            		markerEffect.setID(snp);
            	} else {
            		markerEffect = new CovariateModelEffect(cov, snp);
            	}
           	
            } else {
            	throw new IllegalArgumentException("No genotypes or reference probabilities in the data set.");
            }

			plm.testNewModelEffect(markerEffect);
			double modelss = plm.getModelSS();

			currentModel.add(markerEffect); //Temporary; Remove if wrong
			SweepFastLinearModel sflm2 = new SweepFastLinearModel(currentModel, y);
			
			int n = numberNotMissing;
			//Calculate the BIC
			double [] errorss = sflm2.getResidualSSdf();
			double [] modeldf = sflm2.getFullModelSSdf();
			double pForBIC  = modeldf[1]; 
			double bic = (n*Math.log(errorss[0]/n)) + (pForBIC*Math.log(n)) ;
			double aic = (n*Math.log(errorss[0]/n)) + (2*pForBIC) ;
			if(s==2) System.out.println("The value of errorss in forwardStep() is: " + errorss[0]);

			//Calculate the mBIC
			double numberOfTwoWayInteractions = (double) numberOfSites * (numberOfSites - 1) / 2;
			double pForMbic = modeldf[1];
			double qForMbic = 0; //This is going to be the number of two-way interaction terms. For now, this is not implemented, so it is set to zero.
			double mbic;
			if(qForMbic == 0){
				mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1)));
			}else{
				mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1))) + 
						(2*qForMbic*(Math.log((numberOfTwoWayInteractions/2.2)-1) )); 
			}


			switch(modelType){
			case pvalue:
				test = modelss > bestss;
				break;
			case bic:
				test = bic < bestbic;
				break;
			case mbic:
				test = mbic < bestmbic;
				break;
			case aic:
				test = aic < bestaic;
				break;
			}
			if (test) {
				bestmbic = mbic;
				bestbic = bic;
				bestaic = aic;
				bestss = modelss;
				besteffect = markerEffect;
			}
			currentModel.remove(markerEffect); //Temporary; Remove if wrong
		}

		//if the p-value for the selected SNP is less than the enter limit, add it to the model and recalculate the model solution
		plm.testNewModelEffect(besteffect);
		double[] Fp = plm.getFp();
		if(modelType == MODEL_TYPE.pvalue){
			if ( Fp[1] < enterlimit) {
				currentModel.add(besteffect);
				if (currentModel.size() == maxNumberOfMarkers + numberOfBaseModelEffects) return false;
				return true;
			} else { 
				return false; 
			}     
		} else if(modelType == MODEL_TYPE.bic){
			if(bestbic < globalbestbic){
				globalbestbic = bestbic;
				currentModel.add(besteffect);
				if (currentModel.size() == maxNumberOfMarkers + numberOfBaseModelEffects) return false;
				return true;
			} else{
				return false;
			} 
		} else if(modelType == MODEL_TYPE.mbic){
			if(bestmbic < globalbestmbic){
				globalbestmbic = bestmbic;
				//System.out.println("***********The value of globalbestmbic at the end of forwardStep() is " + globalbestmbic);
				currentModel.add(besteffect);
				if (currentModel.size() == maxNumberOfMarkers + numberOfBaseModelEffects) return false;
				return true;
			} else{
				return false;
			}
		} else if(modelType == MODEL_TYPE.aic){
			if(bestaic < globalbestaic){
				globalbestaic = bestaic;
				//System.out.println("***********The value of globalbestmbic at the end of forwardStep() is " + globalbestmbic);
				currentModel.add(besteffect);
				if (currentModel.size() == maxNumberOfMarkers + numberOfBaseModelEffects) return false;
				return true;
			} else{
				return false;
			} 
		}  else{
			return false;
		}  


	}

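	// One backward elimination step: in pvalue mode the marker term with the largest marginal
	// p-value is dropped if that p-value is at least exitlimit; for the information criteria,
	// the term whose removal gives the lowest criterion value is dropped if that value improves
	// on the current global best. Returns true if a term was removed.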
	public boolean backwardStep() {
		int numberOfTerms = currentModel.size();
		if (numberOfTerms <= numberOfBaseModelEffects) return false;
		double bestbic = Double.MAX_VALUE;
		double bestmbic = Double.MAX_VALUE;
		double bestaic = Double.MAX_VALUE;

		int n = y.length;
		int numberOfSites = myGenotype.numberOfSites();

		SweepFastLinearModel sflm0 = new SweepFastLinearModel(currentModel, y);

		//find the model term (snps only) with the largest p-value
		double maxp = 0;
		double minF= -1;
		int maxterm = 0;
		ModelEffect worstMarkerEffect = null;
		double[] errorssdf = sflm0.getResidualSSdf();

		for (int t = numberOfBaseModelEffects; t < numberOfTerms; t++) {
			double bic;
			double mbic;
			double aic;
			ModelEffect markerEffect = null;
			double[] termssdf = sflm0.getIncrementalSSdf(t);
			double F = termssdf[0]/termssdf[1]/errorssdf[0]*errorssdf[1];
			double p;
			if(modelType != MODEL_TYPE.pvalue){

				ModelEffect meTest1 = currentModel.remove(t);
				SNP snpRemoved = (SNP) meTest1.getID(); 
				System.out.println("CORRECT The SNP just removed is: " + snpRemoved);

				SweepFastLinearModel sflm = new SweepFastLinearModel(currentModel, y); 
				//Calculate the BIC
				double [] errorss = sflm.getResidualSSdf();
				double [] modeldf = sflm.getFullModelSSdf();
				double pForBIC  = modeldf[1]; 
				bic = (n*Math.log(errorss[0]/n)) + (pForBIC*Math.log(n)) ; 
				aic = (n*Math.log(errorss[0]/n)) + (2*pForBIC) ;

				//Calculate the mBIC
				double numberOfTwoWayInteractions = (double) (numberOfSites*(numberOfSites-1))/2;			
				double pForMbic = modeldf[1];
				double qForMbic = 0; //This is going to be the number of two-way interaction terms. For now, this is not implemented, so it is set to zero.
				if(qForMbic == 0){
					mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1)));
				}else{
					mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1))) + 
							(2*qForMbic*(Math.log((numberOfTwoWayInteractions/2.2)-1) )); 
				}


				currentModel.add(t, meTest1);

				sflm = new SweepFastLinearModel(currentModel, y); 


			} else{
				bic = Double.MAX_VALUE;
				mbic = Double.MAX_VALUE;
				aic = Double.MAX_VALUE;
			}

			try{
				p = LinearModelUtils.Ftest(F, termssdf[1], errorssdf[1]);  
			} catch (Exception e){
				p = Double.NaN;
			}


			switch(modelType){
			case pvalue:
				try {
					if (p > maxp) {
						maxterm = t;
						maxp = p;
						minF = F; 
					}
				} catch(Exception e){

				}
				break;
			case bic:
				if(bic < bestbic){
					bestbic = bic;
					bestaic = aic;
					bestmbic = mbic;
					maxterm = t;  //Actually, this should be "minterm" because we are finding the model with the minimum BIC, but it is kept as "maxterm" for simplicity of the calculations
					worstMarkerEffect = markerEffect;
					maxp = p;
					minF = F; 
				}
				break;
			case mbic:
				if(mbic < bestmbic){
					bestbic = bic;
					bestaic = aic;
					bestmbic = mbic;
					maxterm = t;
					worstMarkerEffect = markerEffect;
					maxp = p;
					minF = F;
				}
				break;
			case aic:
				if(aic < bestaic){
					bestbic = bic;
					bestaic = aic;
					bestmbic = mbic;
					maxterm = t;
					worstMarkerEffect = markerEffect;
					maxp = p;
					minF = F;
				}
				break;
			} 

		}


		//if that maxp is >= exitlimit, then remove maxterm from the model, recalculate the model, and return true;
		switch(modelType){
		case pvalue:
			test = maxp >= exitlimit;
			break;
		case bic:
			test = bestbic < globalbestbic;
			break;
		case mbic:
			test = bestmbic < globalbestmbic;
			break;
		case aic:
			test = bestaic < globalbestaic;
			break;
		}


		if ((test)&&(maxterm != 0)) {
			ModelEffect me = currentModel.remove(maxterm);
			globalbestbic = bestbic;
			globalbestmbic = bestmbic;
			globalbestaic = bestaic;
			return true;
		}

		return false;
	}

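	// Permutation test used to set an empirical significance threshold: residuals from the base
	// model (mean + factors + covariates) are shuffled and added back to the predicted values,
	// every marker is tested against each permuted data set, and the smallest p-value across
	// markers is recorded per permutation. The alpha quantile of those minima is then taken as
	// the genome-wide enter limit.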
	public void runPermutationTestNoMissingData() {
		ArrayList<double[]> permutedData = new ArrayList<>();
		DoubleMatrix PvalueVectorAcrossMarkers = null;

		int indexOfThreshold = (int) (alpha*numberOfPermutations);

		int numberOfSites = myGenotype.numberOfSites();
		DoubleMatrix[][] Xmatrices;
		System.out.println("-----------------Running permutations...----------------");

		int numberOfObs = y.length;
		double totalSS = 0;
		for (int i = 0; i < numberOfObs; i++) totalSS += y[i] * y[i];

		ArrayList<double[]> baseModelSSdf = new ArrayList<>();
		double[] permTotalSS = new double[numberOfPermutations];

		int[] mean = new int[numberOfObs];
		FactorModelEffect meanME = new FactorModelEffect(mean, false, "mean");
		int columnNumber = 2;
		columnNumber += covariateAttributeList.size();
		columnNumber += factorAttributeList.size();

		Xmatrices = new DoubleMatrix[1][columnNumber]; //intercept + factor and covariate terms + one column block for the marker being tested

		Xmatrices[0][0] = meanME.getX();
		int effectCount = 1;
		
		for (PhenotypeAttribute attr : factorAttributeList) {
			CategoricalAttribute ca = (CategoricalAttribute) attr;
			ArrayList<String> ids = new ArrayList<>();
			String[] labels = AssociationUtils.getNonMissingValues(ca.allLabels(), missing);
			int[] levels = ModelEffectUtils.getIntegerLevels(labels, ids);
			FactorModelEffect fme = new FactorModelEffect(levels, true, new Object[]{attr.name(), ids});
			if (isNested && myPhenotype.indexOfAttribute(attr) == nestingFactorIndex) {
				nestingEffect = fme;
				nestingFactorNames = ids; 
			}
			Xmatrices[0][effectCount++] = fme.getX();
		}

		for (PhenotypeAttribute attr : covariateAttributeList) {
			NumericAttribute na = (NumericAttribute) attr;
			double[] cov = AssociationUtils.getNonMissingDoubles(na.floatValues(), missing);
			CovariateModelEffect cme = new CovariateModelEffect(cov, na.name());
			Xmatrices[0][effectCount++] = cme.getX();
		}

		SweepFastLinearModel baseSflm = new SweepFastLinearModel(currentModel, y);


		DoubleMatrix resAsDoubleMatrix = baseSflm.getResiduals();
		DoubleMatrix predAsDoubleMatrix = baseSflm.getPredictedValues();
		double[] res = new double[numberOfObs];
		double[] pred = new double[numberOfObs];
		for(int i = 0; i < numberOfObs; i++){
			res[i] = resAsDoubleMatrix.get(i,0);
			pred[i] = predAsDoubleMatrix.get(i,0);
		}

		//permute data
		double[] minP = new double[numberOfPermutations];

		DoubleMatrix minPAsDoubleMatrix = null;
		for (int p = 0; p < numberOfPermutations; p++) {
			minP[p] = 1;
			double[] pdata = new double[numberOfObs];
			System.arraycopy(res, 0, pdata, 0, numberOfObs);
			BasicShuffler.shuffle(pdata);

			for(int i = 0; i < numberOfObs; i++){
				pdata[i] = pdata[i] + pred[i];
			} //Add the predicted value back to the residual
			totalSS = 0;
			for (int i = 0; i < numberOfObs; i++) totalSS += pdata[i] * pdata[i];
			permTotalSS[p] = totalSS;
			permutedData.add(pdata); 

			SweepFastLinearModel sflm = new SweepFastLinearModel(currentModel, pdata);
			baseModelSSdf.add(sflm.getFullModelSSdf());
		}


		for (int m = 0; m < numberOfSites; m++) {

			//create the appropriate marker effect
			ModelEffect markerEffect = null;
			SNP snp = new SNP(myGenotype.siteName(m), myGenotype.chromosome(m), myGenotype.chromosomalPosition(m), m);

            //if the genotype table has reference probabilities use those; otherwise, if it has genotype calls, use those; allele probabilities are not implemented
            //for genotype use an additive model for now
            if (myGenotype.hasReferenceProbablity()) {
            	double[] cov = new double[numberNotMissing];
            	int count = 0;
            	float[] referenceProb = myData.referenceProb(m);
            	for (int i = 0; i < totalNumber; i++) {
            		if (!missing.fastGet(i)) cov[count++] = referenceProb[i];
            	}

            	if (isNested) {
            		CovariateModelEffect cme = new CovariateModelEffect(cov);
            		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
            		markerEffect.setID(snp);
            	} else {
            		markerEffect = new CovariateModelEffect(cov, snp);
            	}

            } else if (myGenotype.hasGenotype()) {
            	byte minor = myGenotype.minorAllele(m);
            	double[] cov = new double[numberNotMissing];
            	byte[] siteGeno = AssociationUtils.getNonMissingBytes(myData.genotypeAllTaxa(m), missing);
            	for (int i = 0; i < numberNotMissing; i++) {
            		byte[] diploidAlleles = GenotypeTableUtils.getDiploidValues(siteGeno[i]);
            		if (diploidAlleles[0] == minor) cov[i] += 0.5;
            		if (diploidAlleles[1] == minor) cov[i] += 0.5;
            	}
            	
            	if (isNested) {
            		CovariateModelEffect cme = new CovariateModelEffect(cov);
            		markerEffect = new NestedCovariateModelEffect(cme, nestingEffect);
            		markerEffect.setID(snp);
            	} else {
            		markerEffect = new CovariateModelEffect(cov, snp);
            	}
           	
            } else {
            	throw new IllegalArgumentException("No genotypes or reference probabilities in the data set.");
            }
			
			Xmatrices[0][columnNumber-1] = markerEffect.getX(); //columnNumber-1 because java
			//starts counting things from 0
			DoubleMatrix X = DoubleMatrixFactory.DEFAULT.compose(Xmatrices);

			currentModel.add(markerEffect);
			SweepFastLinearModel sflm = new SweepFastLinearModel(currentModel, y);         
			double[] modelSSdf = sflm.getFullModelSSdf();
			DoubleMatrix G = sflm.getInverseOfXtX();

			for (int p = 0; p < numberOfPermutations; p++) {
				double[] pdata = permutedData.get(p);
				DoubleMatrix yperm = DoubleMatrixFactory.DEFAULT.make(numberOfObs, 1, pdata);
				totalSS = permTotalSS[p];

				DoubleMatrix Xty = X.crossproduct(yperm);
				double[] reducedSSdf = baseModelSSdf.get(p);
				double fullSS = Xty.crossproduct(G.mult(Xty)).get(0, 0);
				double fulldf = modelSSdf[1];
				double markerSS = fullSS - reducedSSdf[0];
				double markerdf = fulldf - reducedSSdf[1];
				double errorSS = totalSS - fullSS;
				double errordf = numberOfObs - fulldf;
				double F = markerSS / markerdf / errorSS * errordf;
				double probF;
				try {
					probF = LinearModelUtils.Ftest(F, markerdf, errordf);
				} catch(Exception e) {
					probF = 1;
				}
				minP[p] = probF;
				minPAsDoubleMatrix = DoubleMatrixFactory.DEFAULT.make(minP.length, 1, minP);
			}
			if(m == 0){
				PvalueVectorAcrossMarkers = minPAsDoubleMatrix;
			}else{
				PvalueVectorAcrossMarkers = PvalueVectorAcrossMarkers.concatenate(minPAsDoubleMatrix, false);//Change this to a DoubleMatrix; transpose it
				//for the next step                            
			}

			currentModel.remove(markerEffect);
		}

		System.out.println(PvalueVectorAcrossMarkers.toString());
		//for each permutation, record the smallest p-value observed across all markers
		for (int p = 0; p < numberOfPermutations; p++) {
			double minPvalueForThisPermutation = 1;
			for (int m = 0; m < numberOfSites; m++) {
				double pval = PvalueVectorAcrossMarkers.get(p, m);
				if (pval < minPvalueForThisPermutation) minPvalueForThisPermutation = pval;
			}
			minPvalues[p] = minPvalueForThisPermutation;
		}

		//the alpha quantile of the sorted per-permutation minima becomes the empirical enter limit
		Arrays.sort(minPvalues);
		enterlimit = minPvalues[indexOfThreshold];
		System.out.println("The permutation-based enter limit is " + enterlimit);
	}

	public TableReport getPermutationReport() {
		//one row per permutation, holding that permutation's minimum p-value across markers
		LinkedList<Object[]> pValueReportTable = new LinkedList<>();
                //Object[] pValueReportRow = new Object[numberOfPermutations];
                for(int i=0; i < numberOfPermutations; i++){
                    Object[] pValueReportRow = new Object[1];
                    pValueReportRow[0] = new Double(minPvalues[i]);          
                    //System.out.println("minPvalues[i]" + minPvalues[i]);
                    pValueReportTable.add(pValueReportRow);
                }
                
                //pValueReportTable.add(pValueReportRow);
               	Object[][] table = new Object[pValueReportTable.size()][];
		pValueReportTable.toArray(table);                
		//single column: the minimum p-value for each permutation
		String reportName = numberOfPermutations + " Permutations for "  + datasetName;
		String[] reportHeader = new String[]{"P-value"};

		return new SimpleTableReport(reportName, reportHeader, table);
	}       
        
        
	public LinkedList<Object[]> createReportRowsFromCurrentModel(SweepFastLinearModel sflm) {
		String traitname = dataAttributeList.get(currentPhenotypeIndex).name();
		int ncol = anovaReportHeader.length;
		LinkedList<Object[]> reportTable = new LinkedList<>();
		double[] residualSSdf = sflm.getResidualSSdf();
		int n = y.length;
		int numberOfSites = myGenotype.numberOfSites();


		//Calculate the BIC
		double [] errorss = sflm.getResidualSSdf();
		double [] modeldf = sflm.getFullModelSSdf();
		double pForBIC  = modeldf[1];
		double bic = (n*Math.log(errorss[0]/n)) + (pForBIC*Math.log(n)) ; 
		double aic = (n*Math.log(errorss[0]/n)) + (2*pForBIC) ;

		System.out.println("error ss is " + errorss[0]);
		System.out.println("pForBIC is " + pForBIC);
		System.out.println("n is " + n);


		//Calculate the mBIC
		double numberOfTwoWayInteractions = (double) (numberOfSites*(numberOfSites-1))/2;			
		double pForMbic = modeldf[1];
		double qForMbic = 0; //This is going to be the number of two-way interaction terms. For now, this is not implemented, so it is set to zero.
		double mbic;
		if(qForMbic == 0){
			mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1)));
		}else{
			mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1))) + 
					(2*qForMbic*(Math.log((numberOfTwoWayInteractions/2.2)-1) )); 
		}
                
		int effectPtr = 0;
		for (ModelEffect me : currentModel) {
			Object[] reportRow = new Object[ncol];
			int ptr = 0;
			reportRow[ptr++] = traitname;

			if (me.getID() instanceof SNP) {
				SNP snp = (SNP) me.getID();
				reportRow[ptr++] = snp.name;
				reportRow[ptr++] = snp.locus.getName();
				reportRow[ptr++] = Integer.toString(snp.position);
			} else {
				if (me.getID() instanceof String) reportRow[ptr++] = me.getID().toString();
				else if (me instanceof FactorModelEffect) reportRow[ptr++] = ((Object[]) me.getID())[0].toString();
				else reportRow[ptr++] = me.getID().toString();
				reportRow[ptr++] = "--";
				reportRow[ptr++] = "--";
			}
			double[] effectSSdf = sflm.getMarginalSSdf(effectPtr);
			double ms = effectSSdf[0] / effectSSdf[1];
			double Fval = ms / residualSSdf[0] * residualSSdf[1];
			double pval;
			try {
				pval = LinearModelUtils.Ftest(Fval, effectSSdf[1], residualSSdf[1]);
			} catch (Exception e) {
				pval = Double.NaN;
			}
			reportRow[ptr++] = new Integer((int) effectSSdf[1]);
			reportRow[ptr++] = new Double(effectSSdf[0]);
			reportRow[ptr++] = new Double(ms);
			reportRow[ptr++] = new Double(Fval);
			reportRow[ptr++] = new Double(pval);
			reportRow[ptr++] = new Double(bic);
			reportRow[ptr++] = new Double(mbic);
			reportRow[ptr++] = new Double(aic);
			double[] modelSSdf = sflm.getModelcfmSSdf();
			reportRow[ptr++] = new Double(modelSSdf[0]/(modelSSdf[0] + residualSSdf[0]));
			reportTable.add(reportRow);
			effectPtr++;
		}
		int ptr = 0;
		Object[] reportRow = new Object[ncol];
		reportRow[ptr++] = traitname;
		reportRow[ptr++] = "Error";
		reportRow[ptr++] = "--";
		reportRow[ptr++] = "--";
		reportRow[ptr++] = new Integer((int) residualSSdf[1]);
		reportRow[ptr++] = new Double(residualSSdf[0]);
		reportRow[ptr++] = new Double(residualSSdf[0]/residualSSdf[1]);
		reportRow[ptr++] = new Double(Double.NaN);
		reportRow[ptr++] = new Double(Double.NaN);
		reportTable.add(reportRow);

		return reportTable;
	}

	public LinkedList<Object[]> createReportRowsFromCurrentModelAfterScanCI(SweepFastLinearModel sflm) {
		String traitname = dataAttributeList.get(currentPhenotypeIndex).name();
		int ncol = anovaReportWithCIHeader.length;
		LinkedList<Object[]> reportTable = new LinkedList<>();
		double[] residualSSdf = sflm.getResidualSSdf();
		int n = y.length;
		int numberOfSites = myGenotype.numberOfSites();
		int firstMarkerIndex = 1;

		firstMarkerIndex += factorAttributeList.size();
		firstMarkerIndex += covariateAttributeList.size();
		
		//Calculate the BIC
		double [] errorss = sflm.getResidualSSdf();
		double [] modeldf = sflm.getFullModelSSdf();
		double pForBIC  = modeldf[1];
		double bic = (n*Math.log(errorss[0]/n)) + (pForBIC*Math.log(n)) ; 
		double aic = (n*Math.log(errorss[0]/n)) + (2*pForBIC) ;

		System.out.println("error ss is " + errorss[0]);
		System.out.println("pForBIC is " + pForBIC);
		System.out.println("n is " + n);


		//Calculate the mBIC
		double numberOfTwoWayInteractions = (double) (numberOfSites*(numberOfSites-1))/2;			
		double pForMbic = modeldf[1];
		double qForMbic = 0; //This is going to be the number of two-way interaction terms. For now, this is not implemented, so it is set to zero.
		double mbic;
		if(qForMbic == 0){
			mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1)));
		}else{
			mbic = (n*Math.log(errorss[0])) + ((pForMbic+qForMbic)*Math.log(n)) + (2*pForMbic*(Math.log((numberOfSites/2.2)-1))) + 
					(2*qForMbic*(Math.log((numberOfTwoWayInteractions/2.2)-1) )); 
		}

		int effectPtr = 0;
		for (ModelEffect me : currentModel) {
			Object[] reportRow = new Object[ncol];
			int ptr = 0;
			reportRow[ptr++] = traitname;

			if (me.getID() instanceof SNP) {
				SNP snp = (SNP) me.getID();
				reportRow[ptr++] = snp.name;
				reportRow[ptr++] = snp.locus.getName();
				reportRow[ptr++] = Integer.toString(snp.position);
			} else {
				if (me.getID() instanceof String) reportRow[ptr++] = me.getID().toString();
				else if (me instanceof FactorModelEffect) reportRow[ptr++] = ((Object[]) me.getID())[0].toString();
				else reportRow[ptr++] = me.getID().toString();
				reportRow[ptr++] = "--";
				reportRow[ptr++] = "--";
			}
			double[] effectSSdf = sflm.getMarginalSSdf(effectPtr);
			double ms = effectSSdf[0] / effectSSdf[1];
			double Fval = ms / residualSSdf[0] * residualSSdf[1];
			double pval;
			try {
				pval = LinearModelUtils.Ftest(Fval, effectSSdf[1], residualSSdf[1]);
			} catch (Exception e) {
				pval = Double.NaN;
			}
			reportRow[ptr++] = new Integer((int) effectSSdf[1]);
			reportRow[ptr++] = new Double(effectSSdf[0]);
			reportRow[ptr++] = new Double(ms);
			reportRow[ptr++] = new Double(Fval);
			reportRow[ptr++] = new Double(pval);
			reportRow[ptr++] = new Double(bic);
			reportRow[ptr++] = new Double(mbic);
			reportRow[ptr++] = new Double(aic);
			double[] modelSSdf = sflm.getModelcfmSSdf();
			reportRow[ptr++] = new Double(modelSSdf[0]/(modelSSdf[0] + residualSSdf[0]));

			//System.out.println("The value of effectPtr is "+effectPtr);

			if (me.getID() instanceof SNP) {
				reportRow[ptr++] = new String(myGenotype.siteName(theUpperAndLowerBound[effectPtr-firstMarkerIndex][0]));
				reportRow[ptr++] = new String(myGenotype.siteName(theUpperAndLowerBound[effectPtr-firstMarkerIndex][1]));

			} else {
				reportRow[ptr++] = "--";
				reportRow[ptr++] = "--";
			}
			reportTable.add(reportRow);
			effectPtr++;
		}
		int ptr = 0;
		Object[] reportRow = new Object[ncol];
		reportRow[ptr++] = traitname;
		reportRow[ptr++] = "Error";
		reportRow[ptr++] = "--";
		reportRow[ptr++] = "--";
		reportRow[ptr++] = new Integer((int) residualSSdf[1]);
		reportRow[ptr++] = new Double(residualSSdf[0]);
		reportRow[ptr++] = new Double(residualSSdf[0]/residualSSdf[1]);
		reportRow[ptr++] = new Double(Double.NaN);
		reportRow[ptr++] = new Double(Double.NaN);
		reportTable.add(reportRow);

		return reportTable;
	}

	public Datum createReportFromCurrentModel(SweepFastLinearModel sflm) {
		String traitname = dataAttributeList.get(currentPhenotypeIndex).name();
		LinkedList<Object[]> reportTable = createReportRowsFromCurrentModel(sflm);
		Object[][] table = new Object[reportTable.size()][];
		reportTable.toArray(table);
		String reportName = "ANOVA for " + traitname + ", " + datasetName;
		TableReport tr = new SimpleTableReport(reportName, anovaReportHeader, table);
		return new Datum(reportName, tr, "");
	}

	public Datum createReportFromCurrentModelWithCI(SweepFastLinearModel sflm) {
		String traitname = dataAttributeList.get(currentPhenotypeIndex).name();
		LinkedList<Object[]> reportTable = createReportRowsFromCurrentModelAfterScanCI(sflm);
		Object[][] table = new Object[reportTable.size()][];
		reportTable.toArray(table);
		String reportName = "ANOVA With CI for " + traitname + ", " + datasetName;
		TableReport tr = new SimpleTableReport(reportName, anovaReportWithCIHeader, table);
		return new Datum(reportName, tr, "");
	}
        
	public void appendAnovaResults(SweepFastLinearModel sflm) {
		resultRowsAnova.addAll(createReportRowsFromCurrentModel(sflm));
	}

	public void appendAnovaResultsWithCI(SweepFastLinearModel sflm) {
		resultRowsAnovaWithCI.addAll(createReportRowsFromCurrentModelAfterScanCI(sflm));
	}

	public TableReport getAnovaReport() {
		String reportName = "ANOVA table for " + datasetName;
		Object[][] table = new Object[resultRowsAnova.size()][];
		resultRowsAnova.toArray(table);
		return new SimpleTableReport(reportName, anovaReportHeader, table);
	}

	public TableReport getAnovaReportWithCI() {
		String reportName = "ANOVA table with CI scan for " + datasetName;
		Object[][] table = new Object[resultRowsAnovaWithCI.size()][];
		resultRowsAnovaWithCI.toArray(table);
		return new SimpleTableReport(reportName, anovaReportWithCIHeader, table);
	}
        
	public TableReport getMarkerEffectReport() {
		if (rowsSiteEffectTable.size() == 0) return null;
		//columns are trait, snp name, locus, position, factor value, estimate
		String reportName = "Marker effects for " + datasetName;
		String[] reportHeader = new String[]{"Trait","Snp","Locus","Position","Within","Estimate"};
		Object[][] table = new Object[rowsSiteEffectTable.size()][];
		rowsSiteEffectTable.toArray(table);
		return new SimpleTableReport(reportName, reportHeader, table);
	}

	public TableReport getMarkerEffectReportWithCI() {
		if (rowsSiteEffectTableWithCI.size() == 0) return null;
		//columns are trait, snp name, locus, position, factor value, estimate
		String reportName = "Marker effects for " + datasetName;
		String[] reportHeader = new String[]{"Trait","Snp","Locus","Position","Within","Estimate"};
		Object[][] table = new Object[rowsSiteEffectTableWithCI.size()][];
		rowsSiteEffectTableWithCI.toArray(table);
		return new SimpleTableReport(reportName, reportHeader, table);
	}

	public void appendSiteEffectEstimates(SweepFastLinearModel sflm) {

		//columns are trait, snp name, locus, position, factor value, estimate
		int nBaseModelParameters = 0;
		String traitname = dataAttributeList.get(currentPhenotypeIndex).name();
		for (int i = 0; i < numberOfBaseModelEffects; i++) {
			nBaseModelParameters += currentModel.get(i).getEffectSize();
		}

		double[] beta = sflm.getBeta();
		int parameterIndex = nBaseModelParameters;
		for (int s = numberOfBaseModelEffects; s < currentModel.size(); s++) {
			ModelEffect me = currentModel.get(s);
			if (me instanceof NestedCovariateModelEffect) {
				NestedCovariateModelEffect ncme = (NestedCovariateModelEffect) me;
				FactorModelEffect fme = ncme.getFactorModelEffect();
				SNP snp = (SNP) ncme.getID();
				int n = nestingFactorNames.size();
				for (int i = 0; i < n; i++) {
					Object[] rowValues = new Object[6];
					int ptr = 0;
					rowValues[ptr++] = traitname;
					rowValues[ptr++] = snp.name;
					rowValues[ptr++] = snp.locus.getName();
					rowValues[ptr++] = new Integer(snp.position);
					rowValues[ptr++] = nestingFactorNames.get(i);
					rowValues[ptr++] = new Double(beta[parameterIndex++]);
					rowsSiteEffectTable.add(rowValues);
				}
			} else if (me instanceof CovariateModelEffect) {
				SNP snp = (SNP) me.getID();
				Object[] rowValues = new Object[6];
				int ptr = 0;
				rowValues[ptr++] = traitname;
				rowValues[ptr++] = snp.name;
				rowValues[ptr++] = snp.locus.getName();
				rowValues[ptr++] = new Integer(snp.position);
				rowValues[ptr++] = "NA";
				rowValues[ptr++] = new Double(beta[parameterIndex++]);
				rowsSiteEffectTable.add(rowValues);
			} else if (me instanceof FactorModelEffect) {

			}
		}
	}

        public void appendSiteEffectEstimatesWithCI(SweepFastLinearModel sflm) {

		//columns are trait, snp name, locus, position, factor value, estimate
		int nBaseModelParameters = 0;
		String traitname = dataAttributeList.get(currentPhenotypeIndex).name();
		for (int i = 0; i < numberOfBaseModelEffects; i++) {
			nBaseModelParameters += currentModel.get(i).getEffectSize();
		}

		double[] beta = sflm.getBeta();
		int parameterIndex = nBaseModelParameters;
		for (int s = numberOfBaseModelEffects; s < currentModel.size(); s++) {
			ModelEffect me = currentModel.get(s);
			if (me instanceof NestedCovariateModelEffect) {
				NestedCovariateModelEffect ncme = (NestedCovariateModelEffect) me;
				FactorModelEffect fme = ncme.getFactorModelEffect();
				SNP snp = (SNP) ncme.getID();
				int n = nestingFactorNames.size();
				for (int i = 0; i < n; i++) {
					Object[] rowValues = new Object[6];
					int ptr = 0;
					rowValues[ptr++] = traitname;
					rowValues[ptr++] = snp.name;
					rowValues[ptr++] = snp.locus.getName();
					rowValues[ptr++] = new Integer(snp.position);
					rowValues[ptr++] = nestingFactorNames.get(i);
					rowValues[ptr++] = new Double(beta[parameterIndex++]);
					rowsSiteEffectTableWithCI.add(rowValues);
				}
			} else if (me instanceof CovariateModelEffect) {
				SNP snp = (SNP) me.getID();
				Object[] rowValues = new Object[6];
				int ptr = 0;
				rowValues[ptr++] = traitname;
				rowValues[ptr++] = snp.name;
				rowValues[ptr++] = snp.locus.getName();
				rowValues[ptr++] = new Integer(snp.position);
				rowValues[ptr++] = "NA";
				rowValues[ptr++] = new Double(beta[parameterIndex++]);
				rowsSiteEffectTableWithCI.add(rowValues);
			} else if (me instanceof FactorModelEffect) {

			}
		}
	}

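        // Builds a Phenotype of per-chromosome residuals: for each chromosome, the SNPs on that
        // chromosome are dropped from the current model, the reduced model is refit, and its
        // residuals become a numeric trait named c1, c2, ... The loop below assumes ten
        // chromosomes (as in maize); adjust the bound for other genomes.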
        public Phenotype generateChromosomeResidualsFromCurrentModel() {
            List<PhenotypeAttribute> attributes = new ArrayList<>();
            List<ATTRIBUTE_TYPE> types = new ArrayList<>();

            Taxon[] allTaxa = myPhenotype.taxaAttribute().allTaxa();
            Taxon[] nonmissingTaxa = AssociationUtils.getNonMissingValues(allTaxa, missing);
            attributes.add(new TaxaAttribute(Arrays.asList(nonmissingTaxa)));
            types.add(ATTRIBUTE_TYPE.taxa);
            
            //this next step will include family in the return Phenotype
            //any covariates will not be included
            for (PhenotypeAttribute factor : factorAttributeList) {
                String[] values = ((CategoricalAttribute) factor).allLabels();
                attributes.add(new CategoricalAttribute(factor.name(), AssociationUtils.getNonMissingValues(values, missing)));
                types.add(ATTRIBUTE_TYPE.factor);
            }
            
            for (int c = 0; c < 10; c++) {
                String chrname = String.format("c%d", c + 1);
                
                //create a model without this chromosome
                List<ModelEffect> chrModel = currentModel.stream()
                        .filter(notInChr(c + 1))
                        .collect(Collectors.toList());
                
                SweepFastLinearModel sflm = new SweepFastLinearModel(chrModel, y);
                
                //add the residuals to the Phenotype
                DoubleMatrix resid = sflm.getResiduals();
                float[] data = AssociationUtils.convertDoubleArrayToFloat(resid.to1DArray());
                attributes.add(new NumericAttribute(chrname, data, new OpenBitSet(data.length)));
                types.add(ATTRIBUTE_TYPE.data);
            }
            
            return new PhenotypeBuilder().fromAttributeList(attributes, types).build().get(0);
        }
        
        private Predicate<ModelEffect> notInChr(int chr) {
            return me -> (!(me.getID() instanceof SNP)) || ((me.getID() instanceof SNP) && ( ((SNP) me.getID()).locus.getChromosomeNumber() != chr ));
        }
        
	public void setEnterlimits(double[] enterlimits) {
		this.enterlimits = enterlimits;
	}

	public void setExitlimits(double[] exitlimits) {
		this.exitlimits = exitlimits;
	}

	public void setEnterlimit(double enterlimit) {
		this.enterlimit = enterlimit;
	}

	public void setExitlimit(double exitlimit) {
		this.exitlimit = exitlimit;
	}

	public void setMaxNumberOfMarkers(int maxNumberOfMarkers) {
		this.maxNumberOfMarkers = maxNumberOfMarkers;
	}

	public void setNestingEffectIndex(int nestingFactorIndex) {
		this.nestingFactorIndex = nestingFactorIndex;
	}

	public void setNested(boolean isNested) {
		this.isNested = isNested;
	}
        
	public void setNumberOfPermutations(int numberOfPermutations) {
		this.numberOfPermutations = numberOfPermutations;
		minPvalues = new double[this.numberOfPermutations];
	}
 
	public void setAlpha(double alpha) {
		this.alpha = alpha;
	}
}



