All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.dna.snp.ExportUtils Maven / Gradle / Ivy

/*
 * ExportUtils
 */
package net.maizegenetics.dna.snp;

import com.google.common.base.Joiner;
import com.google.common.collect.Multimap;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.genotypecall.AlleleFreqCache;
import net.maizegenetics.dna.snp.io.VCFUtil;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPOutputStream;

/**
 * Exports Genotype Tables to various file formats.
 *
 * @author Jon Zhang
 * @author Terry Casstevens
 * @author Ed Buckler
 */
public class ExportUtils {

    private static final Logger myLogger = LogManager.getLogger(ExportUtils.class);
    private static FormattedOutput format = FormattedOutput.getInstance();

    private ExportUtils() {
        // Utility Class - do not instantiate.
    }

    /**
     * Write a GenotypeTable to HapMap format with standard settings - unphased
     * single character, tab delimiter, and no progress tracking.
     *
     * @param alignment genotype table
     * @param filename outfile name (will add ".hmp.txt" if needed)
     *
     * @return name of the outfile with the appropriate suffix
     */
    public static String writeToHapmap(GenotypeTable alignment, String filename) {
        return writeToHapmap(alignment, false, filename, '\t', null);
    }

    /**
     * Write a GenotypeTable to HapMap format.
     *
     * @param alignment genotype table
     * @param diploid true uses phased two letter encoding, false one letter
     * unphased
     * @param filename outfile name (will add ".hmp.txt" if needed)
     * @param delimChar delimiter character normally tab
     * @param listener progress listener, (null if unneeded)
     *
     * @return name of the outfile with the appropriate suffix
     */
    public static String writeToHapmap(GenotypeTable alignment, boolean diploid, String filename, char delimChar, ProgressListener listener) {
        return writeToHapmap(alignment, diploid, filename, delimChar, true, listener);
    }

    public static String writeToHapmap(GenotypeTable alignment, boolean diploid, String filename, char delimChar, boolean includeTaxaAnnotations, ProgressListener listener) {
        if (delimChar != ' ' && delimChar != '\t') {
            throw new IllegalArgumentException("Delimiter charater must be either a blank space or a tab.");
        }

        BufferedWriter bw = null;
        try {
            String fullFileName = Utils.addSuffixIfNeeded(filename, ".hmp.txt", new String[]{".hmp.txt", ".hmp.txt.gz"});
            bw = Utils.getBufferedWriter(fullFileName);
            if (includeTaxaAnnotations) {
                for (Taxon taxon : alignment.taxa()) {
                    GeneralAnnotation annotation = taxon.getAnnotation();
                    if ((annotation == null) || (annotation.numAnnotations() == 0)) {
                        continue;
                    }
                    bw.write("##SAMPLE=" + taxon.toStringWithVCFAnnotation() + "\n");
                }
            }
            bw.write(Joiner.on(delimChar).join("rs#", "alleles", "chrom", "pos", "strand", "assembly#", "center", "protLSID",
                    "assayLSID", "panelLSID", "QCcode"));
            bw.write(delimChar);
            int numTaxa = alignment.numberOfTaxa();
            for (int taxa = 0; taxa < numTaxa; taxa++) {
                String sequenceID = alignment.taxaName(taxa).trim();
                bw.write(sequenceID);
                if (taxa != numTaxa - 1) {
                    bw.write(delimChar);
                }
            }
            bw.write("\n");
            int numSites = alignment.numberOfSites();
            for (int site = 0; site < numSites; site++) {
                bw.write(alignment.siteName(site));
                bw.write(delimChar);
                byte[] genotypes = alignment.genotypeAllTaxa(site);
                // which alleles are present among the genotypes
                int[][] sortedAlleles = AlleleFreqCache.allelesSortedByFrequencyNucleotide(genotypes);
                int numAlleles = sortedAlleles[0].length;
                if (numAlleles == 0) {
                    bw.write("NA"); //if data does not exist
                } else if (numAlleles == 1) {
                    bw.write(alignment.genotypeAsString(site, (byte) sortedAlleles[0][0]));
                } else {
                    bw.write(alignment.genotypeAsString(site, (byte) sortedAlleles[0][0]));
                    for (int allele = 1; allele < sortedAlleles[0].length; allele++) {
                        if (sortedAlleles[0][allele] != GenotypeTable.UNKNOWN_ALLELE) {
                            bw.write('/');
                            bw.write(alignment.genotypeAsString(site, (byte) sortedAlleles[0][allele]));  // will write out a third allele if it exists
                        }
                    }
                }
                bw.write(delimChar);
                bw.write(Joiner.on(delimChar).join(alignment.chromosomeName(site), String.valueOf(alignment.chromosomalPosition(site)),
                        "+", "NA", "NA", "NA", "NA", "NA", "NA"));
                bw.write(delimChar);
                for (int taxa = 0; taxa < numTaxa; taxa++) {
                    if (diploid == false) {
                        String baseIUPAC = null;
                        try {
                            baseIUPAC = alignment.diploidAsString(site, genotypes[taxa]);
                        } catch (Exception e) {
                            String[] b = alignment.genotypeAsStringArray(taxa, site);
                            bw.close();
                            myLogger.debug(e.getMessage(), e);
                            throw new IllegalArgumentException("There is no String representation for diploid values: " + b[0] + ":" + b[1] + " getBase(): 0x" + Integer.toHexString(alignment.genotype(taxa, site)) + "\nTry Exporting as Diploid Values.");
                        }
                        if ((baseIUPAC == null) || baseIUPAC.equals("?")) {
                            String[] b = alignment.genotypeAsStringArray(taxa, site);
                            bw.close();
                            throw new IllegalArgumentException("There is no String representation for diploid values: " + b[0] + ":" + b[1] + " getBase(): 0x" + Integer.toHexString(alignment.genotype(taxa, site)) + "\nTry Exporting as Diploid Values.");
                        }
                        bw.write(baseIUPAC);
                    } else {
                        byte[] temp = GenotypeTableUtils.getDiploidValues(genotypes[taxa]);
                        bw.write(alignment.genotypeAsString(site, temp[0]));
                        bw.write(alignment.genotypeAsString(site, temp[1]));
                    }
                    if (taxa != (numTaxa - 1)) {
                        bw.write(delimChar);
                    }
                }
                bw.write("\n");

                if (listener != null) {
                    listener.progress((int) (((double) (site + 1) / (double) numSites) * 100.0), null);
                }
            }
            return fullFileName;
        } catch (Exception e) {
            myLogger.debug(e.getMessage(), e);
            throw new IllegalArgumentException("Error writing Hapmap file: " + filename + ": " + ExceptionUtils.getExceptionCauses(e));
        } finally {
            try {
                bw.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Writes given alignment to a VCF file
     *
     * @param gt
     * @param filename
     *
     * @return
     */
    public static String writeToVCF(GenotypeTable gt, String filename, boolean keepDepth) {
        return writeToVCF(gt, filename, keepDepth, null);
    }

    public static String writeToVCF(GenotypeTable gt, String filename, boolean keepDepth, ProgressListener listener) {
        final char delimChar = '\t';
        boolean hasDepth = gt.hasDepth() && keepDepth;
        try {

            filename = Utils.addSuffixIfNeeded(filename, ".vcf", new String[]{".vcf", ".vcf.gz"});
            BufferedWriter bw = Utils.getBufferedWriter(filename);
            bw.write("##fileformat=VCFv4.0");
            bw.newLine();
            if (!gt.hasReference()) {
                bw.write("##Tassel=");
                bw.newLine();
            }
            bw.write("##FORMAT=");
            bw.newLine();
            bw.write("##FORMAT=");
            bw.newLine();
            bw.write("##FORMAT=");
            bw.newLine();
            bw.write("##FORMAT=");
            bw.newLine();
            bw.write("##FORMAT=");
            bw.newLine();
            bw.write("##INFO=");
            bw.newLine();
            bw.write("##INFO=");
            bw.newLine();
            bw.write("##INFO=");
            bw.newLine();
            writeVCFSampleAnnotationToWriter(gt, bw);
            bw.write("#CHROM" + delimChar + "POS" + delimChar + "ID" + delimChar + "REF" + delimChar + "ALT" + delimChar + "QUAL" + delimChar + "FILTER" + delimChar + "INFO" + delimChar + "FORMAT");
            for (int taxa = 0; taxa < gt.numberOfTaxa(); taxa++) {
                String taxonName = gt.taxaName(taxa).trim();
                bw.write(delimChar + taxonName);
            }
            bw.newLine();

            int noAlleles = 0;
            for (int site = 0; site < gt.numberOfSites(); site++) {
                int numSites = gt.numberOfSites();
                Position p = gt.positions().get(site);
                String[] knownVariants = p.getKnownVariants();
                byte refAllele = p.getAllele(WHICH_ALLELE.Reference);
                int[] sortedAlleles = gt.allelesSortedByFrequency(site)[0]; // which alleles are actually present among the genotypes


                //ZRM22 March 18 2016 move to add reference into sortedAlleles array if its missing
                int[] sortedAllelesTemp = VCFUtil.resolveRefSorted(sortedAlleles, refAllele);

                //ZRM22 June 6 2016 fix variants with ref allele


                sortedAlleles = sortedAllelesTemp;
                //ZRM22 Jan7 Remake
                //If knownVariants.length is greater than 0 its either from a VCF file or Hapmap
                if (knownVariants.length > 0) {

                    //ReOrder based on variant alleles
                    //Store a tempSortedAlleles so we can appropriately handle hapmap to vcf
                    //int[] tempSortedAlleles = new int[knownVariants.length];

                    //ArrayList to hold the Sorted Alleles Indices Temporarily as the ordering will change
                    ArrayList tempSortedAlleles = new ArrayList();

                    //Loop through all the knownVariants and check to see if we have an indel
                    boolean knownVariantIndel = VCFUtil.indelInKnownVariant(knownVariants);


                    //If we do have an indel, we can add the variants after picking off the first character to the tempSortedAlleles
                    if (knownVariantIndel) {
                        //Loop through the variants
                        for (int i = 0; i < knownVariants.length; i++) {
                            //Pull off the first character if it exists
                            if (knownVariants[i].length() > 1) {
                                String parsedVariant = knownVariants[i].substring(1);
                                tempSortedAlleles.add((int) NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariant.charAt(0)));
                            } else {
                                //Mark as deletion
                                tempSortedAlleles.add((int) NucleotideAlignmentConstants.getNucleotideAlleleByte('-'));
                            }
                        }
                    } else {
                        //If we dont have an indel, we can add it to the allele array
                        if (sortedAlleles.length < knownVariants.length) {
                            //Clear it out, we probably dont need to do this
                            tempSortedAlleles = new ArrayList();
                        }
                        int nIndex = -1;
                        for (int i = 0; i < knownVariants.length; i++) {
                            //ZRM22 Mar 22
                            if (knownVariants[i].charAt(0) != 'N') {
                                tempSortedAlleles.add((int) NucleotideAlignmentConstants.getNucleotideAlleleByte(knownVariants[i].charAt(0)));
                            } else {
                                //If N is in our known Variants list but we do not have an indel, we need to remove it
                                nIndex = i;
                            }
                        }
                        if (nIndex != -1) {
                            //if we have an N we need to resize KnownVariants
                            String[] knownVariantsSmall = new String[knownVariants.length - 1];
                            for (int i = 0; i < knownVariants.length; i++) {
                                if (i < nIndex) {
                                    knownVariantsSmall[i] = knownVariants[i];
                                } else if (i > nIndex) {
                                    knownVariantsSmall[i - 1] = knownVariants[i];
                                }
                            }
                            knownVariants = knownVariantsSmall;
                        }
                    }
                    //END ZRM22 Jan7

                    //Make a copy of KnownVaraints in case we need to add some
                    ArrayList knownVariantsList = new ArrayList();
                    boolean indelsExist = false;
                    boolean indelsInKnownVariants = VCFUtil.indelInKnownVariant(knownVariants);
                    if (indelsInKnownVariants) {
                        indelsExist = true;
                    }

                    //Go through sorted alleles and also check for indels
                    for (int i = 0; i < sortedAlleles.length; i++) {
                        if (NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[i]).equals("-")) {
                            indelsExist = true;
                        }
                    }
                    //Move To Function/

                    for (String variant : knownVariants) {
                        if (indelsExist && !indelsInKnownVariants) {
                            knownVariantsList.add("N" + variant);
                        } else {
                            knownVariantsList.add(variant);
                        }
                    }
                    //ZRM Jun6 fix to force Ref annotated alleles to stay in REF for export
                    //Need to reorder the variants based on the original sorting
                    ArrayList sortedAllelesList = new ArrayList();
                    HashMap sortedAlleleKnownVariantMap = new HashMap();
                    for (int i = 0; i < sortedAlleles.length; i++) {
                        //Add it to the new sorted list
                        sortedAllelesList.add(sortedAlleles[i]);
                        if (!tempSortedAlleles.contains(sortedAlleles[i])) {
                            //Check for an indel
                            if (indelsExist) {
                                if (NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[i]).equals("-")) {
                                    //Add an Entry to the sortedAllele, knownVariant mapping
                                    sortedAlleleKnownVariantMap.put(sortedAlleles[i], "N");
                                } else {
                                    sortedAlleleKnownVariantMap.put(sortedAlleles[i], NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[i]));
                                }
                            } else {
                                sortedAlleleKnownVariantMap.put(sortedAlleles[i], NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[i]));
                            }
                        } else {
                            //Find the index in tempSortedAlleles
                            int variantIndex = tempSortedAlleles.indexOf(sortedAlleles[i]);
                            //Use it to get the correct KnownVariant
                            sortedAlleleKnownVariantMap.put(sortedAlleles[i], knownVariants[variantIndex]);
                        }
                    }
                    //loop through tempSortedAlleles and make sure we have them all
                    //Else add to the end
                    for (int i = 0; i < tempSortedAlleles.size(); i++) {
                        if (!sortedAllelesList.contains(tempSortedAlleles.get(i))) {
                            sortedAllelesList.add(tempSortedAlleles.get(i));
                            sortedAlleleKnownVariantMap.put(tempSortedAlleles.get(i), knownVariantsList.get(i));
                        }
                    }
                    int[] sortedAllelesExtended = new int[sortedAllelesList.size()];
                    for (int i = 0; i < sortedAllelesExtended.length; i++) {
                        sortedAllelesExtended[i] = sortedAllelesList.get(i);
                    }
                    sortedAlleles = sortedAllelesExtended;

                    String[] knownVariantsExtended = new String[sortedAllelesList.size()];
                    for (int i = 0; i < knownVariantsExtended.length; i++) {
                        knownVariantsExtended[i] = sortedAlleleKnownVariantMap.get(sortedAllelesList.get(i));
                    }
                    knownVariants = knownVariantsExtended;
                    //TODO Cleanup
//                    //Go through sorted alleles
//                    for(int i = 0 ;i alleleRedirectMap = new HashMap();
                String[] alleleRedirect = new String[16];
                Arrays.fill(alleleRedirect, ".");
                for (int i = 0; i < sortedAlleles.length; i++) {
                    alleleRedirect[sortedAlleles[i]] = "" + i;
                    alleleRedirectMap.put(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[i]), i);
                }

                bw.write(gt.chromosomeName(site)); // chromosome
                bw.write(delimChar);
                bw.write(gt.chromosomalPosition(site) + ""); // position
                bw.write(delimChar);
                bw.write(gt.siteName(site)); // site name
                bw.write(delimChar);
                if (nAlleles == 0) {                                                  //used to be ==0
                    //System.out.println("A0:"+gt.chromosomeName(site)+":"+gt.chromosomalPosition(site));
                    noAlleles++;
                    bw.write(".\t.\t.\tPASS\t.\tGT");
                    for (int taxa = 0; taxa < gt.numberOfTaxa(); taxa++) {
                        bw.write("\t./.");
                    }
                    bw.newLine();
                    continue;
                }
                //bw.write(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[0])); // ref allele
                //Fix for indels 8_27
                if (knownVariants.length == 0) {
                    bw.write(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[0])); // ref allele
                } else {
                    bw.write(knownVariants[0]);
                }
                bw.write(delimChar);

                StringBuilder altAllelesBuilder = new StringBuilder("");

                //ZRM 8_27
                String altString = "";
                int indelIndex = -1;

                if (knownVariants.length == 0 || knownVariants.length < sortedAlleles.length) {
                    ArrayList altAlleles = new ArrayList();
                    for (int aa = 1; aa < sortedAlleles.length; aa++) {
                        //Ramu Fix
                        //altAlleles.add(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[aa]));
                        //UNCOMMENT BEFORE COMMIT
                        if (NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[aa]) != "-") {
                            altAlleles.add(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[aa]));
                        } else {
                            indelIndex = aa;
                        }
                    }
                    altString = altAlleles.stream().collect(Collectors.joining(","));
                } else {
                    altString = Arrays.stream(knownVariants, 1, knownVariants.length).collect(Collectors.joining(","));
                }

                if (altString.length() == 0) {
                    altString = ".";
                }

                ////bw.write(altAllelesBuilder.toString()); // alt alleles
                bw.write(altString);
                bw.write(delimChar);

                bw.write("."); // qual score
                bw.write(delimChar);

                bw.write("PASS"); // filter
                bw.write(delimChar);

                //INFO
                GeneralAnnotation ga = p.getAnnotation();
                String annotationHolder = ga.getAnnotationKeys().stream().sorted()
                        .filter(k -> !k.equals("VARIANT"))
                        .filter(k -> !(k.equals("DP") && hasDepth)) //Get rid of the DP tag if we have depth in the genotype table already.  
                        .map(key -> {
                            String[] annos = ga.getTextAnnotation(key);
                            if (annos[0].equals("TRUE")) return key;
                            return key + Arrays.stream(annos).collect(Collectors.joining(",", "=", ""));
                        })
                        .collect(Collectors.joining(";"));
                if (hasDepth) {
                    //bw.write("DP=" + gt.depth().depthForSite(site)); // DP
                    //To Fix bug where ";DP=100" string would occur
                    if (annotationHolder.equals("")) {
                        annotationHolder += "DP=" + gt.depth().depthForSite(site);
                    } else if (annotationHolder.equals(".")) {
                        //Fix bug where we have .;DP=100 showing up.
                        annotationHolder = "DP=" + gt.depth().depthForSite(site);
                    } else {
                        annotationHolder += ";DP=" + gt.depth().depthForSite(site);
                    }
                }
                if (!annotationHolder.equals("")) {
                    bw.write(annotationHolder);
                } else {
                    bw.write("."); // DP
                }
                bw.write(delimChar);

                if (hasDepth) {
                    bw.write("GT:AD:DP:GQ:PL");
                } else {
                    bw.write("GT");
                }
                for (int taxa = 0; taxa < gt.numberOfTaxa(); taxa++) {
                    bw.write(delimChar);
                    // GT = genotype
                    byte[] values = gt.genotypeArray(taxa, site);
                    if (knownVariants.length > 0) {
                        bw.write(alleleRedirect[values[0]] + "/" + alleleRedirect[values[1]]);
                    } else {
                        //Ramu Fix
//                        if(alleleRedirect[values[0]].equals(".")) {
//                            bw.write(alleleRedirect[values[0]]);
//                        }
//                        else {
//                            if(indelIndex != -1 && Integer.parseInt(alleleRedirect[values[0]]) > indelIndex) {
//                                bw.write(""+(Integer.parseInt(alleleRedirect[values[0]]) -1 ));
//                            }
//                            else {
//                                bw.write(alleleRedirect[values[0]]);
//                            }
//                        }

                        //handle if no Known Variants(from a different file type)
                        if (NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) values[0]).equals("-")) {
                            //TODO handle Missing better
                            bw.write(".");
                        } else {
                            if (alleleRedirect[values[0]].equals(".")) {
                                bw.write(alleleRedirect[values[0]]);
                            } else {
                                if (indelIndex != -1 && Integer.parseInt(alleleRedirect[values[0]]) > indelIndex) {
                                    bw.write("" + (Integer.parseInt(alleleRedirect[values[0]]) - 1));
                                } else {
                                    bw.write(alleleRedirect[values[0]]);
                                }
                            }
                        }

                        bw.write("/");

                        //Ramu Fix
//                        if(alleleRedirect[values[1]].equals(".")) {
//                            bw.write(alleleRedirect[values[1]]);
//                        }
//                        else {
//                            if(indelIndex != -1 && Integer.parseInt(alleleRedirect[values[1]]) > indelIndex) {
//                                bw.write(""+(Integer.parseInt(alleleRedirect[values[1]]) -1));
//                            }
//                            else {
//                                bw.write(alleleRedirect[values[1]]);
//                            }
//                        }

                        if (NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) values[1]).equals("-")) {
                            bw.write(".");
                        } else {
                            if (alleleRedirect[values[1]].equals(".")) {
                                bw.write(alleleRedirect[values[1]]);
                            } else {
                                if (indelIndex != -1 && Integer.parseInt(alleleRedirect[values[1]]) > indelIndex) {
                                    bw.write("" + (Integer.parseInt(alleleRedirect[values[1]]) - 1));
                                } else {
                                    bw.write(alleleRedirect[values[1]]);
                                }
                            }
                        }
                    }
                    if (!(hasDepth)) {
                        continue;
                    }
                    bw.write(":");

                    // AD
                    int[] siteAlleleDepths = gt.depthForAlleles(taxa, site);
                    int siteTotalDepth = 0;

                    ArrayList depthsList = new ArrayList();
                    //Fix missing commas in depth information
                    for (int ss = 0; ss < sortedAlleles.length; ss++) {
                        if (ss != indelIndex && sortedAlleles[ss] < siteAlleleDepths.length) {
                            try {
                                depthsList.add(siteAlleleDepths[sortedAlleles[ss]]);
                                siteTotalDepth += siteAlleleDepths[sortedAlleles[ss]];
                            } catch (Exception e) {
                                System.out.println(Arrays.toString(alleleRedirect));
                                System.out.println(altString);
                                System.out.println(Arrays.toString(siteAlleleDepths));
                                System.out.println(Arrays.toString(sortedAlleles));
                                System.out.println(ss);
                                throw e;
                            }
                            //TODO Cleanup
//                            depthsList.add(AlleleDepthUtil.depthByteToInt((byte)siteAlleleDepths[sortedAlleles[ss]]));
//                            siteTotalDepth += AlleleDepthUtil.depthByteToInt((byte)siteAlleleDepths[sortedAlleles[ss]]);
                        }
                    }

                    bw.write(depthsList.stream().map((depth) -> "" + depth).collect(Collectors.joining(",")));
//
//                    for (int ss = 0; ss < sortedAlleles.length; ss++) {
//                        //bw.write("" + AlleleDepthUtil.decode(siteAlleleDepths[sortedAlleles[ss]]));
//                        if(ss!=indelIndex) {
//                            bw.write("" + siteAlleleDepths[sortedAlleles[ss]]);
//                            if (ss < sortedAlleles.length - 1 && ss+1!=indelIndex) {
//                                bw.write(',');
//                            }
//                            siteTotalDepth += siteAlleleDepths[sortedAlleles[ss]];
//                        }
//
//                    }
                    bw.write(":");
                    // DP
                    bw.write(siteTotalDepth + "");

                    int[] scores = new int[]{-1, -1, -1, -1};
                    if (values[0] != GenotypeTable.UNKNOWN_ALLELE) {
                        int altDepth = (sortedAlleles.length < 2 || sortedAlleles[1] >= siteAlleleDepths.length) ? 0 : siteAlleleDepths[sortedAlleles[1]];
                        altDepth = (altDepth < 0) ? 0 : altDepth;
                        //int refDepth = (siteAlleleDepths[sortedAlleles[0]]==-1) ? 0 : siteAlleleDepths[sortedAlleles[0]];

                        //Check to see if either the major or alt allele has depth
                        if (siteAlleleDepths[sortedAlleles[0]] >= 0 && altDepth >= 0) {
                            scores = VCFUtil.getScore(siteAlleleDepths[sortedAlleles[0]], altDepth);
                            bw.write(":");
                            // GQ
                            bw.write(scores[3] + "");
                            bw.write(":");
                            // PL
                            int k = sortedAlleles.length - 1;
                            int[] fullPL = new int[(k * (k + 1) / 2) + k + 1];


                            //Set all the values to 255 as Higher PL means its less likely to be correct
                            //Zero PL means the probability of error is 0
                            Arrays.fill(fullPL, 255);

                            //Leaving these indicies in expanded form so we know its correct
                            //it should really just be in positions 0,1, and 2 regardless of number of sites
                            //(k*(k+1)/2)+j
                            //If we only have 1 allele we should only have 1 likelihood
                            if (fullPL.length == 1) {
                                fullPL[0] = scores[0];
                            } else {
                                fullPL[(0 * (0 + 1) / 2) + 0] = scores[0];
                                fullPL[(1 * (1 + 1) / 2) + 0] = scores[1];
                                fullPL[(1 * (1 + 1) / 2) + 1] = scores[2];
                            }
                            for (int i = 0; i < fullPL.length - 1; i++) {
                                bw.write(fullPL[i] + ",");
                            }
                            bw.write("" + fullPL[fullPL.length - 1]);

//                            //Leaving these indicies in expanded form so we know its correct
//                            //it should really just be in positions 0,1, and 2 regardless of number of sites
//                            //(k*(k+1)/2)+j
//                            fullPL[(0 * (0 + 1)/2) + 0] = scores[0];
//                            fullPL[(1 * (1 + 1)/2) + 0] = scores[1];
//                            fullPL[(1 * (1 + 1)/2) + 1] = scores[2];
//                            for(int i = 0; i < fullPL.length-1; i++) {
//                                bw.write(fullPL[i] + ",");
//                            }
//                            bw.write(""+fullPL[fullPL.length-1]);
//                            //
//                            //bw.write(scores[0] + "," + scores[1] + "," + scores[2]);
                        }
                    }
//                    else {
//                        //If unknown just write out :0:0:0,0,0
//                        bw.write(":0:0,0,0");
//                    }
                }
                bw.newLine();
                if (listener != null) {
                    listener.progress((int) (((double) (site + 1) / (double) numSites) * 100.0), null);
                }
            }
            if (noAlleles > 0) {
                myLogger.warn("Warning: " + noAlleles + " sites have no alleles.");
            }
            bw.flush();
            bw.close();
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalArgumentException("Error writing VCF file: " + filename + ": " + ExceptionUtils.getExceptionCauses(e));
        }
        return filename;
    }

    private static void writeVCFSampleAnnotationToWriter(GenotypeTable gt, BufferedWriter bw) throws IOException {
        for (Taxon taxon : gt.taxa()) {
            GeneralAnnotation annotation = taxon.getAnnotation();
            if ((annotation == null) || (annotation.numAnnotations() == 0)) {
                continue;
            }
            Multimap annoMap = taxon.getAnnotation().getAnnotationAsMap();
            String annoString = Joiner.on(',').withKeyValueSeparator("=").join(annoMap.entries());
            bw.write("##SAMPLE=");
            bw.newLine();
        }
    }

    /**
     * Writes given set of alignments to a set of Plink files
     *
     * @param alignment
     * @param filename
     * @param delimChar
     */
    public static String writeToPlink(GenotypeTable alignment, String filename, char delimChar) {
        if (delimChar != ' ' && delimChar != '\t') {
            throw new IllegalArgumentException("Delimiter charater must be either a blank space or a tab.");
        }

        BufferedWriter MAPbw = null;
        BufferedWriter PEDbw = null;
        String mapFileName = Utils.addSuffixIfNeeded(filename, ".plk.map");
        String pedFileName = Utils.addSuffixIfNeeded(filename, ".plk.ped");
        try {
            MAPbw = new BufferedWriter(new FileWriter(mapFileName), 1000000);
            int numSites = alignment.numberOfSites();
            for (int site = 0; site < numSites; site++) {
                MAPbw.write(alignment.chromosomeName(site)); // chromosome name
                MAPbw.write(delimChar);
                MAPbw.write(alignment.siteName(site)); // rs#
                MAPbw.write(delimChar);
                MAPbw.write("-9"); // genetic distance unavailable
                MAPbw.write(delimChar);
                MAPbw.write(Integer.toString(alignment.chromosomalPosition(site))); // position
                MAPbw.write("\n");
            }
            MAPbw.close();

            PEDbw = new BufferedWriter(new FileWriter(pedFileName), 1000000);
            // Compiled : Pattern
            Pattern splitter = Pattern.compile(":");
            int numTaxa = alignment.numberOfTaxa();
            for (int taxa = 0; taxa < numTaxa; taxa++) {
                String[] name = splitter.split(alignment.taxaName(taxa).trim());
                if (name.length != 1) {
                    PEDbw.write(name[1]); // namelvl 1 if is available
                } else {
                    PEDbw.write("-9");
                }
                PEDbw.write(delimChar);
                PEDbw.write(alignment.taxaName(taxa).trim()); // namelvl 0
                PEDbw.write(delimChar);
                PEDbw.write("-9"); // paternal ID unavailable
                PEDbw.write(delimChar);
                PEDbw.write("-9"); // maternal ID unavailable
                PEDbw.write(delimChar);
                PEDbw.write("-9"); // gender is both
                PEDbw.write(delimChar);
                PEDbw.write("-9"); // phenotype unavailable, might have to change to "-9" for missing affection status
                PEDbw.write(delimChar);
                for (int site = 0; site < numSites; site++) {
                    String[] b = getSNPValueForPlink(alignment.genotypeAsStringArray(taxa, site));
                    PEDbw.write(b[0]);
                    PEDbw.write(delimChar);
                    PEDbw.write(b[b.length - 1]);
                    if (site != numSites - 1) {
                        PEDbw.write(delimChar);
                    }
                }
                PEDbw.write("\n");
            }
            PEDbw.close();
            return mapFileName + " and " + pedFileName;
        } catch (Exception e) {
            myLogger.error("Error writing Plink files: " + mapFileName + " and " + pedFileName + ": " + ExceptionUtils.getExceptionCauses(e));
            throw new IllegalArgumentException("Error writing Plink files: " + mapFileName + " and " + pedFileName + ": " + ExceptionUtils.getExceptionCauses(e));
        } finally {
            try {
                PEDbw.close();
            } catch (Exception e) {
                // do nothing
            }
            try {
                MAPbw.close();
            } catch (Exception e) {
                // do nothing
            }
        }
    }

    private static String[] getSNPValueForPlink(String[] base) {
        for (int i = 0; i < base.length; i++) {
            if (base[i].equals("N")) {
                base[i] = "0";
            } else if (base[i].equals("0")) {
                base[i] = "D";
            }
        }
        return base;
    }

    public static String saveDelimitedAlignment(GenotypeTable theAlignment, String delimit, String saveFile) {

        if ((saveFile == null) || (saveFile.length() == 0)) {
            return null;
        }
        saveFile = Utils.addSuffixIfNeeded(saveFile, ".txt");
        FileWriter fw = null;
        BufferedWriter bw = null;
        try {

            fw = new FileWriter(new File(saveFile));
            bw = new BufferedWriter(fw);

            bw.write("Taxa");
            int numSites = theAlignment.numberOfSites();
            for (int j = 0; j < numSites; j++) {
                bw.write(delimit);
                bw.write(String.valueOf(theAlignment.chromosomalPosition(j)));
            }
            bw.write("\n");

            for (int r = 0, n = theAlignment.numberOfTaxa(); r < n; r++) {
                bw.write(theAlignment.taxaName(r));
                for (int i = 0; i < numSites; i++) {
                    bw.write(delimit);
                    bw.write(theAlignment.genotypeAsString(r, i));
                }
                bw.write("\n");
            }

            return saveFile;

        } catch (Exception e) {
            myLogger.error("Error writing Delimited Alignment: " + saveFile + ": " + ExceptionUtils.getExceptionCauses(e));
            throw new IllegalArgumentException("Error writing Delimited Alignment: " + saveFile + ": " + ExceptionUtils.getExceptionCauses(e));
        } finally {
            try {
                bw.close();
                fw.close();
            } catch (Exception e) {
                // do nothing
            }
        }

    }

    /**
     * print alignment (in PHYLIP SEQUENTIAL format)
     */
    public static void printSequential(GenotypeTable a, Writer out) throws IOException {
        // PHYLIP header line
        out.write("  " + a.numberOfTaxa() + " " + a.numberOfSites() + "  S" + "\n");

        // Print sequences
        for (int s = 0; s < a.numberOfTaxa(); s++) {
            int n = 0;
            while (n < a.numberOfSites()) {
                if (n == 0) {
                    format.displayLabel(out, a.taxaName(s), 10);
                    out.write("     ");
                } else {
                    out.write("               ");
                }
                printNextSites(a, out, false, s, n, 50);
                out.write("\n");
                n += 50;
            }
        }
    }

    /**
     * print alignment (in PHYLIP 3.4 INTERLEAVED format)
     */
    public static void printInterleaved(GenotypeTable a, Writer out) throws IOException {
        int n = 0;

        // PHYLIP header line
        out.write("  " + a.numberOfTaxa() + " " + a.numberOfSites() + "\n");

        // Print sequences
        while (n < a.numberOfSites()) {
            for (int s = 0; s < a.numberOfTaxa(); s++) {
                if (n == 0) {
                    format.displayLabel(out, a.taxaName(s), 10);
                    out.write("     ");
                } else {
                    out.write("               ");
                }
                printNextSites(a, out, true, s, n, 50);
                out.write("\n");
            }
            out.write("\n");
            n += 50;
        }
    }

    /**
     * Print alignment (in CLUSTAL W format)
     */
    public static void printCLUSTALW(GenotypeTable a, Writer out) throws IOException {
        int n = 0;

        // CLUSTAL W header line
        out.write("CLUSTAL W multiple sequence alignment\n\n");

        // Print sequences
        while (n < a.numberOfSites()) {
            out.write("\n");
            for (int s = 0; s < a.numberOfTaxa(); s++) {
                format.displayLabel(out, a.taxaName(s), 10);
                out.write("     ");

                printNextSites(a, out, false, s, n, 50);
                out.write("\n");
            }
            // Blanks in status line are necessary for some parsers)
            out.write("               \n");
            n += 50;
        }
    }

    private static void printNextSites(GenotypeTable a, Writer out, boolean chunked, int seq, int start, int num) throws IOException {
        // Print next num characters
        for (int i = 0; (i < num) && (start + i < a.numberOfSites()); i++) {
            // Chunks of 10 characters
            if (i % 10 == 0 && i != 0 && chunked) {
                out.write(' ');
            }
            out.write(a.genotypeAsString(seq, start + i));
        }
    }

    public static String writeAlignmentToSerialGZ(GenotypeTable sba, String outFile) {

        long time = System.currentTimeMillis();

        File theFile = null;
        FileOutputStream fos = null;
        GZIPOutputStream gz = null;
        ObjectOutputStream oos = null;
        try {
            theFile = new File(Utils.addSuffixIfNeeded(outFile, ".serial.gz"));
            fos = new FileOutputStream(theFile);
            gz = new GZIPOutputStream(fos);
            oos = new ObjectOutputStream(gz);
            oos.writeObject(sba);
            return theFile.getName();
        } catch (Exception e) {
            e.printStackTrace();
            myLogger.error("Error writing Serial GZ: " + theFile.getName() + ": " + ExceptionUtils.getExceptionCauses(e));
            throw new IllegalArgumentException("Error writing Serial GZ: " + theFile.getName() + ": " + ExceptionUtils.getExceptionCauses(e));
        } finally {
            try {
                oos.flush();
                oos.close();
                gz.close();
                fos.close();
            } catch (Exception e) {
                // do nothing
            }
            myLogger.info("writeAlignmentToSerialGZ: " + theFile.toString() + "  Time: " + (System.currentTimeMillis() - time));
        }

    }

    /**
     * Simple method to export a GenotypeTable to a fasta file.
     *
     * @param gt Import GenotypeTable which we want to export.
     * @param out FileWriter which will be used to export the FASTA file.
     */
    public static void writeFasta(GenotypeTable gt, Writer out) {
        try {
            TaxaList tl = gt.taxa();
            for (int i = 0; i < tl.size(); i++) {
                out.write(">");
                out.write(tl.get(i).getName());
                out.write("\n");
                for (int j = 0; j < gt.positions().size(); j++) {
                    byte call = gt.genotype(i, j);
                    out.write(NucleotideAlignmentConstants.getNucleotideIUPAC(call));
                }
                out.write("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
            myLogger.error("Error writing FASTA file: " + ExceptionUtils.getExceptionCauses(e));
            throw new IllegalArgumentException("Error writing FastaAlignment: " + ExceptionUtils.getExceptionCauses(e));
        }
    }

    /**
     * Simple method to export a GenotypeTable to a fasta file while removing Gap characters.
     *
     * @param gt GenotypeTable which we want to export.
     * @param out FileWriter which will be used to export the FASTA file.
     */
    public static void writeFastaNoGaps(GenotypeTable gt, Writer out) {
        try {
            TaxaList tl = gt.taxa();
            for (int i = 0; i < tl.size(); i++) {
                out.write(">");
                out.write(tl.get(i).getName());
                out.write("\n");
                for (int j = 0; j < gt.positions().size(); j++) {
                    byte call = gt.genotype(i, j);
                    if (!NucleotideAlignmentConstants.getNucleotideIUPAC(call).equals("-")) {
                        out.write(NucleotideAlignmentConstants.getNucleotideIUPAC(call));
                    }
                }
                out.write("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
            myLogger.error("Error writing FASTA file: " + ExceptionUtils.getExceptionCauses(e));
            throw new IllegalArgumentException("Error writing FastaAlignment: " + ExceptionUtils.getExceptionCauses(e));
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy