
net.maizegenetics.dna.snp.ExportUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
/*
* ExportUtils
*/
package net.maizegenetics.dna.snp;
import com.google.common.base.Joiner;
import com.google.common.collect.Multimap;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.io.VCFUtil;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.*;
import org.apache.log4j.Logger;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.GZIPOutputStream;
import net.maizegenetics.dna.snp.genotypecall.AlleleFreqCache;
/**
* Exports Genotype Tables to various file formats.
*
* @author Jon Zhang
* @author Terry Casstevens
* @author Ed Buckler
*/
public class ExportUtils {
private static final Logger myLogger = Logger.getLogger(ExportUtils.class);
private static FormattedOutput format = FormattedOutput.getInstance();
private ExportUtils() {
// Utility Class - do not instantiate.
}
public static String writeGenotypeHDF5(GenotypeTable a, String newHDF5file) {
return writeGenotypeHDF5(a, newHDF5file, null, true);
}
public static String writeGenotypeHDF5(GenotypeTable a, String newHDF5file, boolean keepDepth) {
return writeGenotypeHDF5(a, newHDF5file, null, keepDepth);
}
/**
* Exports a alignment into the Byte HDF5 format.
*
* @param a alignment to be exported
* @param newHDF5file filename for the new file (should end with "hmp.h5")
* @param exportTaxa subset of taxa (if null exports ALL taxa)
* @return
*/
public static String writeGenotypeHDF5(GenotypeTable a, String newHDF5file, TaxaList exportTaxa, boolean keepDepth) {
GenotypeTableBuilder aB = GenotypeTableBuilder.getTaxaIncremental(a.positions(), newHDF5file);
if ((exportTaxa != null) && (exportTaxa.numberOfTaxa() == 0)) {
aB.build();
return newHDF5file;
}
for (int t = 0; t < a.numberOfTaxa(); t++) {
if ((exportTaxa != null) && (!exportTaxa.contains(a.taxa().get(t)))) {
continue; //taxon not in export list
}
byte[] bases = a.genotypeAllSites(t);
if (a.hasDepth() == false || keepDepth == false) {
aB.addTaxon(a.taxa().get(t), bases, null);
} else {
aB.addTaxon(a.taxa().get(t), bases, a.depth().valuesForTaxonByte(t));
}
}
aB.build();
return newHDF5file;
}
/**
* Write a GenotypeTable to HapMap format with standard settings - unphased
* single character, tab delimiter, and no progress tracking.
*
* @param alignment genotype table
* @param filename outfile name (will add ".hmp.txt" if needed)
* @return name of the outfile with the appropriate suffix
*/
public static String writeToHapmap(GenotypeTable alignment, String filename) {
return writeToHapmap(alignment, false, filename, '\t', null);
}
/**
* Write a GenotypeTable to HapMap format.
*
* @param alignment genotype table
* @param diploid true uses phased two letter encoding, false one letter
* unphased
* @param filename outfile name (will add ".hmp.txt" if needed)
* @param delimChar delimiter character normally tab
* @param listener progress listener, (null if unneeded)
* @return name of the outfile with the appropriate suffix
*/
public static String writeToHapmap(GenotypeTable alignment, boolean diploid, String filename, char delimChar, ProgressListener listener) {
return writeToHapmap(alignment, diploid, filename, delimChar, true, listener);
}
public static String writeToHapmap(GenotypeTable alignment, boolean diploid, String filename, char delimChar, boolean includeTaxaAnnotations, ProgressListener listener) {
if (delimChar != ' ' && delimChar != '\t') {
throw new IllegalArgumentException("Delimiter charater must be either a blank space or a tab.");
}
BufferedWriter bw = null;
try {
String fullFileName = Utils.addSuffixIfNeeded(filename, ".hmp.txt", new String[]{".hmp.txt", ".hmp.txt.gz"});
bw = Utils.getBufferedWriter(fullFileName);
if (includeTaxaAnnotations) {
for (Taxon taxon : alignment.taxa()) {
GeneralAnnotation annotation = taxon.getAnnotation();
if ((annotation == null) || (annotation.numAnnotations() == 0)) {
continue;
}
bw.write("##SAMPLE=" + taxon.toStringWithVCFAnnotation() + "\n");
}
}
bw.write(Joiner.on(delimChar).join("rs#", "alleles", "chrom", "pos", "strand", "assembly#", "center", "protLSID",
"assayLSID", "panelLSID", "QCcode"));
bw.write(delimChar);
int numTaxa = alignment.numberOfTaxa();
for (int taxa = 0; taxa < numTaxa; taxa++) {
String sequenceID = alignment.taxaName(taxa).trim();
bw.write(sequenceID);
if (taxa != numTaxa - 1) {
bw.write(delimChar);
}
}
bw.write("\n");
int numSites = alignment.numberOfSites();
for (int site = 0; site < numSites; site++) {
bw.write(alignment.siteName(site));
bw.write(delimChar);
byte[] genotypes = alignment.genotypeAllTaxa(site);
// which alleles are present among the genotypes
int[][] sortedAlleles = AlleleFreqCache.allelesSortedByFrequencyNucleotide(genotypes);
int numAlleles = sortedAlleles[0].length;
if (numAlleles == 0) {
bw.write("NA"); //if data does not exist
} else if (numAlleles == 1) {
bw.write(alignment.genotypeAsString(site, (byte) sortedAlleles[0][0]));
} else {
bw.write(alignment.genotypeAsString(site, (byte) sortedAlleles[0][0]));
for (int allele = 1; allele < sortedAlleles[0].length; allele++) {
if (sortedAlleles[0][allele] != GenotypeTable.UNKNOWN_ALLELE) {
bw.write('/');
bw.write(alignment.genotypeAsString(site, (byte) sortedAlleles[0][allele])); // will write out a third allele if it exists
}
}
}
bw.write(delimChar);
bw.write(Joiner.on(delimChar).join(alignment.chromosomeName(site), String.valueOf(alignment.chromosomalPosition(site)),
"+", "NA", "NA", "NA", "NA", "NA", "NA"));
bw.write(delimChar);
for (int taxa = 0; taxa < numTaxa; taxa++) {
if (diploid == false) {
String baseIUPAC = null;
try {
baseIUPAC = alignment.diploidAsString(site, genotypes[taxa]);
} catch (Exception e) {
String[] b = alignment.genotypeAsStringArray(taxa, site);
bw.close();
myLogger.debug(e.getMessage(), e);
throw new IllegalArgumentException("There is no String representation for diploid values: " + b[0] + ":" + b[1] + " getBase(): 0x" + Integer.toHexString(alignment.genotype(taxa, site)) + "\nTry Exporting as Diploid Values.");
}
if ((baseIUPAC == null) || baseIUPAC.equals("?")) {
String[] b = alignment.genotypeAsStringArray(taxa, site);
bw.close();
throw new IllegalArgumentException("There is no String representation for diploid values: " + b[0] + ":" + b[1] + " getBase(): 0x" + Integer.toHexString(alignment.genotype(taxa, site)) + "\nTry Exporting as Diploid Values.");
}
bw.write(baseIUPAC);
} else {
byte[] temp = GenotypeTableUtils.getDiploidValues(genotypes[taxa]);
bw.write(alignment.genotypeAsString(site, temp[0]));
bw.write(alignment.genotypeAsString(site, temp[1]));
}
if (taxa != (numTaxa - 1)) {
bw.write(delimChar);
}
}
bw.write("\n");
if (listener != null) {
listener.progress((int) (((double) (site + 1) / (double) numSites) * 100.0), null);
}
}
return fullFileName;
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalArgumentException("Error writing Hapmap file: " + filename + ": " + ExceptionUtils.getExceptionCauses(e));
} finally {
try {
bw.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
/**
* Writes given alignment to a VCF file
*
* @param gt
* @param filename
* @return
*/
public static String writeToVCF(GenotypeTable gt, String filename, boolean keepDepth) {
return writeToVCF(gt,filename,keepDepth,null);
}
public static String writeToVCF(GenotypeTable gt, String filename, boolean keepDepth, ProgressListener listener) {
final char delimChar = '\t';
boolean hasDepth = gt.hasDepth() && keepDepth;
try {
filename = Utils.addSuffixIfNeeded(filename, ".vcf", new String[]{".vcf", ".vcf.gz"});
BufferedWriter bw = Utils.getBufferedWriter(filename);
bw.write("##fileformat=VCFv4.0");
bw.newLine();
if (!gt.hasReference()) {
bw.write("##Tassel=");
bw.newLine();
}
bw.write("##FORMAT=");
bw.newLine();
bw.write("##FORMAT=");
bw.newLine();
bw.write("##FORMAT=");
bw.newLine();
bw.write("##FORMAT=");
bw.newLine();
bw.write("##FORMAT=");
bw.newLine();
bw.write("##INFO=");
bw.newLine();
bw.write("##INFO=");
bw.newLine();
bw.write("##INFO=");
bw.newLine();
writeVCFSampleAnnotationToWriter(gt, bw);
bw.write("#CHROM" + delimChar + "POS" + delimChar + "ID" + delimChar + "REF" + delimChar + "ALT" + delimChar + "QUAL" + delimChar + "FILTER" + delimChar + "INFO" + delimChar + "FORMAT");
for (int taxa = 0; taxa < gt.numberOfTaxa(); taxa++) {
String taxonName = gt.taxaName(taxa).trim();
bw.write(delimChar + taxonName);
}
bw.newLine();
int noAlleles = 0;
for (int site = 0; site < gt.numberOfSites(); site++) {
int numSites = gt.numberOfSites();
Position p = gt.positions().get(site);
String[] knownVariants = p.getKnownVariants();
byte refAllele = p.getAllele(WHICH_ALLELE.Reference);
int[] sortedAlleles = gt.allelesSortedByFrequency(site)[0]; // which alleles are actually present among the genotypes
//ZRM22 March 18 2016 move to add reference into sortedAlleles array if its missing
int[] sortedAllelesTemp = VCFUtil.resolveRefSorted(sortedAlleles, refAllele);
//ZRM22 June 6 2016 fix variants with ref allele
sortedAlleles = sortedAllelesTemp;
//ZRM22 Jan7 Remake
//If knownVariants.length is greater than 0 its either from a VCF file or Hapmap
if(knownVariants.length>0) {
//ReOrder based on variant alleles
//Store a tempSortedAlleles so we can appropriately handle hapmap to vcf
//int[] tempSortedAlleles = new int[knownVariants.length];
//ArrayList to hold the Sorted Alleles Indices Temporarily as the ordering will change
ArrayList tempSortedAlleles = new ArrayList();
//Loop through all the knownVariants and check to see if we have an indel
boolean knownVariantIndel = VCFUtil.indelInKnownVariant(knownVariants);
//If we do have an indel, we can add the variants after picking off the first character to the tempSortedAlleles
if(knownVariantIndel) {
//Loop through the variants
for(int i = 0; i < knownVariants.length; i++) {
//Pull off the first character if it exists
if(knownVariants[i].length()>1) {
String parsedVariant = knownVariants[i].substring(1);
tempSortedAlleles.add((int)NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariant.charAt(0)));
}
else {
//Mark as deletion
tempSortedAlleles.add((int)NucleotideAlignmentConstants.getNucleotideAlleleByte('-'));
}
}
} else {
//If we dont have an indel, we can add it to the allele array
if(sortedAlleles.length();
}
int nIndex = -1;
for(int i = 0; i nIndex) {
knownVariantsSmall[i-1] = knownVariants[i];
}
}
knownVariants = knownVariantsSmall;
}
}
//END ZRM22 Jan7
//Make a copy of KnownVaraints in case we need to add some
ArrayList knownVariantsList = new ArrayList();
boolean indelsExist = false;
boolean indelsInKnownVariants = VCFUtil.indelInKnownVariant(knownVariants);
if(indelsInKnownVariants) {
indelsExist = true;
}
//Go through sorted alleles and also check for indels
for(int i = 0 ;i sortedAllelesList = new ArrayList();
HashMap sortedAlleleKnownVariantMap = new HashMap();
for(int i = 0; i < sortedAlleles.length; i++) {
//Add it to the new sorted list
sortedAllelesList.add(sortedAlleles[i]);
if(!tempSortedAlleles.contains(sortedAlleles[i])) {
//Check for an indel
if(indelsExist) {
if(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte)sortedAlleles[i]).equals("-")) {
//Add an Entry to the sortedAllele, knownVariant mapping
sortedAlleleKnownVariantMap.put(sortedAlleles[i],"N");
}
else {
sortedAlleleKnownVariantMap.put(sortedAlleles[i],NucleotideAlignmentConstants.getHaplotypeNucleotide((byte)sortedAlleles[i]));
}
}
else {
sortedAlleleKnownVariantMap.put(sortedAlleles[i],NucleotideAlignmentConstants.getHaplotypeNucleotide((byte)sortedAlleles[i]));
}
}
else {
//Find the index in tempSortedAlleles
int variantIndex = tempSortedAlleles.indexOf(sortedAlleles[i]);
//Use it to get the correct KnownVariant
sortedAlleleKnownVariantMap.put(sortedAlleles[i],knownVariants[variantIndex]);
}
}
//loop through tempSortedAlleles and make sure we have them all
//Else add to the end
for(int i = 0; i < tempSortedAlleles.size(); i++) {
if(!sortedAllelesList.contains(tempSortedAlleles.get(i))) {
sortedAllelesList.add(tempSortedAlleles.get(i));
sortedAlleleKnownVariantMap.put(tempSortedAlleles.get(i),knownVariantsList.get(i));
}
}
int[] sortedAllelesExtended = new int[sortedAllelesList.size()];
for(int i = 0; i < sortedAllelesExtended.length; i++) {
sortedAllelesExtended[i] = sortedAllelesList.get(i);
}
sortedAlleles = sortedAllelesExtended;
String[] knownVariantsExtended = new String[sortedAllelesList.size()];
for(int i = 0; i < knownVariantsExtended.length; i++) {
knownVariantsExtended[i] = sortedAlleleKnownVariantMap.get(sortedAllelesList.get(i));
}
knownVariants = knownVariantsExtended;
//TODO Cleanup
// //Go through sorted alleles
// for(int i = 0 ;i alleleRedirectMap = new HashMap();
String[] alleleRedirect = new String[16];
Arrays.fill(alleleRedirect, ".");
for (int i = 0; i < sortedAlleles.length; i++) {
alleleRedirect[sortedAlleles[i]] = "" + i;
alleleRedirectMap.put(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[i]), i);
}
bw.write(gt.chromosomeName(site)); // chromosome
bw.write(delimChar);
bw.write(gt.chromosomalPosition(site) + ""); // position
bw.write(delimChar);
bw.write(gt.siteName(site)); // site name
bw.write(delimChar);
if (nAlleles == 0) { //used to be ==0
//System.out.println("A0:"+gt.chromosomeName(site)+":"+gt.chromosomalPosition(site));
noAlleles++;
bw.write(".\t.\t.\tPASS\t.\tGT");
for (int taxa = 0; taxa < gt.numberOfTaxa(); taxa++) {
bw.write("\t./.");
}
bw.newLine();
continue;
}
//bw.write(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[0])); // ref allele
//Fix for indels 8_27
if(knownVariants.length==0) {
bw.write(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte) sortedAlleles[0])); // ref allele
}
else {
bw.write(knownVariants[0]);
}
bw.write(delimChar);
StringBuilder altAllelesBuilder = new StringBuilder("");
//ZRM 8_27
String altString = "";
int indelIndex = -1;
if(knownVariants.length==0 || knownVariants.length altAlleles = new ArrayList();
for(int aa = 1; aa!k.equals("VARIANT"))
.filter(k -> !(k.equals("DP") && hasDepth)) //Get rid of the DP tag if we have depth in the genotype table already.
.map(key->{
String[] annos=ga.getTextAnnotation(key);
if(annos[0].equals("TRUE")) return key;
return key+Arrays.stream(annos).collect(Collectors.joining(",","=",""));
})
.collect(Collectors.joining(";"));
if (hasDepth) {
//bw.write("DP=" + gt.depth().depthForSite(site)); // DP
//To Fix bug where ";DP=100" string would occur
if(annotationHolder.equals("")) {
annotationHolder += "DP=" + gt.depth().depthForSite(site);
}
else if(annotationHolder.equals(".")) {
//Fix bug where we have .;DP=100 showing up.
annotationHolder = "DP="+ gt.depth().depthForSite(site);
}
else {
annotationHolder += ";DP=" + gt.depth().depthForSite(site);
}
}
if(!annotationHolder.equals("")) {
bw.write(annotationHolder);
}
else {
bw.write("."); // DP
}
bw.write(delimChar);
if (hasDepth) {
bw.write("GT:AD:DP:GQ:PL");
} else {
bw.write("GT");
}
for (int taxa = 0; taxa < gt.numberOfTaxa(); taxa++) {
bw.write(delimChar);
// GT = genotype
byte[] values = gt.genotypeArray(taxa, site);
if(knownVariants.length>0) {
bw.write(alleleRedirect[values[0]]+"/"+alleleRedirect[values[1]]);
}
else {
//Ramu Fix
// if(alleleRedirect[values[0]].equals(".")) {
// bw.write(alleleRedirect[values[0]]);
// }
// else {
// if(indelIndex != -1 && Integer.parseInt(alleleRedirect[values[0]]) > indelIndex) {
// bw.write(""+(Integer.parseInt(alleleRedirect[values[0]]) -1 ));
// }
// else {
// bw.write(alleleRedirect[values[0]]);
// }
// }
//handle if no Known Variants(from a different file type)
if(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte)values[0]).equals("-")) {
//TODO handle Missing better
bw.write(".");
}
else {
if(alleleRedirect[values[0]].equals(".")) {
bw.write(alleleRedirect[values[0]]);
}
else {
if(indelIndex != -1 && Integer.parseInt(alleleRedirect[values[0]]) > indelIndex) {
bw.write(""+(Integer.parseInt(alleleRedirect[values[0]]) -1 ));
}
else {
bw.write(alleleRedirect[values[0]]);
}
}
}
bw.write("/");
//Ramu Fix
// if(alleleRedirect[values[1]].equals(".")) {
// bw.write(alleleRedirect[values[1]]);
// }
// else {
// if(indelIndex != -1 && Integer.parseInt(alleleRedirect[values[1]]) > indelIndex) {
// bw.write(""+(Integer.parseInt(alleleRedirect[values[1]]) -1));
// }
// else {
// bw.write(alleleRedirect[values[1]]);
// }
// }
if(NucleotideAlignmentConstants.getHaplotypeNucleotide((byte)values[1]).equals("-")) {
bw.write(".");
}
else {
if(alleleRedirect[values[1]].equals(".")) {
bw.write(alleleRedirect[values[1]]);
}
else {
if(indelIndex != -1 && Integer.parseInt(alleleRedirect[values[1]]) > indelIndex) {
bw.write(""+(Integer.parseInt(alleleRedirect[values[1]]) -1));
}
else {
bw.write(alleleRedirect[values[1]]);
}
}
}
}
if (!(hasDepth)) {
continue;
}
bw.write(":");
// AD
int[] siteAlleleDepths = gt.depthForAlleles(taxa, site);
int siteTotalDepth = 0;
ArrayList depthsList = new ArrayList();
//Fix missing commas in depth information
for(int ss = 0; ss < sortedAlleles.length; ss++) {
if(ss!=indelIndex && sortedAlleles[ss]""+depth).collect(Collectors.joining(",")));
//
// for (int ss = 0; ss < sortedAlleles.length; ss++) {
// //bw.write("" + AlleleDepthUtil.decode(siteAlleleDepths[sortedAlleles[ss]]));
// if(ss!=indelIndex) {
// bw.write("" + siteAlleleDepths[sortedAlleles[ss]]);
// if (ss < sortedAlleles.length - 1 && ss+1!=indelIndex) {
// bw.write(',');
// }
// siteTotalDepth += siteAlleleDepths[sortedAlleles[ss]];
// }
//
// }
bw.write(":");
// DP
bw.write(siteTotalDepth + "");
int[] scores = new int[]{-1, -1, -1, -1};
if (values[0] != GenotypeTable.UNKNOWN_ALLELE) {
int altDepth = (sortedAlleles.length < 2 || sortedAlleles[1]>=siteAlleleDepths.length) ? 0 : siteAlleleDepths[sortedAlleles[1]];
altDepth = (altDepth<0) ? 0 : altDepth;
//int refDepth = (siteAlleleDepths[sortedAlleles[0]]==-1) ? 0 : siteAlleleDepths[sortedAlleles[0]];
//Check to see if either the major or alt allele has depth
if(siteAlleleDepths[sortedAlleles[0]] >= 0 && altDepth >= 0) {
scores = VCFUtil.getScore(siteAlleleDepths[sortedAlleles[0]], altDepth);
bw.write(":");
// GQ
bw.write(scores[3] + "");
bw.write(":");
// PL
int k = sortedAlleles.length - 1;
int[] fullPL = new int[(k * (k+1)/2)+k+1];
//Set all the values to 255 as Higher PL means its less likely to be correct
//Zero PL means the probability of error is 0
Arrays.fill(fullPL,255);
//Leaving these indicies in expanded form so we know its correct
//it should really just be in positions 0,1, and 2 regardless of number of sites
//(k*(k+1)/2)+j
//If we only have 1 allele we should only have 1 likelihood
if(fullPL.length==1) {
fullPL[0] = scores[0];
}
else {
fullPL[(0 * (0 + 1)/2) + 0] = scores[0];
fullPL[(1 * (1 + 1)/2) + 0] = scores[1];
fullPL[(1 * (1 + 1)/2) + 1] = scores[2];
}
for(int i = 0; i < fullPL.length-1; i++) {
bw.write(fullPL[i] + ",");
}
bw.write(""+fullPL[fullPL.length-1]);
// //Leaving these indicies in expanded form so we know its correct
// //it should really just be in positions 0,1, and 2 regardless of number of sites
// //(k*(k+1)/2)+j
// fullPL[(0 * (0 + 1)/2) + 0] = scores[0];
// fullPL[(1 * (1 + 1)/2) + 0] = scores[1];
// fullPL[(1 * (1 + 1)/2) + 1] = scores[2];
// for(int i = 0; i < fullPL.length-1; i++) {
// bw.write(fullPL[i] + ",");
// }
// bw.write(""+fullPL[fullPL.length-1]);
// //
// //bw.write(scores[0] + "," + scores[1] + "," + scores[2]);
}
}
// else {
// //If unknown just write out :0:0:0,0,0
// bw.write(":0:0,0,0");
// }
}
bw.newLine();
if (listener != null) {
listener.progress((int) (((double) (site + 1) / (double) numSites) * 100.0), null);
}
}
if (noAlleles > 0) {
myLogger.warn("Warning: " + noAlleles + " sites have no alleles.");
}
bw.flush();
bw.close();
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException("Error writing VCF file: " + filename + ": " + ExceptionUtils.getExceptionCauses(e));
}
return filename;
}
private static void writeVCFSampleAnnotationToWriter(GenotypeTable gt, BufferedWriter bw) throws IOException {
for (Taxon taxon : gt.taxa()) {
GeneralAnnotation annotation = taxon.getAnnotation();
if ((annotation == null) || (annotation.numAnnotations() == 0)) {
continue;
}
Multimap annoMap = taxon.getAnnotation().getAnnotationAsMap();
String annoString = Joiner.on(',').withKeyValueSeparator("=").join(annoMap.entries());
bw.write("##SAMPLE=");
bw.newLine();
}
}
/**
* Writes given set of alignments to a set of Plink files
*
* @param alignment
* @param filename
* @param delimChar
*/
public static String writeToPlink(GenotypeTable alignment, String filename, char delimChar) {
if (delimChar != ' ' && delimChar != '\t') {
throw new IllegalArgumentException("Delimiter charater must be either a blank space or a tab.");
}
BufferedWriter MAPbw = null;
BufferedWriter PEDbw = null;
String mapFileName = Utils.addSuffixIfNeeded(filename, ".plk.map");
String pedFileName = Utils.addSuffixIfNeeded(filename, ".plk.ped");
try {
MAPbw = new BufferedWriter(new FileWriter(mapFileName), 1000000);
int numSites = alignment.numberOfSites();
for (int site = 0; site < numSites; site++) {
MAPbw.write(alignment.chromosomeName(site)); // chromosome name
MAPbw.write(delimChar);
MAPbw.write(alignment.siteName(site)); // rs#
MAPbw.write(delimChar);
MAPbw.write("-9"); // genetic distance unavailable
MAPbw.write(delimChar);
MAPbw.write(Integer.toString(alignment.chromosomalPosition(site))); // position
MAPbw.write("\n");
}
MAPbw.close();
PEDbw = new BufferedWriter(new FileWriter(pedFileName), 1000000);
// Compiled : Pattern
Pattern splitter = Pattern.compile(":");
int numTaxa = alignment.numberOfTaxa();
for (int taxa = 0; taxa < numTaxa; taxa++) {
String[] name = splitter.split(alignment.taxaName(taxa).trim());
if (name.length != 1) {
PEDbw.write(name[1]); // namelvl 1 if is available
} else {
PEDbw.write("-9");
}
PEDbw.write(delimChar);
PEDbw.write(alignment.taxaName(taxa).trim()); // namelvl 0
PEDbw.write(delimChar);
PEDbw.write("-9"); // paternal ID unavailable
PEDbw.write(delimChar);
PEDbw.write("-9"); // maternal ID unavailable
PEDbw.write(delimChar);
PEDbw.write("-9"); // gender is both
PEDbw.write(delimChar);
PEDbw.write("-9"); // phenotype unavailable, might have to change to "-9" for missing affection status
PEDbw.write(delimChar);
for (int site = 0; site < numSites; site++) {
String[] b = getSNPValueForPlink(alignment.genotypeAsStringArray(taxa, site));
PEDbw.write(b[0]);
PEDbw.write(delimChar);
PEDbw.write(b[b.length - 1]);
if (site != numSites - 1) {
PEDbw.write(delimChar);
}
}
PEDbw.write("\n");
}
PEDbw.close();
return mapFileName + " and " + pedFileName;
} catch (Exception e) {
myLogger.error("Error writing Plink files: " + mapFileName + " and " + pedFileName + ": " + ExceptionUtils.getExceptionCauses(e));
throw new IllegalArgumentException("Error writing Plink files: " + mapFileName + " and " + pedFileName + ": " + ExceptionUtils.getExceptionCauses(e));
} finally {
try {
PEDbw.close();
} catch (Exception e) {
// do nothing
}
try {
MAPbw.close();
} catch (Exception e) {
// do nothing
}
}
}
private static String[] getSNPValueForPlink(String[] base) {
for (int i = 0; i < base.length; i++) {
if (base[i].equals("N")) {
base[i] = "0";
} else if (base[i].equals("0")) {
base[i] = "D";
}
}
return base;
}
public static String saveDelimitedAlignment(GenotypeTable theAlignment, String delimit, String saveFile) {
if ((saveFile == null) || (saveFile.length() == 0)) {
return null;
}
saveFile = Utils.addSuffixIfNeeded(saveFile, ".txt");
FileWriter fw = null;
BufferedWriter bw = null;
try {
fw = new FileWriter(new File(saveFile));
bw = new BufferedWriter(fw);
bw.write("Taxa");
int numSites = theAlignment.numberOfSites();
for (int j = 0; j < numSites; j++) {
bw.write(delimit);
bw.write(String.valueOf(theAlignment.chromosomalPosition(j)));
}
bw.write("\n");
for (int r = 0, n = theAlignment.numberOfTaxa(); r < n; r++) {
bw.write(theAlignment.taxaName(r));
for (int i = 0; i < numSites; i++) {
bw.write(delimit);
bw.write(theAlignment.genotypeAsString(r, i));
}
bw.write("\n");
}
return saveFile;
} catch (Exception e) {
myLogger.error("Error writing Delimited Alignment: " + saveFile + ": " + ExceptionUtils.getExceptionCauses(e));
throw new IllegalArgumentException("Error writing Delimited Alignment: " + saveFile + ": " + ExceptionUtils.getExceptionCauses(e));
} finally {
try {
bw.close();
fw.close();
} catch (Exception e) {
// do nothing
}
}
}
/**
* print alignment (in PHYLIP SEQUENTIAL format)
*/
public static void printSequential(GenotypeTable a, Writer out) throws IOException {
// PHYLIP header line
out.write(" " + a.numberOfTaxa() + " " + a.numberOfSites() + " S" + "\n");
// Print sequences
for (int s = 0; s < a.numberOfTaxa(); s++) {
int n = 0;
while (n < a.numberOfSites()) {
if (n == 0) {
format.displayLabel(out, a.taxaName(s), 10);
out.write(" ");
} else {
out.write(" ");
}
printNextSites(a, out, false, s, n, 50);
out.write("\n");
n += 50;
}
}
}
/**
* print alignment (in PHYLIP 3.4 INTERLEAVED format)
*/
public static void printInterleaved(GenotypeTable a, Writer out) throws IOException {
int n = 0;
// PHYLIP header line
out.write(" " + a.numberOfTaxa() + " " + a.numberOfSites() + "\n");
// Print sequences
while (n < a.numberOfSites()) {
for (int s = 0; s < a.numberOfTaxa(); s++) {
if (n == 0) {
format.displayLabel(out, a.taxaName(s), 10);
out.write(" ");
} else {
out.write(" ");
}
printNextSites(a, out, true, s, n, 50);
out.write("\n");
}
out.write("\n");
n += 50;
}
}
/**
* Print alignment (in CLUSTAL W format)
*/
public static void printCLUSTALW(GenotypeTable a, Writer out) throws IOException {
int n = 0;
// CLUSTAL W header line
out.write("CLUSTAL W multiple sequence alignment\n\n");
// Print sequences
while (n < a.numberOfSites()) {
out.write("\n");
for (int s = 0; s < a.numberOfTaxa(); s++) {
format.displayLabel(out, a.taxaName(s), 10);
out.write(" ");
printNextSites(a, out, false, s, n, 50);
out.write("\n");
}
// Blanks in status line are necessary for some parsers)
out.write(" \n");
n += 50;
}
}
private static void printNextSites(GenotypeTable a, Writer out, boolean chunked, int seq, int start, int num) throws IOException {
// Print next num characters
for (int i = 0; (i < num) && (start + i < a.numberOfSites()); i++) {
// Chunks of 10 characters
if (i % 10 == 0 && i != 0 && chunked) {
out.write(' ');
}
out.write(a.genotypeAsString(seq, start + i));
}
}
public static String writeAlignmentToSerialGZ(GenotypeTable sba, String outFile) {
long time = System.currentTimeMillis();
File theFile = null;
FileOutputStream fos = null;
GZIPOutputStream gz = null;
ObjectOutputStream oos = null;
try {
theFile = new File(Utils.addSuffixIfNeeded(outFile, ".serial.gz"));
fos = new FileOutputStream(theFile);
gz = new GZIPOutputStream(fos);
oos = new ObjectOutputStream(gz);
oos.writeObject(sba);
return theFile.getName();
} catch (Exception e) {
e.printStackTrace();
myLogger.error("Error writing Serial GZ: " + theFile.getName() + ": " + ExceptionUtils.getExceptionCauses(e));
throw new IllegalArgumentException("Error writing Serial GZ: " + theFile.getName() + ": " + ExceptionUtils.getExceptionCauses(e));
} finally {
try {
oos.flush();
oos.close();
gz.close();
fos.close();
} catch (Exception e) {
// do nothing
}
myLogger.info("writeAlignmentToSerialGZ: " + theFile.toString() + " Time: " + (System.currentTimeMillis() - time));
}
}
/**
* Simple method to export a GenotypeTable to a fasta file.
* @param gt Import GenotypeTable which we want to export.
* @param out FileWriter which will be used to export the FASTA file.
*/
public static void writeFasta(GenotypeTable gt, Writer out) {
try {
TaxaList tl = gt.taxa();
for(int i = 0; i < tl.size(); i++) {
out.write(">");
out.write(tl.get(i).getName());
out.write("\n");
for(int j = 0; j < gt.positions().size(); j++) {
byte call = gt.genotype(i,j);
out.write(NucleotideAlignmentConstants.getNucleotideIUPAC(call));
}
out.write("\n");
}
}
catch(Exception e) {
e.printStackTrace();
myLogger.error("Error writing FASTA file: " + ExceptionUtils.getExceptionCauses(e));
throw new IllegalArgumentException("Error writing FastaAlignment: "+ ExceptionUtils.getExceptionCauses(e));
}
}
/**
* Simple method to export a GenotypeTable to a fasta file while removing Gap characters.
* @param gt GenotypeTable which we want to export.
* @param out FileWriter which will be used to export the FASTA file.
*/
public static void writeFastaNoGaps(GenotypeTable gt, Writer out) {
try {
TaxaList tl = gt.taxa();
for(int i = 0; i < tl.size(); i++) {
out.write(">");
out.write(tl.get(i).getName());
out.write("\n");
for(int j = 0; j < gt.positions().size(); j++) {
byte call = gt.genotype(i,j);
if(!NucleotideAlignmentConstants.getNucleotideIUPAC(call).equals("-")) {
out.write(NucleotideAlignmentConstants.getNucleotideIUPAC(call));
}
}
out.write("\n");
}
}
catch(Exception e) {
e.printStackTrace();
myLogger.error("Error writing FASTA file: " + ExceptionUtils.getExceptionCauses(e));
throw new IllegalArgumentException("Error writing FastaAlignment: "+ ExceptionUtils.getExceptionCauses(e));
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy