net.maizegenetics.analysis.gbs.MergeDuplicateSNPsPlugin
TASSEL is a software package to evaluate trait associations, evolutionary patterns, and linkage disequilibrium.
/*
* MergeDuplicateSNPsPlugin
*/
package net.maizegenetics.analysis.gbs;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.util.ArgsEngine;
import net.maizegenetics.dna.snp.io.VCFUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.File;
import java.util.*;
/**
* This class is intended to be run directly after DiscoverySNPCallerPlugin,
* using the HapMap file from that step as input.
*
* It finds duplicate SNPs in the HapMap file, and merges them if they have the
* same pair of alleles (not necessarily in the same maj/min order) and if their
* mismatch rate is no greater than the threshold (-maxMisMat). If -callHets is
* on, genotypic disagreements are called heterozygous; otherwise they are set
* to missing ('N'), the default.
*
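* For example, if two duplicate SNPs share the allele pair A/G, are both called in
* 100 common taxa, and disagree in 4 of them, their mismatch rate is 0.04, so they
* are merged under the default -maxMisMat threshold of 0.05.
*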
* By default, any remaining unmerged duplicate SNPs (but not indels) will be
* deleted. They can be kept by invoking the -kpUnmergDups option.
*
* If the germplasm is not fully inbred and still contains residual
* heterozygosity (as the maize NAM or IBM populations do), then -callHets
* should be on and -maxMisMat should be set fairly high (0.1 to 0.2, depending
* on the amount of heterozygosity).
*
* TODO: VCF support has been commented out below; it should be merged back into the main pipeline.
*
* @author jcg233
*/
public class MergeDuplicateSNPsPlugin extends AbstractPlugin {
private static Logger myLogger = LogManager.getLogger(MergeDuplicateSNPsPlugin.class);
private static ArgsEngine myArgsEngine = null;
private String suppliedInputFileName, suppliedOutputFileName, infile, outfile;
private String snpLogFileName;
private SNPLogging snpLogging = null;
private double maxMisMat = 0.05;
private boolean usePedigree = false;
private HashMap<String, Double> taxaFs = null;
private boolean[] useTaxaForCompare = null;
private int nInbredTaxa = Integer.MIN_VALUE;
private boolean callHets = false; // true = when two genotypes disagree at a SNP, call it a heterozygote; false = set to missing;
private boolean kpUnmergDups = false; // keep unmerged SNPs (not indels) in the data file
private int startChr = 1, endChr = 10;
private static enum INPUT_FORMAT {hapmap, vcf}; //input file format, acceptable values are "hapmap" "vcf"
private INPUT_FORMAT inputFormat = INPUT_FORMAT.hapmap;
private int myMaxNumAlleles;
public MergeDuplicateSNPsPlugin() {
super(null, false);
}
public MergeDuplicateSNPsPlugin(Frame parentFrame) {
super(parentFrame, false);
}
private void printUsage() {
myLogger.info(
"\n\nUsage is as follows:\n"
+ "-hmp Input GBS genotype file (in HapMap format). Use a plus sign (+) as a wild card character to specify multiple chromosome numbers.\n"
+ "-vcf Input GBS genotype file (in VCF format). Use a plus sign (+) as a wild card character to specify multiple chromosome numbers. Options -hmp and -vcf are mutual exclusive.\n"
+ "-o Output HapMap file. Use a plus sign (+) as a wild card character to specify multiple chromosome numbers.\n"
+ "-misMat Threshold genotypic mismatch rate above which the duplicate SNPs won't be merged (default: " + maxMisMat + ")\n"
+ "-p Pedigree file containing full sample names (or expected names after merging) & expected inbreeding\n"
+ " coefficient (F) for each. Only highly inbred taxa, with F >= 0.8 (e.g., S3 or more), will be used\n"
+ " to test if two duplicate SNPs agree with each other (default: use ALL taxa to compare duplicate SNPs)\n"
+ "-callHets When two genotypes disagree at a SNP, call it a heterozygote (default: " + callHets + " = set to missing)\n"
+ "-kpUnmergDups When two duplicate SNPs were not merged (different alleles or too many mismatches), keep them (default: " + kpUnmergDups + " = delete them)\n"
+ "-sC Start chromosome\n"
+ "-eC End chromosome\n"
+ "-maxAlleleVCF Maximum number of alleles allowed in vcf file.\n"
+ "-snpLog SNPs Removed Log file name\n\n");
}
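// A hypothetical command-line sketch (the file names are illustrative, not from the source):
//
//   -hmp mergedSNPs_chr+.hmp.txt -o mergedDupSNPs_chr+.hmp.txt -sC 1 -eC 10 -misMat 0.1 -callHets
//
// The plus sign in each file name is expanded to the chromosome number, so chromosomes
// 1 through 10 are each read, merged, and written in turn.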
@Override
public void setParameters(String[] args) {
if (args.length == 0) {
printUsage();
throw new IllegalArgumentException("\n\nPlease use the above arguments/options.\n\n");
}
if (myArgsEngine == null) {
myArgsEngine = new ArgsEngine();
myArgsEngine.add("-hmp", "--hmpFile", true);
myArgsEngine.add("-vcf", "--vcfFile", true);
myArgsEngine.add("-o", "--outFile", true);
myArgsEngine.add("-misMat", "--maxMismatchRate", true);
myArgsEngine.add("-p", "--pedigree-file", true);
myArgsEngine.add("-callHets", "--callHeterozygotes", false);
myArgsEngine.add("-kpUnmergDups", "--keepUnmergedDuplicates", false);
myArgsEngine.add("-sC", "--startChromosome", true);
myArgsEngine.add("-eC", "--endChromosome", true);
myArgsEngine.add("-maxAlleleVCF", "--maxAlleleVCF", true);
myArgsEngine.add("-snpLog", "", true);
}
myArgsEngine.parse(args);
if (myArgsEngine.getBoolean("-hmp")) {
if (myArgsEngine.getBoolean("-vcf")){
throw new IllegalArgumentException("-hmp and -vcf options are mutual exclusive!\n");
}
suppliedInputFileName = myArgsEngine.getString("-hmp");
inputFormat = INPUT_FORMAT.hapmap;
} else if (myArgsEngine.getBoolean("-vcf")){
suppliedInputFileName = myArgsEngine.getString("-vcf");
inputFormat = INPUT_FORMAT.vcf;
}
else {
printUsage();
throw new IllegalArgumentException("Please specify a HapMap or vcf file to filter.\n");
}
if (myArgsEngine.getBoolean("-o")) {
suppliedOutputFileName = myArgsEngine.getString("-o");
} else {
printUsage();
throw new IllegalArgumentException("Please specify an output file name.\n");
}
if (myArgsEngine.getBoolean("-misMat")) {
maxMisMat = Double.parseDouble(myArgsEngine.getString("-misMat"));
}
if (myArgsEngine.getBoolean("-p")) {
String pedigreeFileStr = myArgsEngine.getString("-p");
File pedigreeFile = new File(pedigreeFileStr);
if (!pedigreeFile.exists() || !pedigreeFile.isFile()) {
printUsage();
throw new IllegalArgumentException("Can't find the pedigree input file (-p option: " + pedigreeFileStr + ").");
}
taxaFs = DiscoverySNPCallerPlugin.readTaxaFsFromFile(pedigreeFile);
if (taxaFs == null) {
throw new IllegalArgumentException("Problem reading the pedigree file. Progam aborted.");
}
usePedigree = true;
}
if (myArgsEngine.getBoolean("-callHets")) {
callHets = true;
}
if (myArgsEngine.getBoolean("-kpUnmergDups")) {
kpUnmergDups = true;
}
if (myArgsEngine.getBoolean("-sC")) {
startChr = Integer.parseInt(myArgsEngine.getString("-sC"));
}
if (myArgsEngine.getBoolean("-eC")) {
endChr = Integer.parseInt(myArgsEngine.getString("-eC"));
}
if (endChr < startChr) {
printUsage();
throw new IllegalArgumentException("Error: The start chromosome is higher than the end chromosome.");
}
if (myArgsEngine.getBoolean("-snpLog")) {
snpLogFileName = myArgsEngine.getString("-snpLog");
}
if (myArgsEngine.getBoolean("-maxAlleleVCF")) {
if (! myArgsEngine.getBoolean("-vcf")){
throw new IllegalArgumentException("-maxAlleleVCF option only works with -vcf input.\n");
}
myMaxNumAlleles = Integer.parseInt(myArgsEngine.getString("-maxAlleleVCF"));
}
else
{
myMaxNumAlleles = VCFUtil.VCF_DEFAULT_MAX_NUM_ALLELES;
}
snpLogging = new SNPLogging(snpLogFileName, this.getClass());
}
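// A minimal programmatic sketch of driving this plugin (assumed usage pieced together
// from the AbstractPlugin API above; the file names are hypothetical):
//
// MergeDuplicateSNPsPlugin plugin = new MergeDuplicateSNPsPlugin();
// plugin.setParameters(new String[]{
//     "-hmp", "in_chr+.hmp.txt", "-o", "out_chr+.hmp.txt",
//     "-sC", "1", "-eC", "10", "-misMat", "0.05", "-callHets"});
// plugin.performFunction(null); // processes chromosomes 1..10 in turn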
@Override
public DataSet performFunction(DataSet input) {
for (int chr = startChr; chr <= endChr; chr++) {
infile = suppliedInputFileName.replace("+", "" + chr);
outfile = suppliedOutputFileName.replace("+", "" + chr);
myLogger.info("Reading: " + infile);
GenotypeTable a;
try {
if (inputFormat == INPUT_FORMAT.hapmap)
{
a = ImportUtils.readFromHapmap(infile, this);
}
else if (inputFormat == INPUT_FORMAT.vcf)
{
a = ImportUtils.readFromVCF(infile, this, true);
}
else
{
throw new IllegalArgumentException("File format " + inputFormat + " is not recognized!");
}
myLogger.info("Original Alignment Taxa:" + a.numberOfTaxa() + " Sites:" + a.numberOfSites());
if (usePedigree && !maskNonInbredTaxa(a)) {
throw new IllegalArgumentException("Mismatch between taxa names in the input hapmap and pedigree files.");
}
} catch (Exception e) {
myLogger.info("Could not read input hapmap file for chr" + chr + ":\n\t" + infile + "\n\te: " + e + "\n\tSkipping...");
continue;
}
//MutableNucleotideAlignment msa = null;
GenotypeCallTableBuilder msa=GenotypeCallTableBuilder.getInstance(a.numberOfTaxa(),a.numberOfSites());
PositionListBuilder posBuilder=new PositionListBuilder();
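// msa accumulates the merged genotype calls and posBuilder the matching positions;
// they are combined into the output GenotypeTable once the chromosome is finished.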
// if (inputFormat == INPUT_FORMAT.hapmap)
// {
// msa = MutableNucleotideAlignment.getInstance(a.taxa(), a.numberOfSites());
// }
// else if (inputFormat == INPUT_FORMAT.vcf)
// {
// msa = MutableVCFAlignment.getInstance(a.taxa(), a.numberOfSites(), myMaxNumAlleles);
// }
ArrayList<Integer> samePosAL = new ArrayList<Integer>();
Integer[] samePos = null;
int currentPos = a.chromosomalPosition(0);
for (int s = 0; s < a.numberOfSites(); s++) { // must be sorted by position, as HapMap files typically are (ImportUtils.readFromHapmap() fails if they aren't)
int newPos = a.chromosomalPosition(s);
if (newPos == currentPos) { // assumes that the strands are all '+' (in DiscoverySNPCallerPlugin(), - strand genos were complemented)
samePosAL.add(s); // collect markers with the same position
} else {
samePos = samePosAL.toArray(new Integer[samePosAL.size()]);
if (samePosAL.size() > 1) { // merge sets of 2 or more markers with the same position and alleles (alleles are not necessarily in the same order, maj/min)
if (inputFormat == INPUT_FORMAT.hapmap)
{
processSNPsWithSamePosition(samePos, a, chr, currentPos, msa, posBuilder);
}
//TODO restore VCF type data
// else if (inputFormat == INPUT_FORMAT.vcf)
// {
// processSNPsWithSamePositionForVCF(samePos, a, chr, currentPos, msa);
// }
} else { // site has a unique position: write its genos to the msa
byte[] genos = new byte[a.numberOfTaxa()];
for (int t = 0; t < a.numberOfTaxa(); ++t) {
genos[t] = a.genotype(t, samePos[0]);
}
addSiteToMutableAlignment(chr, currentPos, genos, msa, posBuilder);
// if (inputFormat==INPUT_FORMAT.vcf)
// {
// int lastSiteIndex = msa.numberOfSites() -1;
// msa.setCommonAlleles(lastSiteIndex, a.allelesBySortType(Alignment.ALLELE_SCOPE_TYPE.Depth, samePos[0]));
// msa.setReferenceAllele(lastSiteIndex, a.referenceAllele(samePos[0]));
// for (int tt = 0; tt < a.numberOfTaxa(); tt++) {
// msa.setDepthForAlleles(tt, lastSiteIndex, a.depthForAlleles(tt, samePos[0]));
// }
// }
}
// start a new collection of markers
samePosAL = new ArrayList<Integer>();
samePosAL.add(s);
currentPos = newPos;
}
}
// finish last site or set of sites
samePos = samePosAL.toArray(new Integer[samePosAL.size()]);
if (samePosAL.size() > 1) {
//processSNPsWithSamePosition(samePos, a, chr, currentPos, msa);
if (inputFormat==INPUT_FORMAT.hapmap)
{
processSNPsWithSamePosition(samePos, a, chr, currentPos, msa, posBuilder);
}
// else if (inputFormat==INPUT_FORMAT.vcf)
// {
// processSNPsWithSamePositionForVCF(samePos, a, chr, currentPos, msa);
// }
} else { // site has a unique position: write its genos to the msa
byte[] genos = new byte[a.numberOfTaxa()];
for (int t = 0; t < a.numberOfTaxa(); ++t) {
genos[t] = a.genotype(t, samePos[0]);
}
addSiteToMutableAlignment(chr, currentPos, genos, msa, posBuilder);
// if (inputFormat==INPUT_FORMAT.vcf)
// {
// int lastSiteIndex = msa.numberOfSites() -1;
// msa.setCommonAlleles(lastSiteIndex, a.allelesBySortType(Alignment.ALLELE_SCOPE_TYPE.Depth, samePos[0]));
// msa.setReferenceAllele(lastSiteIndex, a.referenceAllele(samePos[0]));
// for (int tt = 0; tt < a.numberOfTaxa(); tt++) {
// msa.setDepthForAlleles(tt, lastSiteIndex, a.depthForAlleles(tt, samePos[0]));
// }
// }
}
// if (!kpUnmergDups) { deleteRemainingDuplicates(msa); } // disabled: relied on MutableNucleotideAlignment
// build the merged chromosome and write it to the output file
ExportUtils.writeToHapmap(GenotypeTableBuilder.getInstance(msa.build(), posBuilder.build(), a.taxa()), outfile);
}
snpLogging.close();
return null;
}
// private void processSNPsWithSamePositionForVCF(Integer[] samePos, GenotypeTable a, int chr, int currentPos, MutableVCFAlignment msa) {
// int taxaCount = a.numberOfTaxa();
// byte[] genos = null;
// HashMap<Byte, int[]> MergedDepthTable = new HashMap<Byte, int[]>();
// HashMap<Byte, Integer> AllAlleleDepthTable = new HashMap<Byte, Integer>();
//
// for (int s:samePos)
// {
// //get alleles for each site
// byte[] alleles = a.allelesBySortType(Alignment.ALLELE_SCOPE_TYPE.Depth, s);
//
// //initiate new alleles in the allele list into the hashmap
// for (byte allele:alleles)
// {
// if (!MergedDepthTable.containsKey(allele))
// {
// int[] newDepth = new int[taxaCount];
// Arrays.fill(newDepth, 0);
// MergedDepthTable.put(allele, newDepth);
// AllAlleleDepthTable.put(allele, 0);
// }
// }
// //add up the depth for each alleles
// for (int t = 0; t < a.numberOfTaxa(); ++t)
// {
// byte[] currDepth = a.depthForAlleles(t, s); // get the current depth for site s taxa t
// for (int i=0; i < alleles.length; i++)
// {
// if (currDepth[i]>0)
// {
// MergedDepthTable.get(alleles[i])[t] += currDepth[i];
// AllAlleleDepthTable.put(alleles[i], AllAlleleDepthTable.get(alleles[i]) + currDepth[i]);
// }
// }
// }
// }
//
// //sort allele by count
// ArrayList<Byte> AllAllelesList = new ArrayList<Byte>();
// AllAllelesList.addAll(AllAlleleDepthTable.keySet());
// Collections.sort(AllAllelesList, new HashValueComparator(AllAlleleDepthTable));
//
// //get top alleles and read depth for each allele
// int allelesCount = (AllAllelesList.size()>myMaxNumAlleles)?myMaxNumAlleles:AllAllelesList.size();
// byte CommonAlleles[] = new byte[allelesCount]; //all alleles
// int[][] alleleDepthsInTaxa = new int[myMaxNumAlleles][taxaCount];
// for(int[] row: alleleDepthsInTaxa)
// {
// Arrays.fill(row, 0);
// }
//
// for (int i=0; i < allelesCount; i++)
// {
// CommonAlleles[i] = AllAllelesList.get(i);
// alleleDepthsInTaxa[i] = MergedDepthTable.get(AllAllelesList.get(i));
// }
//
// ... (lost in this view: merge the genotype calls across the duplicate sites into
// genos, counting nCompared = taxa called at both sites and nMisMatch = disagreements) ...
//
// double myMisMatchRate = 0;
// if (nCompared > 0)
// {
// myMisMatchRate = (double) nMisMatch / nCompared;
// }
// if (myMisMatchRate <= maxMisMat)
// {
// ... (lost in this view: add the merged site to msa, set its common and reference
// alleles, and take lastSiteIndex = msa.numberOfSites() - 1) ...
// for (int t = 0; t < taxaCount; t++)
// {
// byte[] alleleDepth = new byte[allelesCount];
// for (int i = 0; i < allelesCount; i++)
// {
// alleleDepth[i] = alleleDepthsInTaxa[i][t] > 127 ? (byte) 127 : (byte) alleleDepthsInTaxa[i][t];
// }
// msa.setDepthForAlleles(t, lastSiteIndex, alleleDepth);
// }
// }
// else
// {
// System.out.println("Not merged position: " + a.chromosomalPosition(samePos[0]) + " Mismatch: "+ (int)(myMisMatchRate *100) + "%." );
//
//
// if (kpUnmergDups)
// {
// for (int s:samePos)
// {
// genos = new byte[a.numberOfTaxa()];
// for (int t = 0; t < a.numberOfTaxa(); ++t) {
// genos[t] = a.genotype(t, s);
// }
// addSiteToMutableAlignment(chr, currentPos, genos, msa);
//
// int lastSiteIndex = msa.numberOfSites() -1;
// msa.setCommonAlleles(lastSiteIndex, a.allelesBySortType(Alignment.ALLELE_SCOPE_TYPE.Depth, s));
// msa.setReferenceAllele(lastSiteIndex, a.referenceAllele(s));
// for (int tt = 0; tt < a.numberOfTaxa(); tt++) {
// msa.setDepthForAlleles(tt, lastSiteIndex, a.depthForAlleles(tt, s));
// }
// }
// }
// }
// }
// private void deleteRemainingDuplicates(MutableNucleotideAlignment theMSA) {
// ArrayList<Integer> samePosAL = new ArrayList<Integer>();
// int currentPos = theMSA.chromosomalPosition(0);
// for (int s = 0; s < theMSA.numberOfSites(); s++) {
// int newPos = theMSA.chromosomalPosition(s);
// if (newPos == currentPos) {
// samePosAL.add(s); // collect markers with the same position
// } else {
// if (samePosAL.size() > 1) {
// Integer[] samePos = samePosAL.toArray(new Integer[samePosAL.size()]);
// for (int i = 0; i < samePos.length; ++i) {
// if (theMSA.majorAllele(samePos[i].intValue()) != NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE
// && theMSA.minorAllele(samePos[i].intValue()) != NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE) {
// snpLogging.writeEntry(theMSA, samePos[i], null, null, "Delete Remaining Duplicates", "Removed", null, null);
// theMSA.clearSiteForRemoval(samePos[i]);
// }
// }
// }
// // start a new collection of markers
// samePosAL = new ArrayList<Integer>();
// samePosAL.add(s);
// currentPos = newPos;
// }
// }
// // finish last site or set of sites
// if (samePosAL.size() > 1) {
// Integer[] samePos = samePosAL.toArray(new Integer[samePosAL.size()]);
// for (int i = 0; i < samePos.length; ++i) {
// if (theMSA.majorAllele(samePos[i].intValue()) != NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE
// && theMSA.minorAllele(samePos[i].intValue()) != NucleotideAlignmentConstants.GAP_DIPLOID_ALLELE) {
// snpLogging.writeEntry(theMSA, samePos[i], null, null, "Delete Remaining Duplicates", "Removed", null, null);
// theMSA.clearSiteForRemoval(samePos[i]);
// }
// }
// }
// snpLogging.close();
// theMSA.clean();
// myLogger.info("Number of sites written after deleting any remaining, unmerged duplicate SNPs: " + theMSA.numberOfSites());
// }
private boolean maskNonInbredTaxa(GenotypeTable a) {
useTaxaForCompare = new boolean[a.numberOfTaxa()]; // initialized to false
nInbredTaxa = 0;
try {
for (int taxon = 0; taxon < a.numberOfTaxa(); taxon++) {
if (taxaFs.containsKey(a.taxaName(taxon))) {
if (taxaFs.get(a.taxaName(taxon)) >= 0.8) {
useTaxaForCompare[taxon] = true;
nInbredTaxa++;
}
} else {
if (a.taxaName(taxon).contentEquals("REFERENCE_GENOME")) {
useTaxaForCompare[taxon] = false;
} else {
throw new Exception("Taxon " + a.taxaName(taxon) + " not found in the pedigree file");
}
}
}
myLogger.info(nInbredTaxa + " highly inbred taxa (with an expected F >= 0.8) were found in the input hapmap file (according to the pedigree file)");
return true;
} catch (Exception e) {
myLogger.error("Mismatch between taxa names in the input hapmap file and the pedigree file e=" + e);
e.printStackTrace();
return false;
}
}
/**
* Combines the alleles from two diploid genotype bytes into a single (possibly
* heterozygous) genotype byte. Each genotype packs two 4-bit allele codes into the
* high and low nibbles. Assumes the two genotypes disagree; if more than two
* distinct alleles are present, an IllegalStateException is thrown.
*/
private static byte resolveHet(byte geno1, byte geno2) {
byte[] result = new byte[2];
result[0] = (byte) ((geno1 >>> 4) & 0xf); // high-nibble allele of geno1 (masked to guard against sign extension)
byte temp = (byte) (geno1 & 0xf); // low-nibble allele of geno1
int count = 1;
if (temp != result[0]) {
result[count++] = temp;
}
temp = (byte) ((geno2 >>> 4) & 0xf); // high-nibble allele of geno2
if (temp == result[0]) {
// allele already recorded; do nothing
} else if (count == 1) {
result[count++] = temp;
} else if (temp != result[1]) {
throw new IllegalStateException("More than two distinct alleles among the two genotypes");
}
temp = (byte) (geno2 & 0xf); // low-nibble allele of geno2
if (temp == result[0]) {
// allele already recorded; do nothing
} else if (count == 1) {
result[count++] = temp;
} else if (temp != result[1]) {
throw new IllegalStateException("More than two distinct alleles among the two genotypes");
}
return (byte) ((result[0] << 4) | (result[1]));
}
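// Worked example of the nibble packing above (allele codes per
// NucleotideAlignmentConstants: A=0, C=1, G=2, T=3):
//   resolveHet((byte) 0x00, (byte) 0x11) // A/A combined with C/C
//     returns (byte) 0x01, the A/C heterozygote;
//   resolveHet((byte) 0x01, (byte) 0x22) // A/C combined with G/G
//     throws IllegalStateException (three distinct alleles).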
@Override
public ImageIcon getIcon() {
throw new UnsupportedOperationException("Not supported yet.");
}
@Override
public String getButtonName() {
throw new UnsupportedOperationException("Not supported yet.");
}
@Override
public String getToolTipText() {
throw new UnsupportedOperationException("Not supported yet.");
}
}