All Downloads are FREE. Search and download functionalities are using the official Maven repository.

umcg.genetica.io.pileup.PileupToVcf Maven / Gradle / Ivy

There is a newer version: 1.0.7
Show newest version
package umcg.genetica.io.pileup;

import gnu.trove.map.TIntObjectMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.molgenis.genotype.Allele;
import org.molgenis.genotype.Alleles;
import org.molgenis.genotype.RandomAccessGenotypeData;
import org.molgenis.genotype.RandomAccessGenotypeDataReaderFormats;
import org.molgenis.genotype.variant.GeneticVariant;

/**
 *
 * @author Patrick Deelen
 */
public class PileupToVcf {

	private static final String VERSION = "1.3";
	private static final String HEADER =
			"  /---------------------------------------\\\n"
			+ "  |             Pileup to VCF             |\n"
			+ "  |                                       |\n"
			+ "  |             Patrick Deelen            |\n"
			+ "  |        [email protected]        |\n"
			+ "  |                                       |\n"
			+ "  |           Dasha Zhernakova,           |\n"
			+ "  |      Lude Franke, Morris Swertz       |\n"
			+ "  |                                       |\n"
			+ "  |     Genomics Coordication Center      |\n"
			+ "  |        Department of Genetics         |\n"
			+ "  |  University Medical Center Groningen  |\n"
			+ "  \\---------------------------------------/";
	private static final Options OPTIONS;
	private static final DateFormat DATE_TIME_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
	private static final Date currentDataTime = new Date();
	
	static {
		OPTIONS = new Options();
	
		OptionBuilder.withArgName("path");
		OptionBuilder.hasArg();
		OptionBuilder.isRequired();
		OptionBuilder.withDescription("Pileup file max 1 sample");
		OPTIONS.addOption(OptionBuilder.create('p'));
		
		OptionBuilder.withArgName("string");
		OptionBuilder.hasArg();
		OptionBuilder.isRequired();
		OptionBuilder.withDescription("Sample name");
		OPTIONS.addOption(OptionBuilder.create('s'));
		
		OptionBuilder.withArgName("path");
		OptionBuilder.hasArg();
		OptionBuilder.isRequired();
		OptionBuilder.withDescription("Output VCF");
		OPTIONS.addOption(OptionBuilder.create('v'));
		
		OptionBuilder.withArgName("path");
		OptionBuilder.hasArgs();
		OptionBuilder.isRequired();
		OptionBuilder.withDescription("Reference vcf. Output only contains SNPs present in this file with same ref and alt alleles");
		OPTIONS.addOption(OptionBuilder.create('r'));
		
		OptionBuilder.withArgName("int");
		OptionBuilder.hasArgs();
		OptionBuilder.withDescription("Minimum base quality to include read in ref / alt count");
		OPTIONS.addOption(OptionBuilder.create('b'));
	
	}
	
	public static void main(String[] args) throws ParseException, IOException {
		
		System.out.println(HEADER);
		System.out.println();
		System.out.println("          --- Version: " + VERSION + " ---");
		System.out.println();
		System.out.println("More information: http://molgenis.org/systemsgenetics");
		System.out.println();

		System.out.println("Current date and time: " + DATE_TIME_FORMAT.format(currentDataTime));
		System.out.println();

		System.out.flush(); //flush to make sure header is before errors
		try {
			Thread.sleep(25); //Allows flush to complete
		} catch (InterruptedException ex) {
		}
		
		if (args.length == 0) {
			new HelpFormatter().printHelp(" ", OPTIONS);
		}
			
		final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, true);

		final int minBaseQuality = commandLine.hasOption('b') ? Integer.parseInt(commandLine.getOptionValue('b')) : 0;
		final RandomAccessGenotypeData referenceVcf = RandomAccessGenotypeDataReaderFormats.VCF.createGenotypeData(commandLine.getOptionValue('r'));
		final PileupFile pileupFile = new PileupFile(commandLine.getOptionValue('p'), minBaseQuality);
		final File outputVcf = new File(commandLine.getOptionValue('v'));
		final String sample = commandLine.getOptionValue('s');
				
		System.out.println("Ref VCF: " + commandLine.getOptionValue('r'));
		System.out.println("Pileup file: " + pileupFile.getAbsolutePath());
		System.out.println("Output VCF: " + outputVcf.getAbsolutePath());
		System.out.println("Sample name: " + sample);
		System.out.println("Min base quality: " + minBaseQuality);
		
		convertPileupToVcf2(pileupFile, sample, outputVcf, referenceVcf, minBaseQuality);
		
		System.out.println("Conversion complete");
		
	}
	
		
   /**
	* Parse pileup file and create vcf files with only SNPs that are in referenceGenotype data. VCF only contains the counts for the 2 alleles in the reference genotype data. Only parses the first samples in the pileup file
	* 
	* @param pileupFile
	* @param sample
	* @param outputVcf
	* @param referenceGenotypes
	* @throws IOException 
	*/
    public static void convertPileupToVcf(final PileupFile pileupFile, final String sample, final File outputVcf, final RandomAccessGenotypeData referenceGenotypes, final int minBaseQuality) throws IOException {
        
		final BufferedWriter vcfWriter = new BufferedWriter(new FileWriter(outputVcf));
			
		vcfWriter.append("##fileformat=VCFv4.2\n");
		vcfWriter.append("##source=ConvertPileupToVcf version: " + VERSION + "\n");
		vcfWriter.append("##pileupFile=" + pileupFile.getAbsolutePath() + "\n");
		vcfWriter.append("##minBaseQuality=" + minBaseQuality + "\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample + "\n");
		
		for(PileupEntry pileupEntry : pileupFile){
			GeneticVariant refVariant = referenceGenotypes.getSnpVariantByPos(pileupEntry.getChr(), pileupEntry.getPos());
			if(refVariant != null){
				
				Allele refAllele = refVariant.getVariantAlleles().get(0);
				Allele altAllele = refVariant.getVariantAlleles().get(1);
				
				int refCount = pileupEntry.getAlleleCount(refAllele);
				int altCount = pileupEntry.getAlleleCount(altAllele);
				
				if(refCount == 0 && altCount == 0){
					continue;
				}
			
				vcfWriter.append(refVariant.getSequenceName());
				vcfWriter.append('\t');
				vcfWriter.append(String.valueOf(refVariant.getStartPos()));
				vcfWriter.append('\t');
				vcfWriter.append(refVariant.getPrimaryVariantId());
				vcfWriter.append('\t');
				vcfWriter.append(refAllele.getAlleleAsString());
				vcfWriter.append('\t');
				vcfWriter.append(altAllele.getAlleleAsString());
				vcfWriter.append("\t.\tPASS\t.\tAD:RQ:TD:ADQ\t");
				vcfWriter.append(String.valueOf(refCount));
				vcfWriter.append(',');
				vcfWriter.append(String.valueOf(altCount));
				vcfWriter.append(':');
				vcfWriter.append(String.valueOf(pileupEntry.getAlleleAverageQuality(refAllele)));
				vcfWriter.append(',');
				vcfWriter.append(String.valueOf(pileupEntry.getAlleleAverageQuality(altAllele)));
				vcfWriter.append(':');
				vcfWriter.append(String.valueOf(pileupEntry.getReadDepth()));
				vcfWriter.append(':');
				vcfWriter.append(String.valueOf(pileupEntry.getMinimumBaseQuality()));
				vcfWriter.append('\n');
				
			}
		}
		
		vcfWriter.close();
	
    }

	/**
	* Parse pileup file and create vcf files with only SNPs that are in referenceGenotype data. VCF only contains the counts for the 2 alleles in the reference genotype data. Only parses the first samples in the pileup file
	* 
	* @param pileupFile
	* @param sample
	* @param outputVcf
	* @param referenceGenotypes
	* @throws IOException 
	*/
    public static void convertPileupToVcf2(final PileupFile pileupFile, final String sample, final File outputVcf, final RandomAccessGenotypeData referenceGenotypes, final int minBaseQuality) throws IOException {
        
		HashMap> variantsAlleles = new HashMap>();
		
		for(GeneticVariant variant : referenceGenotypes){
			
			TIntObjectMap chrVariantAlleles = variantsAlleles.get(variant.getSequenceName());
			if(chrVariantAlleles == null){
				chrVariantAlleles = new TIntObjectHashMap();
				variantsAlleles.put(variant.getSequenceName(), chrVariantAlleles);
			}
			chrVariantAlleles.put(variant.getStartPos(), variant.getVariantAlleles());
			
		}
		
		
		final BufferedWriter vcfWriter = new BufferedWriter(new FileWriter(outputVcf));
			
		vcfWriter.append("##fileformat=VCFv4.2\n");
		vcfWriter.append("##source=ConvertPileupToVcf version: " + VERSION + "\n");
		vcfWriter.append("##pileupFile=" + pileupFile.getAbsolutePath() + "\n");
		vcfWriter.append("##minBaseQuality=" + minBaseQuality + "\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("##FORMAT=\n");
		vcfWriter.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sample + "\n");
		
		for(PileupEntry pileupEntry : pileupFile){
			
			Alleles variantAlleles = null;
			TIntObjectMap chrVariantAlleles = variantsAlleles.get(pileupEntry.getChr());
			if(chrVariantAlleles != null){
				variantAlleles = chrVariantAlleles.get(pileupEntry.getPos());
			}
			
			
			if(variantAlleles != null){
				
				Allele refAllele = variantAlleles.get(0);
				Allele altAllele = variantAlleles.get(1);
				
				int refCount = pileupEntry.getAlleleCount(refAllele);
				int altCount = pileupEntry.getAlleleCount(altAllele);
				
				if(refCount == 0 && altCount == 0){
					continue;
				}
			
				vcfWriter.append(pileupEntry.getChr());
				vcfWriter.append('\t');
				vcfWriter.append(String.valueOf(pileupEntry.getPos()));
				vcfWriter.append('\t');
				vcfWriter.append('.');
				vcfWriter.append('\t');
				vcfWriter.append(refAllele.getAlleleAsString());
				vcfWriter.append('\t');
				vcfWriter.append(altAllele.getAlleleAsString());
				vcfWriter.append("\t.\tPASS\t.\tAD:RQ:TD:ADQ\t");
				vcfWriter.append(String.valueOf(refCount));
				vcfWriter.append(',');
				vcfWriter.append(String.valueOf(altCount));
				vcfWriter.append(':');
				vcfWriter.append(String.valueOf(pileupEntry.getAlleleAverageQuality(refAllele)));
				vcfWriter.append(',');
				vcfWriter.append(String.valueOf(pileupEntry.getAlleleAverageQuality(altAllele)));
				vcfWriter.append(':');
				vcfWriter.append(String.valueOf(pileupEntry.getReadDepth()));
				vcfWriter.append(':');
				vcfWriter.append(String.valueOf(pileupEntry.getMinimumBaseQuality()));
				vcfWriter.append('\n');
				
			}
		}
		
		vcfWriter.close();
	
    }

	
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy