
eqtlmappingpipeline.ase.AseConfiguration Maven / Gradle / Ivy
The newest version!
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package eqtlmappingpipeline.ase;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.molgenis.genotype.GenotypeDataException;
import org.molgenis.genotype.RandomAccessGenotypeDataReaderFormats;
/**
*
* @author Patrick Deelen
*/
public class AseConfiguration {
private static final Logger LOGGER;
private static final Options OPTIONS;
public static final String ENCODING = "UTF-8";
private final List inputFiles;
private final File outputFolder;
private final int minTotalReads;
private final int minAlleleReads;
private final double minAlleleReadFraction;
private final File logFile;
private final boolean debugMode;
private final int minSamples;
private final int threads;
private final String[] refBasePaths;
private final RandomAccessGenotypeDataReaderFormats refDataType;
private final int refDataCacheSize;
private final int maxTotalReads;
private final File gtf;
private final File sampleToRefSampleFile;
private final String chrFilter;
private final int chunkSize;
private final File mappabilityTrackFile;
private final double mappabilityMinimum;
static {
LOGGER = Logger.getLogger(AseConfiguration.class);
OPTIONS = new Options();
OptionBuilder.withArgName("path");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Paths to one or more vcf.gz files or folders with each per chr 1 vcf.gz file. Tabix file must be present");
OptionBuilder.withLongOpt("input");
OPTIONS.addOption(OptionBuilder.create('i'));
OptionBuilder.withArgName("path");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("One or more text files with on each line a path to a vcf.gz file or a folder with per chr 1 vcf.gz file. Tabix file must be present. Can be combined with --input");
OptionBuilder.withLongOpt("inputList");
OPTIONS.addOption(OptionBuilder.create('l'));
OptionBuilder.withArgName("path");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Path to output folder");
OptionBuilder.withLongOpt("output");
OptionBuilder.isRequired();
OPTIONS.addOption(OptionBuilder.create('o'));
OptionBuilder.withArgName("int");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Min number total reads for genotype");
OptionBuilder.withLongOpt("minReads");
OptionBuilder.isRequired();
OPTIONS.addOption(OptionBuilder.create('r'));
OptionBuilder.withArgName("int");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Min number of reads per allele");
OptionBuilder.withLongOpt("minAlleleReads");
OptionBuilder.isRequired();
OPTIONS.addOption(OptionBuilder.create('a'));
OptionBuilder.withArgName("int");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Min percentage of read per allele. Default 0. Can be used in combination --minAlleleReads");
OptionBuilder.withLongOpt("minAllelePercentage");
OPTIONS.addOption(OptionBuilder.create('p'));
OptionBuilder.withArgName("int");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Min number of samples per ASE effect");
OptionBuilder.withLongOpt("minNumSamples");
OptionBuilder.isRequired();
OPTIONS.addOption(OptionBuilder.create('s'));
OptionBuilder.withArgName("int");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("Maximum number of threads to start for parallel reading of multiple VCF files. Defaults to number of cores");
OptionBuilder.withLongOpt("threads");
OPTIONS.addOption(OptionBuilder.create('t'));
OptionBuilder.withArgName("basePath");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("The path to the reference genotypes. These genotypes will be used to determine if a sample is hetrozygous");
OptionBuilder.withLongOpt("genotypes");
OPTIONS.addOption(OptionBuilder.create("g"));
OptionBuilder.withArgName("type");
OptionBuilder.hasArg();
OptionBuilder.withDescription("The input data type. If not defined will attempt to automatically select the first matching dataset on the specified path\n"
+ "* PED_MAP - plink PED MAP files.\n"
+ "* PLINK_BED - plink BED BIM FAM files.\n"
+ "* VCF - bgziped vcf with tabix index file\n"
+ "* VCFFOLDER - matches all bgziped vcf files + tabix index in a folder\n"
+ "* SHAPEIT2 - shapeit2 phased haplotypes .haps & .sample\n"
+ "* GEN - Oxford .gen & .sample\n"
+ "* TRITYPER - TriTyper format folder");
OptionBuilder.withLongOpt("genotypesType");
OPTIONS.addOption(OptionBuilder.create("G"));
OptionBuilder.withArgName("int");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Reference genotype data cache. Trade memory usage for speed");
OptionBuilder.withLongOpt("cache");
OPTIONS.addOption(OptionBuilder.create("c"));
OptionBuilder.withArgName("path");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Tab separated file with column 1 sample ID and column 2 sample ID in reference genotype data, no header. If only contains a mapping for a subset of samples the original identifier in the reference is used");
OptionBuilder.withLongOpt("sampleCoupling");
OPTIONS.addOption(OptionBuilder.create("sc"));
OptionBuilder.withArgName("path");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Optional .gtf file for annotations of ASE effects. Must be grouped by chromosome");
OptionBuilder.withLongOpt("gtf");
OPTIONS.addOption(OptionBuilder.create("f"));
OptionBuilder.withArgName("int");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Maximum number of total reads");
OptionBuilder.withLongOpt("maxReads");
OPTIONS.addOption(OptionBuilder.create("m"));
OptionBuilder.withArgName("boolean");
OptionBuilder.withDescription("Activate debug mode. This will result in a more verbose log file");
OptionBuilder.withLongOpt("debug");
OPTIONS.addOption(OptionBuilder.create('d'));
OptionBuilder.withArgName("string");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Confine analysis to specified chromosome");
OptionBuilder.withLongOpt("chr");
OPTIONS.addOption(OptionBuilder.create("ch"));
OptionBuilder.withArgName("int");
OptionBuilder.hasArg();
OptionBuilder.withDescription("In the case of reference genotype data anayslis can be faster when chucked");
OptionBuilder.withLongOpt("chunkSize");
OPTIONS.addOption(OptionBuilder.create("cs"));
OptionBuilder.withArgName("path");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Mappability track in BedGraph format.");
OptionBuilder.withLongOpt("mappabilityTrack");
OPTIONS.addOption(OptionBuilder.create("mt"));
OptionBuilder.withArgName("int");
OptionBuilder.hasArg();
OptionBuilder.withDescription("Minimum mappability to include ASE in results in Mb.");
OptionBuilder.withLongOpt("mappabilityMinimum");
OPTIONS.addOption(OptionBuilder.create("mm"));
}
public AseConfiguration(String... args) throws ParseException {
final CommandLine commandLine = new PosixParser().parse(OPTIONS, args, true);
if (!commandLine.hasOption('i') && !commandLine.hasOption('l')) {
throw new ParseException("At least --input or --inputList need to be supplied");
}
ArrayList inputFilesTmp = new ArrayList();
if (commandLine.hasOption('i')) {
String[] inputPaths = commandLine.getOptionValues('i');
for (String inputPath : inputPaths) {
inputFilesTmp.add(new File(inputPath));
}
}
if (commandLine.hasOption('l')) {
String[] inputListFilePaths = commandLine.getOptionValues('l');
for (String inputListFilePath : inputListFilePaths) {
try {
BufferedReader inputListFileReader = new BufferedReader(new InputStreamReader(new FileInputStream(inputListFilePath), ENCODING));
String line;
while ((line = inputListFileReader.readLine()) != null) {
inputFilesTmp.add(new File(line));
}
} catch (FileNotFoundException ex) {
throw new ParseException("Could not find file with list of input files: " + inputListFilePath);
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException("Fatal error", ex);
} catch (IOException ex) {
throw new ParseException("Error reading list of input files: " + inputListFilePath);
}
}
}
inputFiles = Collections.unmodifiableList(inputFilesTmp);
outputFolder = new File(commandLine.getOptionValue('o'));
logFile = new File(outputFolder, "ase.log");
try {
minTotalReads = Integer.parseInt(commandLine.getOptionValue('r'));
if (minTotalReads <= 0) {
throw new ParseException("--minReads must be larger than 0");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --minReads \"" + commandLine.getOptionValue('r') + "\" is not an int");
}
try {
minAlleleReads = Integer.parseInt(commandLine.getOptionValue('a'));
if (minAlleleReads < 0) {
throw new ParseException("--minAlleleReads must be positive");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --minAlleleReads \"" + commandLine.getOptionValue('a') + "\" is not an int");
}
if (commandLine.hasOption('p')) {
try {
minAlleleReadFraction = Double.parseDouble(commandLine.getOptionValue('p')) / 100;
if (minAlleleReadFraction < 0 || minAlleleReadFraction >= 0.5) {
throw new ParseException("--minAllelePercentage must be in interval [0, 50)");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --minAllelePercentage \"" + commandLine.getOptionValue('p') + "\" is not a double");
}
} else {
minAlleleReadFraction = 0;
}
try {
minSamples = Integer.parseInt(commandLine.getOptionValue('s'));
if (minSamples <= 0) {
throw new ParseException("--minNumSamples must be larger than 0");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --minNumSamples \"" + commandLine.getOptionValue('s') + "\" is not an int");
}
int availCores = Runtime.getRuntime().availableProcessors();
if (commandLine.hasOption('t')) {
try {
int threadOption = Integer.parseInt(commandLine.getOptionValue('t'));
if (threadOption <= 0) {
throw new ParseException("--threads must be larger than 0");
}
threads = threadOption > availCores ? availCores : threadOption;
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --threads \"" + commandLine.getOptionValue('t') + "\" is not an int");
}
} else {
threads = availCores;
}
if (commandLine.hasOption('c')) {
try {
refDataCacheSize = Integer.parseInt(commandLine.getOptionValue('c'));
if (refDataCacheSize < 0) {
throw new ParseException("--cache must be positive");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --cache \"" + commandLine.getOptionValue('c') + "\" is not an int");
}
} else {
refDataCacheSize = 1000;
}
if (commandLine.hasOption('g')) {
refBasePaths = commandLine.getOptionValues('g');
try {
if (commandLine.hasOption('G')) {
refDataType = RandomAccessGenotypeDataReaderFormats.valueOf(commandLine.getOptionValue('G').toUpperCase());
} else {
if (refBasePaths[0].endsWith(".vcf")) {
throw new ParseException("Only vcf.gz is supported. Please see manual on how to do create a vcf.gz file.");
}
try {
refDataType = RandomAccessGenotypeDataReaderFormats.matchFormatToPath(refBasePaths[0]);
} catch (GenotypeDataException e) {
throw new ParseException("Unable to determine reference data type based on specified path. Please specify --genotypesType");
}
}
} catch (IllegalArgumentException e) {
throw new ParseException("Error parsing --genotypesType \"" + commandLine.getOptionValue('G') + "\" is not a valid input data format");
}
} else {
refBasePaths = null;
refDataType = null;
}
if (commandLine.hasOption('m')) {
try {
maxTotalReads = Integer.parseInt(commandLine.getOptionValue('m'));
if (maxTotalReads <= 0) {
throw new ParseException("--maxReads must be larger than 0");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --maxReads \"" + commandLine.getOptionValue('m') + "\" is not an int");
}
} else {
maxTotalReads = Integer.MAX_VALUE;
}
if (commandLine.hasOption('f')) {
String gencodeGtfPath = commandLine.getOptionValue('f');
gtf = new File(gencodeGtfPath);
} else {
gtf = null;
}
if (commandLine.hasOption("sc")) {
sampleToRefSampleFile = new File(commandLine.getOptionValue("sc"));
} else {
sampleToRefSampleFile = null;
}
chrFilter = commandLine.hasOption("ch") ? commandLine.getOptionValue("ch") : null;
if (commandLine.hasOption("cs")) {
try {
chunkSize = Integer.parseInt(commandLine.getOptionValue("cs")) * 1000000;
if (chunkSize <= 0) {
throw new ParseException("--chunkSize must be larger than 0");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --chunkSize \"" + commandLine.getOptionValue("cs") + "\" is not an int");
}
} else {
chunkSize = Integer.MAX_VALUE;
}
if(commandLine.hasOption("mt")){
if(!commandLine.hasOption("mm")){
throw new ParseException("--mappabilityTrack is set but --mappabilityMinimum is not");
}
mappabilityTrackFile = new File(commandLine.getOptionValue("mt"));
try {
mappabilityMinimum = Integer.parseInt(commandLine.getOptionValue("mm"));
if (mappabilityMinimum < 0 || mappabilityMinimum > 1) {
throw new ParseException("--mappabilityMinimum must be between 0 and 1");
}
} catch (NumberFormatException e) {
throw new ParseException("Error parsing --mappabilityMinimum \"" + commandLine.getOptionValue("mm") + "\" is not a double");
}
} else {
mappabilityTrackFile = null;
mappabilityMinimum = 0;
}
debugMode = commandLine.hasOption('d');
}
public void printOptions() {
System.out.println("Interpreted arguments: ");
System.out.println(" - Input files or folders (" + Ase.DEFAULT_NUMBER_FORMATTER.format(inputFiles.size()) + " in total): ");
if (inputFiles.size() > 5) {
System.out.println(" * Reading more than 5 files. See log file for list.");
} else {
for (File inputFile : inputFiles) {
System.out.println(" * " + inputFile.getAbsolutePath());
}
}
LOGGER.info("Input files or folders (" + Ase.DEFAULT_NUMBER_FORMATTER.format(inputFiles.size()) + " in total): ");
for (File inputFile : inputFiles) {
LOGGER.info(" * " + inputFile.getAbsolutePath());
}
System.out.println(" - Output folder: " + outputFolder.getAbsolutePath());
LOGGER.info("Output folder: " + outputFolder.getAbsolutePath());
System.out.println(" - Minimum number of reads per genotype: " + minTotalReads);
LOGGER.info("Minimum number of reads per genotype: " + minTotalReads);
System.out.println(" - Minimum number of reads per allele: " + minAlleleReads);
LOGGER.info("Minimum number of reads per allele: " + minAlleleReads);
System.out.println(" - Minimum percentage of reads per allele: " + minAlleleReadFraction * 100 + "%");
LOGGER.info("Minimum percentage of reads per allele: " + minAlleleReadFraction * 100 + "%");
if (maxTotalReads != Integer.MAX_VALUE) {
System.out.println(" - Maximum number of reads per genotype: " + maxTotalReads);
LOGGER.info("Maximum number of reads per genotype: " + maxTotalReads);
}
System.out.println(" - Minimum number of samples per ASE effect: " + minSamples);
LOGGER.info("Minimum number of samples per ASE effect: " + minSamples);
System.out.println(" - Number of threads to use: " + threads);
LOGGER.info("Number of threads to use: " + threads);
if (chrFilter != null) {
System.out.println(" - Confine analysis to chr: " + chrFilter);
LOGGER.info("Confine analysis to chr: " + chrFilter);
}
if (isRefSet()) {
System.out.print(" - Reference genotypes " + refDataType.getName() + ":");
LOGGER.info("Reference genotypes " + refDataType.getName() + ":");
for (String path : refBasePaths) {
System.out.print(" " + path);
LOGGER.info(" " + path);
}
System.out.println();
System.out.println(" - Reference genotype cache size: " + Ase.DEFAULT_NUMBER_FORMATTER.format(refDataCacheSize));
LOGGER.info("Reference genotype cache size: " + Ase.DEFAULT_NUMBER_FORMATTER.format(refDataCacheSize));
if (isSampleToRefSampleFileSet()) {
System.out.println(" - Sample mapping to reference file: " + sampleToRefSampleFile.getAbsolutePath());
LOGGER.info("Sample mapping to reference file: " + sampleToRefSampleFile.getAbsolutePath());
}
}
if (isGtfSet()) {
System.out.println(" - GTF file: " + gtf.getAbsolutePath());
LOGGER.info("GTF file: " + gtf.getAbsolutePath());
}
if(chunkSize != Integer.MAX_VALUE){
System.out.println(" - Chunk size: " + Ase.DEFAULT_NUMBER_FORMATTER.format(chunkSize));
LOGGER.info("Chunk size: " + Ase.DEFAULT_NUMBER_FORMATTER.format(chunkSize));
}
if(isMappabilityTrackSet()){
System.out.println(" - Mappability track: " + mappabilityTrackFile.getAbsolutePath());
LOGGER.info("Mappability track: " + mappabilityTrackFile.getAbsolutePath());
System.out.println(" - Mappability minimum: " + mappabilityMinimum);
LOGGER.info("Mappability mimimum: " + mappabilityMinimum);
}
System.out.println();
System.out.flush(); //flush to make sure config is before errors
try {
Thread.sleep(25); //Allows flush to complete
} catch (InterruptedException ex) {
}
}
public static void printHelp() {
new HelpFormatter().printHelp(" ", OPTIONS);
}
public List getInputFiles() {
return inputFiles;
}
public File getOutputFolder() {
return outputFolder;
}
public int getMinTotalReads() {
return minTotalReads;
}
public int getMinAlleleReads() {
return minAlleleReads;
}
public File getLogFile() {
return logFile;
}
public boolean isDebugMode() {
return debugMode;
}
public int getMinSamples() {
return minSamples;
}
public int getThreads() {
return threads;
}
public String[] getRefBasePaths() {
return refBasePaths;
}
public RandomAccessGenotypeDataReaderFormats getRefDataType() {
return refDataType;
}
public boolean isRefSet() {
return refBasePaths != null;
}
public int getRefDataCacheSize() {
return refDataCacheSize;
}
public File getGtf() {
return gtf;
}
public int getMaxTotalReads() {
return maxTotalReads;
}
public boolean isGtfSet() {
return gtf != null;
}
public double getMinAlleleReadFraction() {
return minAlleleReadFraction;
}
public File getSampleToRefSampleFile() {
return sampleToRefSampleFile;
}
public boolean isSampleToRefSampleFileSet() {
return sampleToRefSampleFile != null;
}
public String getChrFilter() {
return chrFilter;
}
public boolean isMappabilityTrackSet(){
return mappabilityTrackFile != null;
}
public int getChunkSize() {
return chunkSize;
}
public File getMappabilityTrackFile() {
return mappabilityTrackFile;
}
public double getMappabilityMinimum() {
return mappabilityMinimum;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy