// net.maizegenetics.pangenome.smallseq.CreateTestGenomes
// PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.smallseq;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import org.apache.commons.math3.distribution.PoissonDistribution;
import java.io.BufferedWriter;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.*;
import static net.maizegenetics.pangenome.smallseq.SmallSeqPaths.*;
public class CreateTestGenomes {
private static Range<Integer> geneRangeWithoutGBS=Range.open(10,20);
private static Random random = new Random(0);
private static double sequencingErrorRate=0.01;
public static String getHostIpAddr() {
// Adapted from the accepted answer here:
// https://stackoverflow.com/questions/9481865/getting-the-ip-address-of-the-current-machine-using-java
try (Socket socket = new Socket()) {
socket.connect(new InetSocketAddress("google.com", 80));
String ip = socket.getLocalAddress().toString();
// InetAddress.toString() is of the form "hostname/literal-IP"; the local address
// has no resolved hostname here, so the string begins with "/", which we strip.
return ip.substring(1);
} catch (Exception exc) {
throw new IllegalStateException("getHostIpAddr: ERROR getting host ip address: " + exc.getMessage(), exc);
}
}
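// A minimal alternative sketch (hypothetical, not used by the pipeline above):
// InetAddress.getHostAddress() returns the literal IP directly, avoiding the
// substring on toString()'s "hostname/literal-IP" form.
private static String getHostIpAddrDirect() {
try (Socket socket = new Socket()) {
socket.connect(new InetSocketAddress("google.com", 80));
return socket.getLocalAddress().getHostAddress();
} catch (Exception exc) {
throw new IllegalStateException("getHostIpAddrDirect: ERROR getting host ip address: " + exc.getMessage(), exc);
}
}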
public static void create(int lengthOfGenes, int lengthOfInterGenes, double proportionOfRefInterGeneDuplicated,
double proportionOfInterGeneDeleted, int numberOfGenes, int lengthOfReads,
int haplotypeDivergence, int intraHaplotypeDivergence, double wgsDepth, double gbsDepth, double insertProportion) throws IOException {
System.out.println("Clearing the directory " + answerDir);
Files.createDirectories(Paths.get(answerDir));
Files.createDirectories(Paths.get(dataDir));
Files.createDirectories(Paths.get(refGenomeDir));
Files.createDirectories(Paths.get(postgresDockerDir));
Files.createDirectories(Paths.get(genotypingDebugDir));
// Create the genomes
Map<String, String> tnameSequenceMap = createGenomes(numberOfGenes, lengthOfGenes, lengthOfInterGenes,
proportionOfRefInterGeneDuplicated, haplotypeDivergence, intraHaplotypeDivergence, proportionOfInterGeneDeleted, insertProportion);
// create postgres Dockerfile
// We use a Dockerfile so that smallseq can know and change the image name
try (BufferedWriter bw = Utils.getBufferedWriter(postgresDockerfile)) {
bw.write("FROM postgres:12.9-alpine\n");
bw.write("# No entrypoint is specified. Default to the entry point from the postgres docker.\n");
} catch (IOException ioe) {
ioe.printStackTrace();
}
// Create db config file
try (BufferedWriter bw = Utils.getBufferedWriter(dbConfigFile)) {
StringBuilder sb = new StringBuilder();
sb.append("host=localHost\n");
sb.append("user=sqlite\n");
sb.append("password=sqlite\n");
sb.append("DB=").append(phgDBName).append("\n");
sb.append("DBtype=sqlite\n");
sb.append("minTaxa=1\n");
sb.append("minSites=5\n");
bw.write(sb.toString());
} catch (IOException ioe) {
ioe.printStackTrace();
}
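// A sketch of the config file written above (DB expands to phgDBName):
//   host=localHost
//   user=sqlite
//   password=sqlite
//   DB=<phgDBName>
//   DBtype=sqlite
//   minTaxa=1
//   minSites=5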
// Create db config file for docker. DB lives in a different place
String[] splitNameString = phgDBName.split("/");
String phgNameOnly = splitNameString[splitNameString.length - 1];
try (BufferedWriter bw = Utils.getBufferedWriter(dbDockerConfigFile)) {
StringBuilder sb = new StringBuilder();
sb.append("# db connection parameters\n");
sb.append("host=localHost\n");
sb.append("user=sqlite\n");
sb.append("password=sqlite\n");
sb.append("DB=/phg/").append(phgNameOnly).append("\n");
sb.append("DBtype=sqlite\n");
sb.append("#Liquibase output directory\n");
sb.append("liquibaseOutdir=/phg/outputDir\n");
// Populate the files needed for Assembly by AnchorWave
sb.append("AssemblyMAFFromAnchorWavePlugin.outputDir=").append(alignDirDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.keyFile=").append(asmMAFKeyFileDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.gffFile=").append(anchorGFFFileBaseDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.refFasta=").append(refGenomePathDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.threadsPerRun=4").append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.numRuns=1").append("\n");
// Add the bedFile needed for LoadHaplotypesFromGVCFPlugin. It was not previously needed,
// as the WGS path handled this from groovy scripts, which create a bedfile from the db
// and pass it in to the plugin.
sb.append("bedFile="+anchorBedFileBaseDocker+"\n");
sb.append("# Haplotype creation parameters\n");
sb.append("referenceFasta="+refGenomeDirDocker+"Ref.fa\n");
sb.append("refServerPath=irods:/path/to/reference\n");
// "anchors" is needed for LoadAllIntervalsToPHGDBPlugin
sb.append("anchors="+anchorBedFileBaseDocker+"\n");
sb.append("genomeData=/"+refLoadDataFileDocker+"\n");
sb.append("consensusMethodName=CONSENSUS\n");
sb.append("asmMethodName=mummer4\n");
sb.append("asmKeyFile="+ asmKeyFileDocker + "\n");
sb.append("AssemblyHaplotypesMultiThreadPlugin.isTestMethod=false\n"); // make true if want test method type
sb.append("outputDir="+alignDirDocker+"\n");
sb.append("gvcfOutputDir="+alignGVCFDirDocker+"\n");
sb.append("wgsMethodName=GATK_PIPELINE\n");
sb.append("haplotypeMethodName=GATK_PIPELINE\n");
sb.append("LoadHaplotypesFromGVCFPlugin.isTestMethod=false\n");// make true if want test method type
sb.append("wgsKeyFile="+keyFileDocker+"\n");
sb.append("inputConsensusMethods=GATK_PIPELINE\n");
sb.append("gvcfDir="+gvcfDirDocker+"\n");
sb.append("PopulatePHGDBPipelinePlugin.gvcfServerPath=localHost;"+baseGVCFDirDocker+"\n"); // upload gvcfs: LoadHaplotypesFromGVCFPlugin needs this in its keyfile
sb.append("localGVCFFolder="+localGvcfDirDocker+"\n"); // download gvcfs: for HaplotypeGraphBuilderPlugin and HaplotypeGraphStreamBuilderPlugin - they download gvcfs
sb.append("extendedWindowSize=0\n");
sb.append("GQ_min=50\n");
// sb.append("QUAL_min=200\n");
sb.append("DP_poisson_min=.01\n");
sb.append("DP_poisson_max=.99\n");
sb.append("filterHets=true\n");
sb.append("numThreads=5\n");
sb.append("# HapCountBestPathToTextPlugin parameters\n");
sb.append("maxNodesPerRange=30\n");
sb.append("minTaxaPerRange=1\n");
sb.append("minReads=0\n");
sb.append("maxGBSReads=1000\n");
sb.append("minTransitionProb=0.001\n");
sb.append("probReadMappedCorrectly=0.99\n");
sb.append("emissionMethod=allCounts\n");
sb.append("splitTaxa=true\n");
sb.append("# RunHapCollapsePipelinePlugin parameters\n");
sb.append("minTaxa=2\n");
sb.append("minSites=20\n");
//sb.append("method=coverage\n");
//sb.append("method=upgma\n");
sb.append("clusteringMode=upgma_assembly\n");// lcj - upgma will not work with gvcf assemblies
sb.append("rankingFile="+rankingFileDocker + "\n");
sb.append("includeVariants=true\n"); // lcj added for phg205 List version of variants
sb.append("mxDiv=0.005\n");
sb.append("maxError=0.2\n");
sb.append("useDepth=false\n");
sb.append("replaceNsWithMajor=false\n");
sb.append("exportMergedVCF=/tempFileDir/data/outputs/mergedVCFs/\n");
//sb.append("assemblyMethod=lynnMummer4"); // lcj - here for testing , change if you want a different method for assemblies than mummer4
// sb.append("exclusionString= FORMAT/GQ<50 || QUAL<200 || (FORMAT/AD[0]>0 && FORMAT/AD[1]>0) || (FORMAT/AD[0]>0 && FORMAT/AD[2]>0) || (FORMAT/AD[1]>0 && FORMAT/AD[2]>0)").append("\n");
bw.write(sb.toString());
} catch (IOException ioe) {
ioe.printStackTrace();
}
// Get the system IP address for postgres use. For postgres, we access the db through a
// postgres docker, and it will not accept 0.0.0.0 or localhost - it needs the
// actual ip address
String ipfinal = getHostIpAddr();
// Create postgres docker config file
try (BufferedWriter bw = Utils.getBufferedWriter(dbDockerPostgresConfigFile)) {
StringBuilder sb = new StringBuilder();
sb.append("# db connection parameters\n");
sb.append("host=").append(ipfinal).append(":").append(postgresPort).append("\n"); // postgres docker container has port 5433 mapped to 5432
sb.append("user=").append(postgresUser).append("\n");
sb.append("password=").append(postgresUserPwd).append("\n");
sb.append("DB=").append(phgDBNamePostgres).append("\n");
sb.append("DBtype=postgres\n");
sb.append("#Liquibase output directory\n");
sb.append("liquibaseOutdir=/phg/outputDir\n");
// Populate the files needed for Assembly by AnchorWave
sb.append("AssemblyMAFFromAnchorWavePlugin.outputDir=").append(alignDirDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.keyFile=").append(asmMAFKeyFileDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.gffFile=").append(anchorGFFFileBaseDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.refFasta=").append(refGenomePathDocker).append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.threadsPerRun=4").append("\n");
sb.append("AssemblyMAFFromAnchorWavePlugin.numRuns=1").append("\n");
// Needed for loading ASM haplotypes - WGS data are loaded from Groovy, which makes
// its own bedFile and passes it on the command line
sb.append("bedFile="+anchorBedFileBaseDocker+"\n");
sb.append("# Haplotype creation parameters\n");
sb.append("referenceFasta="+refGenomeDirDocker+"Ref.fa\n");
sb.append("refServerPath=irods:/path/to/reference\n");
// "anchors" is needed for LoadAllIntervalsToPHGDBPlugin
sb.append("anchors="+anchorBedFileBaseDocker+"\n");
sb.append("genomeData=/"+refLoadDataFileDocker+"\n");
sb.append("consensusMethodName=CONSENSUS\n");
sb.append("asmMethodName=mummer4\n");
sb.append("asmKeyFile="+ asmKeyFileDocker + "\n");
sb.append("AssemblyHaplotypesMultiThreadPlugin.isTestMethod=false\n"); // make true if want test method type
sb.append("outputDir="+alignDirDocker+"\n");
sb.append("gvcfOutputDir="+alignGVCFDirDocker+"\n");
sb.append("wgsMethodName=GATK_PIPELINE\n");
sb.append("haplotypeMethodName=GATK_PIPELINE\n");
sb.append("LoadHaplotypesFromGVCFPlugin.isTestMethod=false\n");// make true if want test method type
sb.append("wgsKeyFile="+keyFileDocker+"\n");
sb.append("inputConsensusMethods=GATK_PIPELINE\n");
sb.append("gvcfDir="+gvcfDirDocker+"\n");
sb.append("PopulatePHGDBPipelinePlugin.gvcfServerPath=localHost;"+baseGVCFDirDocker+"\n"); // upload gvcfs: LoadHaplotypesFromGVCFPlugin needs this in its keyfile
sb.append("localGVCFFolder="+localGvcfDirDocker+"\n"); // download gvcfs: for HaplotypeGraphBuilderPlugin and HaplotypeGraphStreamBuilderPlugin - they download gvcfs
sb.append("extendedWindowSize=0\n");
sb.append("GQ_min=50\n");
// sb.append("QUAL_min=200\n");
sb.append("DP_poisson_min=.01\n");
sb.append("DP_poisson_max=.99\n");
sb.append("filterHets=true\n");
sb.append("numThreads=5\n");
sb.append("# HapCountBestPathToTextPlugin parameters\n");
sb.append("maxNodesPerRange=30\n");
sb.append("minTaxaPerRange=1\n");
sb.append("minReads=0\n");
sb.append("maxGBSReads=1000\n");
sb.append("minTransitionProb=0.001\n");
sb.append("probReadMappedCorrectly=0.99\n");
sb.append("emissionMethod=allCounts\n");
sb.append("splitTaxa=true\n");
sb.append("# RunHapCollapsePipelinePlugin parameters\n");
sb.append("minTaxa=2\n");
sb.append("minSites=20\n");
//sb.append("method=coverage\n");
//sb.append("method=upgma\n");
sb.append("clusteringMode=upgma_assembly\n");// lcj - upgma will not work with gvcf assemblies
sb.append("rankingFile="+rankingFileDocker + "\n");
sb.append("includeVariants=true\n"); // lcj added for phg205 List version of variants
sb.append("mxDiv=0.005\n");
sb.append("maxError=0.2\n");
sb.append("useDepth=false\n");
sb.append("replaceNsWithMajor=false\n");
sb.append("exportMergedVCF=/tempFileDir/data/outputs/mergedVCFs/\n");
//sb.append("assemblyMethod=lynnMummer4"); // lcj - here for testing , change if you want a different method for assemblies than mummer4
// sb.append("exclusionString= FORMAT/GQ<50 || QUAL<200 || (FORMAT/AD[0]>0 && FORMAT/AD[1]>0) || (FORMAT/AD[0]>0 && FORMAT/AD[2]>0) || (FORMAT/AD[1]>0 && FORMAT/AD[2]>0)").append("\n");
bw.write(sb.toString());
} catch (IOException ioe) {
ioe.printStackTrace();
}
System.out.println("Created sequences and VCF :" + vcfFile);
//Deletions are retained for the alignment process
ExportUtils.writeToVCF(makeGenotypeTable(tnameSequenceMap), vcfFile, false);
try(BufferedWriter rankingFileWriter = Utils.getBufferedWriter(rankingFile)) {
// these are arbitrary rankings - the last taxon in the list is ranked with highest priority.
// This allows additional taxa to be added or removed without changes to this code.
int count=0;
for (Taxon taxon : taxaList) {
//Write to the ranking file
String name = taxon.getName();
count++;
StringBuilder sb = new StringBuilder();
sb.append(name);
sb.append("\t");
sb.append(count);
sb.append("\n");
rankingFileWriter.write(sb.toString());
}
}
try(BufferedWriter keyFileWriter = Utils.getBufferedWriter(keyFile)) {
keyFileWriter.write("sample_name\tsample_description\tfiles\ttype\tchrPhased\tgenePhased\tphasingConf\tlibraryID\tgvcfServerPath\n");
for (Taxon taxon : taxaList) {
//Write to the keyFile
String name = taxon.getName();
StringBuilder sb = new StringBuilder();
sb.append(name);
sb.append("\t");
sb.append(name);
sb.append(" line aligned\t");
sb.append(name);
sb.append("_R1.fastq\t");
sb.append("FASTQ\t");
sb.append("true\ttrue\t.99\tdummyLib1\t");
sb.append(gvcfServerPath);
sb.append("\n");
keyFileWriter.write(sb.toString());
}
}
catch (IOException e) {
e.printStackTrace();
}
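// Sketch of one resulting keyfile row (tab-separated) for a hypothetical taxon "LineA",
// following the header written above:
//   LineA  LineA line aligned  LineA_R1.fastq  FASTQ  true  true  .99  dummyLib1  <gvcfServerPath>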
try(BufferedWriter genotypingKeyFileWriter = Utils.getBufferedWriter(genotypingKeyFile);
BufferedWriter assemblyKeyFileWriter = Utils.getBufferedWriter(asmKeyFile);
BufferedWriter assemblyMAFKeyFileWriter = Utils.getBufferedWriter(asmMAFKeyFile)) {
genotypingKeyFileWriter.write("cultivar\tflowcell_lane\tfilename\tPlateID\n");
assemblyKeyFileWriter.write("AssemblyServerDir\tRefDir\tRefFasta\tAssemblyDir\tAssemblyGenomeFasta\tAssemblyFasta\tAssemblyDBName\tChromosome\n");
assemblyMAFKeyFileWriter.write("AssemblyServerDir\tAssemblyGenomeFasta\tAssemblyDir\tAssemblyFasta\tAssemblyDBName\tDescription\n");
tnameSequenceMap.forEach((name, seqWithGaps) -> {
//Remove the deletions from the sequence
String gapSeq = seqWithGaps.replace("-", "");
// Add insertions: replace "+" with string of 10 C's
String seq = gapSeq.replace("+", "CCCCCCCCCC");
try (BufferedWriter bw = Utils.getBufferedWriter(answerDir + name + fastaSuffix)) {
bw.write(">1\n"); // creates fastas for all taxa, including ref. Only chrom 1
bw.write(seq);
bw.write("\n");
} catch (IOException e) {
e.printStackTrace();
}
//Copy the assembly fasta to the correct mounted directory
try {
Files.copy(Paths.get(answerDir + name + fastaSuffix), Paths.get(assemblyDir + name + fastaSuffix), StandardCopyOption.REPLACE_EXISTING);
}catch (IOException e) {
throw new IllegalStateException("Unable to copy assembly" + name + fastaSuffix,e);
}
//Write the entry to the keyfile
try {
// NOTE: name + fastaSuffix appears twice below. The first occurrence is the
// AssemblyGenomeFasta, the second is the AssemblyFasta (per-chrom fasta).
// In a real setting these two files would differ. For SmallSeq tests they are
// the same, as we only have 1 chromosome, so one file doubles as the full fasta.
//
// The "Description" field of the new asm key file is added for anchorwave MAFs and is optional.
// If present, this field will be used in the PopulatePHGDBPipelinePlugin as the data for
// column "sample data" of the generated gvcf key file. If it is not present, PopulatePHGDBPipelinePlugin
// will write a sample description based on the assembly server directory and local directory
// paths for each assembly in the asm maf key file.
if(!name.startsWith("Rec") && !name.startsWith("Ref") && !name.endsWith("1")) {
assemblyKeyFileWriter.write("" + assemblyServerDir + "\t" + refGenomeDirDocker + "\t" + refGenomeFile + "\t" + assemblyDirDocker +
"\t" + name + fastaSuffix + "\t" + name + fastaSuffix + "\t" + name + "_Assembly\t" + "1\n");
assemblyMAFKeyFileWriter.write("" + assemblyServerDir + "\t" + name + fastaSuffix + "\t" + assemblyDirDocker +
"\t" + name + fastaSuffix + "\t" + name + "_Assembly\t" + "smallSeq description for assembly " + name + "\n");
}
}
catch(IOException e) {
throw new IllegalStateException("Unable to add "+name+" to keyfile",e);
}
try (BufferedWriter bw = Utils.getBufferedWriter(genotypingFastqDir + name + "_R1.fastq")) {
bw.write(createFastqString(seq, lengthOfReads, wgsDepth, Range.singleton(Integer.MIN_VALUE)));
genotypingKeyFileWriter.write(name+"_wgs\twgsFlowcell\t"+name + "_R1.fastq\twgs\n" );
//Copy the file to use when genotyping as well.
} catch (IOException e) {
e.printStackTrace();
}
try {
Files.copy(Paths.get(genotypingFastqDir + name + "_R1.fastq"), Paths.get(fastqDir + name + "_R1.fastq"), StandardCopyOption.REPLACE_EXISTING);
}catch (IOException e) {
throw new IllegalStateException("Unable to copy genotypingFastqDir" + name + "_R1.fastq",e);
}
try (BufferedWriter bw = Utils.getBufferedWriter(genotypingFastqDir + name + "_R1_gbs.fastq")) {
Range<Integer> positionsToAvoid = Range.open(geneRangeWithoutGBS.lowerEndpoint() * (lengthOfGenes + lengthOfInterGenes),
geneRangeWithoutGBS.upperEndpoint() * (lengthOfGenes + lengthOfInterGenes));
bw.write(createFastqString(seq, lengthOfReads, gbsDepth, positionsToAvoid));
genotypingKeyFileWriter.write(name+"_gbs\tgbsFlowcell\t"+name + "_R1_gbs.fastq\tgbs\n" );
} catch (IOException e) {
e.printStackTrace();
}
});
}
try (BufferedWriter bw = Utils.getBufferedWriter(anchorBedFile)) {
System.out.println("Creating anchor bed file :" + anchorBedFile);
bw.write(createAnchorBED(lengthOfGenes, lengthOfInterGenes, numberOfGenes, proportionOfInterGeneDeleted,'\t'));
}
Files.copy(Paths.get(anchorBedFile), Paths.get(anchorBedFileBase), StandardCopyOption.REPLACE_EXISTING);
try(BufferedWriter bw = Utils.getBufferedWriter(anchorGFFFile)) {
bw.write(createAnchorGFF(lengthOfGenes, lengthOfInterGenes, numberOfGenes, proportionOfInterGeneDeleted, '\t'));
}
Files.copy(Paths.get(anchorGFFFile), Paths.get(anchorGFFFileBase), StandardCopyOption.REPLACE_EXISTING);
//
Files.copy(Paths.get(answerDir + refGenomeName + fastaSuffix), Paths.get(refGenomePath));
// Create ref data file
createRefDBLoadFile(); // needed for loading reference data
// Create the Assembly genomeData files needed to load DB
for (String name : genomeNames.keySet()) {
createAssemblyLoadFile(name, answerDir);
}
}
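// Hypothetical invocation sketch - these values are illustrative assumptions, not the
// project's defaults: ten 3000 bp genes separated by 4000 bp intergenic regions (20% of
// each reference intergenic region filled by the repetitive element, 25% deleted in
// derived lines), 100 bp reads, one haplotype mutation per 1000 bp, one rare allele per
// 10000 bp, 0.5x WGS depth, 0.1x GBS depth, and no insertions.
public static void main(String[] args) throws IOException {
create(3000, 4000, 0.2, 0.25, 10, 100, 1000, 10000, 0.5, 0.1, 0.0);
}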
private static void createRefDBLoadFile() {
String loadDataHeader = "Genotype\tHapnumber\tDataline\tploidy\tgenesPhased\tchromsPhased\tconfidence\tMethod\tMethodDetails\tgvcfServerPath\n";
try (BufferedWriter bw = Utils.getBufferedWriter(refLoadDataFile)) {
StringBuilder sb = new StringBuilder();
sb.append(loadDataHeader);
sb.append(refGenomeName).append("_Assembly").append("\t"); // genotype name
sb.append(0).append("\t"); // hapnumber
sb.append("generated").append("\t"); // dataline
sb.append("1").append("\t"); // ploidy
sb.append("true").append("\t"); // genesPhased
sb.append("true").append("\t"); // chromsPhased
sb.append("1").append("\t"); // confidence
sb.append("B73Ref_method").append("\t"); // method
sb.append("Test version for junit").append("\t"); // method details
sb.append(gvcfServerPath); // value to be stored in genome_file_data db table
sb.append("\n");
bw.write(sb.toString());
} catch (IOException ioe) {
ioe.printStackTrace();
}
}
private static void createAssemblyLoadFile(String taxonName, String outputDir) {
String loadDataHeader = "Genotype\tHapnumber\tDataline\tploidy\treference\tgenesPhased\tchromsPhased\tconfidence\tMethod\tMethodDetails\tRefVersion\tgvcfServerPath\n";
String loadFile = outputDir + taxonName + "_Assembly_load_data.txt";
try (BufferedWriter bw = Utils.getBufferedWriter(loadFile)) {
StringBuilder sb = new StringBuilder();
sb.append(loadDataHeader);
sb.append(taxonName).append("_Assembly").append("\t"); //
sb.append(0).append("\t"); // hapnumber
sb.append("minimap2").append("\t"); // dataline
sb.append("1").append("\t"); // ploidy
sb.append("false").append("\t"); // is reference
sb.append("true").append("\t"); // genesPhased
sb.append("true").append("\t"); // chromsPhased
sb.append("1").append("\t"); // confidence
sb.append("Assembly_minimap2").append("\t"); // method
sb.append("Aligned via minimap2").append("\t"); // method details
sb.append("B73Ref_version").append("\t"); // ref version
sb.append(gvcfServerPath); // remote host;path where gvcf files are stored
sb.append("\n");
bw.write(sb.toString());
} catch (IOException ioe) {
ioe.printStackTrace();
}
}
private static String createAnchorBED(int lengthOfGenes, int lengthOfInterGenes, int numberOfGenes,
double proportionOfInterGeneDeleted,char delimiter) {
// These are bed files: 0-based, inclusive/exclusive
boolean interGeneInBED=false;
int interGeneLengthAfterDeletion=lengthOfInterGenes - (int)(lengthOfInterGenes*proportionOfInterGeneDeleted);
StringBuilder anchorBed = new StringBuilder();
int currPosition = 0;
for (int currGene = 0; currGene < numberOfGenes; currGene++) {
int endPosition = currPosition + lengthOfGenes;
anchorBed.append(1).append(delimiter); //chromosome
anchorBed.append(currPosition).append(delimiter);
anchorBed.append(endPosition).append(delimiter);
anchorBed.append("FocusRegion").append("\n");
if(interGeneInBED) {
anchorBed.append(1).append(delimiter); //chromosome
anchorBed.append(endPosition).append(delimiter);
anchorBed.append(endPosition+interGeneLengthAfterDeletion).append(delimiter);
anchorBed.append("FocusComplement").append("\n");
}
currPosition = endPosition + interGeneLengthAfterDeletion;
// if (proportionOfInterGeneDeleted > 0.0) {
// currPosition = endPosition + interGeneLengthAfterDeletion;
// } else {
// currPosition = endPosition + lengthOfInterGenes+1;
// }
}
return anchorBed.toString();
}
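// Hedged worked example of createAnchorBED: with 3000 bp genes and 4000 bp intergenic
// regions of which 25% is deleted (leaving 3000 bp between genes), successive genes start
// lengthOfGenes + interGeneLengthAfterDeletion = 6000 bp apart. A self-check sketch:
private static void checkAnchorBedExample() {
String bed = createAnchorBED(3000, 4000, 2, 0.25, '\t');
assert bed.equals("1\t0\t3000\tFocusRegion\n" + "1\t6000\t9000\tFocusRegion\n");
}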
private static String createAnchorGFF(int lengthOfGenes, int lengthOfInterGenes, int numberOfGenes,
double proportionOfInterGeneDeleted,char delimiter) {
// These are GFF files: 1-based, inclusive/inclusive
int interGeneLengthAfterDeletion=lengthOfInterGenes - (int)(lengthOfInterGenes*proportionOfInterGeneDeleted);
StringBuilder anchorBed = new StringBuilder();
int currPosition = 0;
for (int currGene = 0; currGene < numberOfGenes; currGene++) {
int endPosition = currPosition + lengthOfGenes;
// Create the Gene
//Chrom
anchorBed.append(1).append(delimiter);
//source
anchorBed.append(".").append(delimiter);
//feature
anchorBed.append("gene").append(delimiter);
//start
anchorBed.append(currPosition+1).append(delimiter);
//end
anchorBed.append(endPosition).append(delimiter);
//score
anchorBed.append(".").append(delimiter);
//strand
anchorBed.append("+").append(delimiter);
//Frame
anchorBed.append(".").append(delimiter);
//attribute
anchorBed.append("ID=gene:gene").append(currGene).append("\n");
// Create the mRNA - same coordinates as the gene
//Chrom
anchorBed.append(1).append(delimiter);
//source
anchorBed.append(".").append(delimiter);
//feature
anchorBed.append("mRNA").append(delimiter);
//start
anchorBed.append(currPosition+1).append(delimiter);
//end
anchorBed.append(endPosition).append(delimiter);
//score
anchorBed.append(".").append(delimiter);
//strand
anchorBed.append("+").append(delimiter);
//Frame
anchorBed.append(".").append(delimiter);
//attribute
anchorBed.append("ID=transcript:gene").append(currGene).append("_T001;")
.append("Parent=gene:gene").append(currGene).append("\n");
// Create the CDS - ends before the gene/mRNA end. This is arbitrary,
// but follows an example from a B73 gff file
//Chrom
anchorBed.append(1).append(delimiter);
//source
anchorBed.append(".").append(delimiter);
//feature
anchorBed.append("CDS").append(delimiter);
//start
anchorBed.append(currPosition+1).append(delimiter);
//end - slightly smaller than the gene and mRNA: the CDS stops a quarter of the gene length early
anchorBed.append(endPosition - (lengthOfGenes/4)).append(delimiter);
//score
anchorBed.append(".").append(delimiter);
//strand
anchorBed.append("+").append(delimiter);
//Frame
anchorBed.append(".").append(delimiter);
//attribute
anchorBed.append("ID=CDS:gene").append(currGene).append("_P001;")
.append("Parent=transcript:gene").append(currGene).append("_T001\n");
currPosition = endPosition + interGeneLengthAfterDeletion;
}
return anchorBed.toString();
}
private static String createFastqString(String sequence, int lengthOfReads, double depth, Range<Integer> regionToAvoid) {
PoissonDistribution poisson=new PoissonDistribution(lengthOfReads*sequencingErrorRate);
int readNumber = (int) ((depth * sequence.length()) / lengthOfReads);
String qualityScores = IntStream.range(0, lengthOfReads).mapToObj(i -> "H").collect(Collectors.joining(""));
StringBuilder fastq = new StringBuilder();
for (int i = 0; i < readNumber; i++) {
int startPos = random.nextInt(sequence.length() - lengthOfReads);
if(regionToAvoid.contains(startPos) || regionToAvoid.contains(startPos+lengthOfReads)) continue;
fastq.append("@M01032:387:000000000-ANP68:1:1102:20447:" + i + " 1:N:0:CCTAAGAC+GCGTAAGA").append("\n");
StringBuilder read = new StringBuilder(sequence.substring(startPos, startPos + lengthOfReads));
int errors=poisson.sample();
// System.out.println("errors = " + errors);
mutateSequence(read,errors);
fastq.append(read).append("\n");
fastq.append("+\n");
fastq.append(qualityScores).append("\n");
}
return fastq.toString();
}
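// Note on the records produced above: the target count is depth * sequence length / read
// length (e.g. 0.5x depth over a 70,000 bp sequence with 100 bp reads aims for 350 reads),
// minus any reads whose start or end falls inside regionToAvoid. Each record is four lines:
// an Illumina-style @header, the (possibly mutated) bases, a "+" separator, and a run of
// 'H' quality characters (Phred+33 score 39).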
private static GenotypeTable makeGenotypeTable(Map<String, String> nameSeqMap) {
String refSeq = nameSeqMap.get(refGenomeName);
PositionList positionList = IntStream.range(0, refSeq.length())
.mapToObj(site -> Position.builder("1", site + 1)
.allele(WHICH_ALLELE.Reference, getNucleotideAlleleByte(refSeq.charAt(site)))
.allele(WHICH_ALLELE.Alternate, getNucleotideAlleleByte("T"))
.build()
)
.collect(PositionList.collectValidateOrder());
GenotypeTableBuilder gtb = GenotypeTableBuilder.getTaxaIncremental(positionList);
nameSeqMap.forEach((name, seq) -> {
gtb.addTaxon(new Taxon(name), convertGenotypeStringToDiploidByteArray(seq.toUpperCase()));
});
return gtb.build();
}
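// Note: the alternate allele can be fixed to 'T' at every site because base sequences are
// built from A/C/G only and the genome-level mutateSequence introduces mutations
// exclusively as 'T', so 'T' is the only SNP allele expected in these simulated genomes.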
private static Map<String, String> createGenomes(int numberOfGenes, int lengthOfGenes, int lengthOfInterGenes,
double proportionOfRefInterGeneDuplicated, int haplotypeDivergence, int intraHaplotypeDivergence,
double proportionOfInterGeneDeleted, double insertProportion) {
//Build the reference genome
//create a repetitive element of a particular size
int bpOfRepetitive = (int) (lengthOfInterGenes * proportionOfRefInterGeneDuplicated);
System.out.println("CreateGenomes: bpOfRepetitive = " + bpOfRepetitive);
String repetitiveElement = createBaseSequence(bpOfRepetitive);
System.out.println("repetitiveElement = " + repetitiveElement);
String baseGenome = IntStream.range(0, numberOfGenes)
.mapToObj(i -> {
StringBuilder interSeq = new StringBuilder(createBaseSequence(lengthOfInterGenes));
//stick the repetitive element in a random location within the intergenic region
if (lengthOfInterGenes > 0) {
interSeq.insert(random.nextInt(lengthOfInterGenes - bpOfRepetitive), repetitiveElement).setLength(lengthOfInterGenes);
}
return createBaseSequence(lengthOfGenes) + interSeq;
})
.collect(Collectors.joining(""));
// System.out.println(createBaseSequence(lengthOfGenes)); // LCJ - uncomment
//System.out.println("baseGenome = " + baseGenome); // LCJ - uncomment
Map<String, String> genomeSequences = genomeNames.keySet().stream()
.filter(name -> !name.startsWith("Rec"))
.flatMap(name -> {
Map<String, String> taxaSequenceMap = new HashMap<>();
//first diverge the pair of lines for the haplotype
taxaSequenceMap.put(name, createMutatedSequence(baseGenome, haplotypeDivergence,numberOfGenes, lengthOfGenes,
lengthOfInterGenes, proportionOfInterGeneDeleted, 0));
//create rare alleles within the haplotype
genomeNames.get(name).forEach(derivedTaxonName -> {
System.out.println("Generating sequence for " + derivedTaxonName);
taxaSequenceMap.put(derivedTaxonName, createMutatedSequence(taxaSequenceMap.get(name), intraHaplotypeDivergence,
numberOfGenes, lengthOfGenes, lengthOfInterGenes, proportionOfInterGeneDeleted, insertProportion));
});
return taxaSequenceMap.entrySet().stream();
}
).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
List<String> potentialParents = Lists.newArrayList(genomeNames.keySet());
potentialParents.addAll(genomeNames.values());
for (int i = 1; i < numberOfGenes-1; i++) {
int breakpoint = i*(lengthOfGenes + lengthOfInterGenes); //recombine at the boundary after the i-th gene/intergene block
String parent1=potentialParents.get(random.nextInt(potentialParents.size()));
String parent2=potentialParents.get(random.nextInt(potentialParents.size()));
String recombinantSequence = genomeSequences.get(parent1).substring(0, breakpoint) + genomeSequences.get(parent2).substring(breakpoint);
genomeSequences.put("Rec"+parent1+parent2+"gco"+i, recombinantSequence);
}
// Uncomment if you really want to see the genomes, but it prints
// a lot to the console, and that makes comparing the sqlite vs postgres
// results more difficult, as these genomes are created once for each run.
// System.out.println(genomeSequences.toString());
return genomeSequences;
}
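// Hedged worked example of the recombinant naming above: a crossover between hypothetical
// parents "LineA" and "LineB1" at i = 3 yields the taxon "RecLineALineB1gco3", whose
// sequence is LineA's first 3 * (lengthOfGenes + lengthOfInterGenes) bp followed by the
// remainder of LineB1's sequence.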
private static String createMutatedSequence(String baseGenome, int mutationFrequency, int numberOfGenes, int lengthOfGenes,
int lengthOfInterGenes, double proportionOfInterGeneDeleted, double insertionProportion) {
StringBuilder sequence = new StringBuilder(baseGenome);
int currIndex = 0;
int maxAnchorIdx = lengthOfInterGenes > 0 ? numberOfGenes * 2 : numberOfGenes;
for (int anchoridx = 0; anchoridx < maxAnchorIdx; anchoridx++) {
//even anchoridx values are genes, odd are inter-anchors. If there are no inter-anchors, all are genes
if (anchoridx % 2 == 0 || lengthOfInterGenes == 0) { // process anchors
mutateSequence(sequence, currIndex, lengthOfGenes, mutationFrequency);
if (insertionProportion > 0) { // only doing insertions for anchors, not inter-anchors.
insertSequence(sequence, currIndex, lengthOfGenes, insertionProportion); // LCJ - arbitrary insert marker sized by insertionProportion. Only anchors of derived taxa have insertions
}
currIndex += lengthOfGenes;
} else { // process inter-anchor
mutateSequence(sequence, currIndex, lengthOfInterGenes, mutationFrequency);
deleteSequence(sequence, currIndex, lengthOfInterGenes, proportionOfInterGeneDeleted); // only inter-anchors have deletion
currIndex += lengthOfInterGenes;
}
}
return sequence.toString();
}
private static void deleteSequence(StringBuilder sequence, int currIndex, int lengthOfInterGenes, double proportionOfInterGeneDeleted) {
int sizeOfDeletion = (int) (lengthOfInterGenes * proportionOfInterGeneDeleted);
int deletionStart = currIndex + random.nextInt(lengthOfInterGenes - sizeOfDeletion);
for (int i = deletionStart; i < deletionStart + sizeOfDeletion; i++) {
sequence.setCharAt(i, '-');
}
}
// For testing, the '+' will later be translated to a run of 10 C's (see create())
private static void insertSequence(StringBuilder sequence, int currIndex, int lengthOfGenes, double proportionOfGeneInsert) {
int sizeOfInsertion = (int) (lengthOfGenes * proportionOfGeneInsert);
if (sizeOfInsertion > 0) {
int insertionStart = currIndex + random.nextInt(lengthOfGenes);
sequence.setCharAt(insertionStart, '+');
}
}
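// Hedged worked example: insertSequence overwrites a single gene base with '+', so the
// fragment "ACGTACGT" might become "ACG+ACGT". The fasta writer in create() later expands
// each '+' to "CCCCCCCCCC" (a net 9 bp insertion), while the '-' runs written by
// deleteSequence are stripped from the fastas but retained for the VCF export.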
/*
Base sequences contain only A, C, and G
*/
private static String createBaseSequence(int length) {
return random.ints(length, 0, 3) // bound 3 is exclusive; values 0,1,2 map to A,C,G
.mapToObj(i -> getHaplotypeNucleotide((byte) i))
.collect(Collectors.joining(""));
}
/*
Mutate length/frequency positions within [offset, offset+length) to 'T'.
Because base sequences contain only A, C, and G, every 'T' marks a mutation.
*/
private static void mutateSequence(StringBuilder sequence, int offset, int length, int frequency) {
for (int i = 0; i < length / frequency; i++) {
sequence.setCharAt(offset + random.nextInt(length), 'T');
}
}
/*
Mutate a sequence at random positions using the full range of nucleotides (A, C, G, T)
*/
private static void mutateSequence(StringBuilder sequence, int numberOfMutations) {
for (int i = 0; i < numberOfMutations; i++) {
sequence.setCharAt(random.nextInt(sequence.length()), getNucleotideIUPACChar((byte)random.nextInt(4)));
}
}
}