All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.smallseq.CreateTestGenomes Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.smallseq;

import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Utils;
import org.apache.commons.math3.distribution.PoissonDistribution;

import java.io.BufferedWriter;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.*;
import static net.maizegenetics.pangenome.smallseq.SmallSeqPaths.*;

public class CreateTestGenomes {

    private static Range geneRangeWithoutGBS=Range.open(10,20);
    private static Random random = new Random(0);
    private static double sequencingErrorRate=0.01;

    public static String getHostIpAddr() {
        // From the accepted answer here:
        //   https://stackoverflow.com/questions/9481865/getting-the-ip-address-of-the-current-machine-using-java
        try {
            Socket socket2 = new Socket();
            socket2.connect(new InetSocketAddress("google.com", 80));
            String ip = socket2.getLocalAddress().toString();
            // the IP comes back with a beginning "/" as it substrings "google.com/"
            // with the code taking everything past the host name, which for our test is "google.com"
            String ipfinal = ip.substring(1);
            return ipfinal;
        } catch (Exception exc) {
            throw new IllegalStateException("getHostIpAddr: ERROR getting host ip address: " + exc.getMessage());
        }

    }

    public static void create(int lengthOfGenes, int lengthOfInterGenes, double proportionOfRefInterGeneDuplicated,
            double proportionOfInterGeneDeleted, int numberOfGenes, int lengthOfReads,
            int haplotypeDivergence, int intraHaplotypeDivergence, double wgsDepth, double gbsDepth, double insertProportion) throws IOException {
        System.out.println("Clearing the directory " + answerDir);

        Files.createDirectories(Paths.get(answerDir));
        Files.createDirectories(Paths.get(dataDir));
        Files.createDirectories(Paths.get(refGenomeDir));
        Files.createDirectories(Paths.get(postgresDockerDir));
        Files.createDirectories(Paths.get(genotypingDebugDir));

        // Create the genomes
        Map tnameSequenceMap = createGenomes(numberOfGenes,  lengthOfGenes,  lengthOfInterGenes,
                proportionOfRefInterGeneDuplicated,  haplotypeDivergence,  intraHaplotypeDivergence, proportionOfInterGeneDeleted, insertProportion);

        // create postgres Dockerfile
        // We are using a Dockerfile to enable smallseq to know/change the image name
        try (BufferedWriter bw = Utils.getBufferedWriter(postgresDockerfile)) {
            bw.write("FROM postgres:12.9-alpine\n");
            bw.write("# No entrypoint is specified. Default to the entry point from the postgres docker.\n");
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }

        // Create db config file
        try (BufferedWriter bw = Utils.getBufferedWriter(dbConfigFile)) {
            StringBuilder sb = new StringBuilder();
            sb.append("host=localHost\n");
            sb.append("user=sqlite\n");
            sb.append("password=sqlite\n");
            sb.append("DB=").append(phgDBName).append("\n");
            sb.append("DBtype=sqlite\n");
            sb.append("minTaxa=1\n");
            sb.append("minSites=5\n");

            bw.write(sb.toString());
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }

        // Create db config file for docker.  DB lives in a different place
        String[] splitNameString = phgDBName.split("/");
        String phgNameOnly = splitNameString[splitNameString.length - 1];

        try (BufferedWriter bw = Utils.getBufferedWriter(dbDockerConfigFile)) {
            StringBuilder sb = new StringBuilder();
            sb.append("# db connection parameters\n");
            sb.append("host=localHost\n");
            sb.append("user=sqlite\n");
            sb.append("password=sqlite\n");
            sb.append("DB=/phg/").append(phgNameOnly).append("\n");
            sb.append("DBtype=sqlite\n");
            sb.append("#Liquibase output directory\n");
            sb.append("liquibaseOutdir=/phg/outputDir\n");

            // Populate the files needed for Assembly by AnchorWave
            sb.append("AssemblyMAFFromAnchorWavePlugin.outputDir=").append(alignDirDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.keyFile=").append(asmMAFKeyFileDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.gffFile=").append(anchorGFFFileBaseDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.refFasta=").append(refGenomePathDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.threadsPerRun=4").append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.numRuns=1").append("\n");

            // Add bedFile needed for LoadHaplotypesFromGVCFPLugin.  Not previously needed as WGS
            // did this from groovy scripts, which creates a bedfile from the db and passes that
            // in to the plugin
            sb.append("bedFile="+anchorBedFileBaseDocker+"\n");


            sb.append("# Haplotype creation parameters\n");
            sb.append("referenceFasta="+refGenomeDirDocker+"Ref.fa\n");
            sb.append("refServerPath=irods:/path/to/reference\n");

            // "anchors" is needed for LoadAllIntervalsToPHGDBPlugin
            sb.append("anchors="+anchorBedFileBaseDocker+"\n");
            sb.append("genomeData=/"+refLoadDataFileDocker+"\n");

            sb.append("consensusMethodName=CONSENSUS\n");
            sb.append("asmMethodName=mummer4\n");
            sb.append("asmKeyFile="+ asmKeyFileDocker + "\n");
            sb.append("AssemblyHaplotypesMultiThreadPlugin.isTestMethod=false\n"); // make true if want test method type
            sb.append("outputDir="+alignDirDocker+"\n");
            sb.append("gvcfOutputDir="+alignGVCFDirDocker+"\n");
            sb.append("wgsMethodName=GATK_PIPELINE\n");
            sb.append("haplotypeMethodName=GATK_PIPELINE\n");
            sb.append("LoadHaplotypesFromGVCFPlugin.isTestMethod=false\n");// make true if want test method type
            sb.append("wgsKeyFile="+keyFileDocker+"\n");
            sb.append("inputConsensusMethods=GATK_PIPELINE\n");
            sb.append("gvcfDir="+gvcfDirDocker+"\n");
            sb.append("PopulatePHGDBPipelinePlugin.gvcfServerPath=localHost;"+baseGVCFDirDocker+"\n"); // upload gvcfs:  LoadHaplotypesFromGVCFPlugin needs this in its keyfile
            sb.append("localGVCFFolder="+localGvcfDirDocker+"\n"); // download gvcfs: for HaplotypeGraphBuilderPlugin and HaplotypeGraphStreamBuilderPlugin - they download gvcfs

            sb.append("extendedWindowSize=0\n");
            sb.append("GQ_min=50\n");
 //           sb.append("QUAL_min=200\n");
            sb.append("DP_poisson_min=.01\n");
            sb.append("DP_poisson_max=.99\n");
            sb.append("filterHets=true\n");
            sb.append("numThreads=5\n");
            sb.append("# HapCountBestPathToTextPlugin parameters\n");
            sb.append("maxNodesPerRange=30\n");
            sb.append("minTaxaPerRange=1\n");
            sb.append("minReads=0\n");
            sb.append("maxGBSReads=1000\n");
            sb.append("minTransitionProb=0.001\n");
            sb.append("probReadMappedCorrectly=0.99\n");
            sb.append("emissionMethod=allCounts\n");
            sb.append("splitTaxa=true\n");
            sb.append("# RunHapCollapsePipelinePlugin parameters\n");
            sb.append("minTaxa=2\n");
            sb.append("minSites=20\n");
            //sb.append("method=coverage\n");
            //sb.append("method=upgma\n");
            sb.append("clusteringMode=upgma_assembly\n");// lcj - upgma will not work with gvcf assemblies
            sb.append("rankingFile="+rankingFileDocker + "\n");
            sb.append("includeVariants=true\n"); // lcj added for phg205 List version of variants
            sb.append("mxDiv=0.005\n");
	        sb.append("maxError=0.2\n");
            sb.append("useDepth=false\n");
            sb.append("replaceNsWithMajor=false\n");
            sb.append("exportMergedVCF=/tempFileDir/data/outputs/mergedVCFs/\n");
            //sb.append("assemblyMethod=lynnMummer4"); // lcj - here for testing , change if you want a different method for assemblies than mummer4

//            sb.append("exclusionString= FORMAT/GQ<50 || QUAL<200 || (FORMAT/AD[0]>0 && FORMAT/AD[1]>0) || (FORMAT/AD[0]>0 && FORMAT/AD[2]>0) || (FORMAT/AD[1]>0 && FORMAT/AD[2]>0)").append("\n");
            bw.write(sb.toString());
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }

        // Get  the system IP address for postgres use.  For postgres, we access the db through a
        // postgres docker, and it will not accept 0.0.0.0 or localhost - it needs the
        // actual ip address

        String ipfinal = getHostIpAddr();

        // Create postgres docker config file
        try (BufferedWriter bw = Utils.getBufferedWriter(dbDockerPostgresConfigFile)) {
            StringBuilder sb = new StringBuilder();
            sb.append("# db connection parameters\n");
            sb.append("host=").append(ipfinal).append(":").append(postgresPort).append("\n"); // postgres docker container has port 5433 mapped to 5432
            sb.append("user=").append(postgresUser).append("\n");
            sb.append("password=").append(postgresUserPwd).append("\n");
            sb.append("DB=").append(phgDBNamePostgres).append("\n");
            sb.append("DBtype=postgres\n");
            sb.append("#Liquibase output directory\n");
            sb.append("liquibaseOutdir=/phg/outputDir\n");

            // Populate the files needed for Assembly by AnchorWave
            sb.append("AssemblyMAFFromAnchorWavePlugin.outputDir=").append(alignDirDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.keyFile=").append(asmMAFKeyFileDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.gffFile=").append(anchorGFFFileBaseDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.refFasta=").append(refGenomePathDocker).append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.threadsPerRun=4").append("\n");
            sb.append("AssemblyMAFFromAnchorWavePlugin.numRuns=1").append("\n");

            // Needed for loading ASMHaployptes - WGS are loaded from Groovy, which makes
            // its own bedFile and passes it on the command line
            sb.append("bedFile="+anchorBedFileBaseDocker+"\n");


            sb.append("# Haplotype creation parameters\n");
            sb.append("referenceFasta="+refGenomeDirDocker+"Ref.fa\n");
            sb.append("refServerPath=irods:/path/to/reference\n");

            // "anchors" is needed for LoadAllIntervalsToPHGDBPlugin
            sb.append("anchors="+anchorBedFileBaseDocker+"\n");
            sb.append("genomeData=/"+refLoadDataFileDocker+"\n");

            sb.append("consensusMethodName=CONSENSUS\n");
            sb.append("asmMethodName=mummer4\n");
            sb.append("asmKeyFile="+ asmKeyFileDocker + "\n");
            sb.append("AssemblyHaplotypesMultiThreadPlugin.isTestMethod=false\n"); // make true if want test method type
            sb.append("outputDir="+alignDirDocker+"\n");
            sb.append("gvcfOutputDir="+alignGVCFDirDocker+"\n");
            sb.append("wgsMethodName=GATK_PIPELINE\n");
            sb.append("haplotypeMethodName=GATK_PIPELINE\n");
            sb.append("LoadHaplotypesFromGVCFPlugin.isTestMethod=false\n");// make true if want test method type
            sb.append("wgsKeyFile="+keyFileDocker+"\n");
            sb.append("inputConsensusMethods=GATK_PIPELINE\n");
            sb.append("gvcfDir="+gvcfDirDocker+"\n");
            sb.append("PopulatePHGDBPipelinePlugin.gvcfServerPath=localHost;"+baseGVCFDirDocker+"\n"); // upload gvcfs:  LoadHaplotypesFromGVCFPlugin needs this in its keyfile
            sb.append("localGVCFFolder="+localGvcfDirDocker+"\n"); // download gvcfs: for HaplotypeGraphBuilderPlugin and HaplotypeGraphStreamBuilderPlugin - they download gvcfs


            sb.append("extendedWindowSize=0\n");
            sb.append("GQ_min=50\n");
            //           sb.append("QUAL_min=200\n");
            sb.append("DP_poisson_min=.01\n");
            sb.append("DP_poisson_max=.99\n");
            sb.append("filterHets=true\n");
            sb.append("numThreads=5\n");
            sb.append("# HapCountBestPathToTextPlugin parameters\n");
            sb.append("maxNodesPerRange=30\n");
            sb.append("minTaxaPerRange=1\n");
            sb.append("minReads=0\n");
            sb.append("maxGBSReads=1000\n");
            sb.append("minTransitionProb=0.001\n");
            sb.append("probReadMappedCorrectly=0.99\n");
            sb.append("emissionMethod=allCounts\n");
            sb.append("splitTaxa=true\n");
            sb.append("# RunHapCollapsePipelinePlugin parameters\n");
            sb.append("minTaxa=2\n");
            sb.append("minSites=20\n");
            //sb.append("method=coverage\n");
            //sb.append("method=upgma\n");
            sb.append("clusteringMode=upgma_assembly\n");// lcj - upgma will not work with gvcf assemblies
            sb.append("rankingFile="+rankingFileDocker + "\n");
            sb.append("includeVariants=true\n"); // lcj added for phg205 List version of variants
            sb.append("mxDiv=0.005\n");
            sb.append("maxError=0.2\n");
            sb.append("useDepth=false\n");
            sb.append("replaceNsWithMajor=false\n");
            sb.append("exportMergedVCF=/tempFileDir/data/outputs/mergedVCFs/\n");
            //sb.append("assemblyMethod=lynnMummer4"); // lcj - here for testing , change if you want a different method for assemblies than mummer4

//            sb.append("exclusionString= FORMAT/GQ<50 || QUAL<200 || (FORMAT/AD[0]>0 && FORMAT/AD[1]>0) || (FORMAT/AD[0]>0 && FORMAT/AD[2]>0) || (FORMAT/AD[1]>0 && FORMAT/AD[2]>0)").append("\n");
            bw.write(sb.toString());
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
        System.out.println("Created sequences and VCF :" + vcfFile);
        //Deletions are retained for the alignment process
        ExportUtils.writeToVCF(makeGenotypeTable(tnameSequenceMap), vcfFile, false);

        try(BufferedWriter rankingFileWriter = Utils.getBufferedWriter(rankingFile)) {
            // these are arbitrary rankings - the last taxa will be ranked with highest priority
            // This allows for additional taxa to be added/deleted without changes to this code.
            int count=0;
            for (Taxon taxon : taxaList) {
                //Write to the ranking file
                String name = taxon.getName();
                count++;
                StringBuilder sb = new StringBuilder();
                sb.append(name);
                sb.append("\t");
                sb.append(count);
                sb.append("\n");

                rankingFileWriter.write(sb.toString());
            }
        }

        try(BufferedWriter keyFileWriter = Utils.getBufferedWriter(keyFile)) {
            keyFileWriter.write("sample_name\tsample_description\tfiles\ttype\tchrPhased\tgenePhased\tphasingConf\tlibraryID\tgvcfServerPath\n");
            for (Taxon taxon : taxaList) {
                //Write to the keyFile
                String name = taxon.getName();
                StringBuilder sb = new StringBuilder();
                sb.append(name);
                sb.append("\t");
                sb.append(name);
                sb.append(" line aligned\t");
                sb.append(name);
                sb.append("_R1.fastq\t");
                sb.append("FASTQ\t");
                sb.append("true\ttrue\t.99\tdummyLib1\t");
                sb.append(gvcfServerPath);
                sb.append("\n");

                keyFileWriter.write(sb.toString());
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        try(BufferedWriter genotypingKeyFileWriter = Utils.getBufferedWriter(genotypingKeyFile);
            BufferedWriter assemblyKeyFileWriter = Utils.getBufferedWriter(asmKeyFile);
            BufferedWriter assemblyMAFKeyFileWriter = Utils.getBufferedWriter(asmMAFKeyFile)) {
            genotypingKeyFileWriter.write("cultivar\tflowcell_lane\tfilename\tPlateID\n");
            assemblyKeyFileWriter.write("AssemblyServerDir\tRefDir\tRefFasta\tAssemblyDir\tAssemblyGenomeFasta\tAssemblyFasta\tAssemblyDBName\tChromosome\n");
            assemblyMAFKeyFileWriter.write("AssemblyServerDir\tAssemblyGenomeFasta\tAssemblyDir\tAssemblyFasta\tAssemblyDBName\tDescription\n");

            tnameSequenceMap.forEach((name, seqWithGaps) -> {
                //Remove the deletions from the sequence
                String gapSeq = seqWithGaps.replace("-", "");

                // Add insertions: replace "+" with string of 10 C's
                String seq = gapSeq.replace("+", "CCCCCCCCCC");

                try (BufferedWriter bw = Utils.getBufferedWriter(answerDir + name + fastaSuffix)) {
                    bw.write(">1\n"); // creates fastas for all taxa, including ref.  Only chrom 1
                    bw.write(seq);
                    bw.write("\n");
                } catch (IOException e) {
                    e.printStackTrace();
                }

                //Copy the assembly fasta to the correct mounted directory
                try {
                    Files.copy(Paths.get(answerDir + name + fastaSuffix), Paths.get(assemblyDir + name + fastaSuffix), StandardCopyOption.REPLACE_EXISTING);
                }catch (IOException e) {
                    throw new IllegalStateException("Unable to copy assembly" + name + fastaSuffix,e);
                }
                //Write the entry to the keyfile
                try {
                    // NOTE: name _ fastaSuffix appears twice below.  The first time it is for the
                    // AssemblyGenomeFasta, the second time it is for the AssemblyFasta (per-chrom fasta).
                    // In a real setting, these 2 files will be different.  For SmallSeq tests, they are
                    // the same as we only have 1 chromosome, so it doubles as the full fasta.
                    //
                    // The "Description" field of the new asm key file is added for anchorwave MAFs and is optional.
                    // If present, this field will be used in the PopulatePHGDBPipelinePlugin as the data for
                    // column "sample data" of the generated gvcf key file.  If it is not present, PopulatePHGDBPipelinePlugin
                    // will write a sample description based on the assembly server directory and local directory
                    // paths for each assembly in the asm maf key file.
                    if(!name.startsWith("Rec") && !name.startsWith("Ref") && !name.endsWith("1")) {
                        assemblyKeyFileWriter.write("" + assemblyServerDir + "\t" + refGenomeDirDocker + "\t" + refGenomeFile + "\t"  + assemblyDirDocker +
                                "\t" + name + fastaSuffix + "\t" + name + fastaSuffix + "\t" + name + "_Assembly\t" + "1\n");
                        assemblyMAFKeyFileWriter.write("" + assemblyServerDir + "\t"  + name + fastaSuffix + "\t" + assemblyDirDocker +
                                "\t" + name + fastaSuffix + "\t" + name + "_Assembly\t" + "smallSeq description for assembly " + name + "\n");
                    }
                }
                catch(IOException e) {
                    throw new IllegalStateException("Unable to add "+name+" to keyfile",e);
                }
                try (BufferedWriter bw = Utils.getBufferedWriter(genotypingFastqDir + name + "_R1.fastq")) {
                    bw.write(createFastqString(seq, lengthOfReads, wgsDepth, Range.singleton(Integer.MIN_VALUE)));
                    genotypingKeyFileWriter.write(name+"_wgs\twgsFlowcell\t"+name + "_R1.fastq\twgs\n" );
                    //Copy the file to use when genotyping as well.
                    } catch (IOException e) {
                    e.printStackTrace();
                }
                try {
                    Files.copy(Paths.get(genotypingFastqDir + name + "_R1.fastq"), Paths.get(fastqDir + name + "_R1.fastq"), StandardCopyOption.REPLACE_EXISTING);
                }catch (IOException e) {
                    throw new IllegalStateException("Unable to copy genotypingFastqDir" + name + "_R1.fastq",e);
                }

                try (BufferedWriter bw = Utils.getBufferedWriter(genotypingFastqDir + name + "_R1_gbs.fastq")) {
                    Range positionsToAvoid = Range.open(geneRangeWithoutGBS.lowerEndpoint() * (lengthOfGenes + lengthOfInterGenes),
                            geneRangeWithoutGBS.upperEndpoint() * (lengthOfGenes + lengthOfInterGenes));
                    bw.write(createFastqString(seq, lengthOfReads, gbsDepth, positionsToAvoid));
                    genotypingKeyFileWriter.write(name+"_gbs\tgbsFlowcell\t"+name + "_R1_gbs.fastq\tgbs\n" );
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
        }

        try (BufferedWriter bw = Utils.getBufferedWriter(anchorBedFile)) {
            System.out.println("Creating anchor bed file :" + anchorBedFile);           
            bw.write(createAnchorBED(lengthOfGenes, lengthOfInterGenes, numberOfGenes, proportionOfInterGeneDeleted,'\t'));
        }
        Files.copy(Paths.get(anchorBedFile), Paths.get(anchorBedFileBase), StandardCopyOption.REPLACE_EXISTING);

        try(BufferedWriter bw = Utils.getBufferedWriter(anchorGFFFile)) {
            bw.write(createAnchorGFF(lengthOfGenes, lengthOfInterGenes, numberOfGenes, proportionOfInterGeneDeleted, '\t'));
        }

        Files.copy(Paths.get(anchorGFFFile), Paths.get(anchorGFFFileBase), StandardCopyOption.REPLACE_EXISTING);

        //
        Files.copy(Paths.get(answerDir + refGenomeName + fastaSuffix), Paths.get(refGenomePath));

        // Create ref data file
        createRefDBLoadFile(); // needed for loading reference data

        // Create the Assembly genomeData files needed to load DB
        for (String name : genomeNames.keySet()) {
            createAssemblyLoadFile(name, answerDir);
        }

    }

    private static void createRefDBLoadFile() {

        String loadDataHeader = "Genotype\tHapnumber\tDataline\tploidy\tgenesPhased\tchromsPhased\tconfidence\tMethod\tMethodDetails\tgvcfServerPath\n";
        try (BufferedWriter bw = Utils.getBufferedWriter(refLoadDataFile)) {
            StringBuilder sb = new StringBuilder();
            sb.append(loadDataHeader);
            sb.append(refGenomeName).append("_Assembly").append("\t"); // genotype name
            sb.append(0).append("\t"); // hapnumber
            sb.append("generated").append("\t"); // dataline
            sb.append("1").append("\t"); // ploidy
            sb.append("true").append("\t"); // genesPhased
            sb.append("true").append("\t"); // chromsPhased
            sb.append("1").append("\t"); // confidence
            sb.append("B73Ref_method").append("\t"); // method
            sb.append("Test version for junit").append("\t"); // method details
            sb.append(gvcfServerPath); // value to be stored in genome_file_data db table
            sb.append("\n");

            bw.write(sb.toString());

        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }


    private static void createAssemblyLoadFile(String taxonName, String outputDir) {

        String loadDataHeader = "Genotype\tHapnumber\tDataline\tploidy\treference\tgenesPhased\tchromsPhased\tconfidence\tMethod\tMethodDetails\tRefVersion\tgvcfServerPath\n";
        String loadFile = outputDir + taxonName + "_Assembly_load_data.txt";
        try (BufferedWriter bw = Utils.getBufferedWriter(loadFile)) {
            StringBuilder sb = new StringBuilder();
            sb.append(loadDataHeader);
            sb.append(taxonName).append("_Assembly").append("\t"); // 
            sb.append(0).append("\t"); // hapnumber
            sb.append("minimap2").append("\t"); // dataline
            sb.append("1").append("\t"); // ploidy
            sb.append("false").append("\t"); // is reference
            sb.append("true").append("\t"); // genesPhased
            sb.append("true").append("\t"); // chromsPhased
            sb.append("1").append("\t"); // confidence
            sb.append("Assembly_minimap2").append("\t"); // method
            sb.append("Aligned via minimap2").append("\t"); // method details
            sb.append("B73Ref_version").append("\t"); // ref version
            sb.append(gvcfServerPath); // remote host;path where gvcf files are stored
            sb.append("\n");

            bw.write(sb.toString());

        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    private static String createAnchorBED(int lengthOfGenes, int lengthOfInterGenes, int numberOfGenes, 
            double proportionOfInterGeneDeleted,char delimiter) {
        // These are bed files:  0-based, inclusive/exclusive
        boolean interGeneInBED=false;
        int interGeneLengthAfterDeletion=lengthOfInterGenes - (int)(lengthOfInterGenes*proportionOfInterGeneDeleted);
        StringBuilder anchorBed = new StringBuilder();
        int currPosition = 0;
        for (int currGene = 0; currGene < numberOfGenes; currGene++) {
            int endPosition = currPosition + lengthOfGenes;
            anchorBed.append(1).append(delimiter);  //chromosome
            anchorBed.append(currPosition).append(delimiter);
            anchorBed.append(endPosition).append(delimiter);
            anchorBed.append("FocusRegion").append("\n");

            if(interGeneInBED) {
                anchorBed.append(1).append(delimiter);  //chromosome
                anchorBed.append(endPosition).append(delimiter);
                anchorBed.append(endPosition+interGeneLengthAfterDeletion).append(delimiter);
                anchorBed.append("FocusComplement").append("\n");
            }
            currPosition = endPosition + interGeneLengthAfterDeletion;
//            if (proportionOfInterGeneDeleted > 0.0) {
//                currPosition = endPosition + interGeneLengthAfterDeletion;
//            } else {
//                currPosition = endPosition + lengthOfInterGenes+1;
//            }
            
        }
        return anchorBed.toString();
    }

    private static String createAnchorGFF(int lengthOfGenes, int lengthOfInterGenes, int numberOfGenes,
                                          double proportionOfInterGeneDeleted,char delimiter) {
        // These are GFF files:  1-based, inclusive/inclusive
        boolean interGeneInBED=false;
        int interGeneLengthAfterDeletion=lengthOfInterGenes - (int)(lengthOfInterGenes*proportionOfInterGeneDeleted);
        StringBuilder anchorBed = new StringBuilder();
        int currPosition = 0;
        for (int currGene = 0; currGene < numberOfGenes; currGene++) {
            int endPosition = currPosition + lengthOfGenes;

            // Create the Gene
            //Chrom
            anchorBed.append(1).append(delimiter);
            //source
            anchorBed.append(".").append(delimiter);
            //feature
            anchorBed.append("gene").append(delimiter);
            //start
            anchorBed.append(currPosition+1).append(delimiter);
            //end
            anchorBed.append(endPosition).append(delimiter);
            //score
            anchorBed.append(".").append(delimiter);
            //strand
            anchorBed.append("+").append(delimiter);
            //Frame
            anchorBed.append(".").append(delimiter);
            //attribute
            anchorBed.append("ID=gene:gene").append(currGene).append("\n");

            // Create the mRNA - same coordinates as the gene
            //Chrom
            anchorBed.append(1).append(delimiter);
            //source
            anchorBed.append(".").append(delimiter);
            //feature
            anchorBed.append("mRNA").append(delimiter);
            //start
            anchorBed.append(currPosition+1).append(delimiter);
            //end
            anchorBed.append(endPosition).append(delimiter);
            //score
            anchorBed.append(".").append(delimiter);
            //strand
            anchorBed.append("+").append(delimiter);
            //Frame
            anchorBed.append(".").append(delimiter);
            //attribute
            anchorBed.append("ID=transcript:gene").append(currGene).append("_T001;")
                    .append("Parent=gene:gene").append(currGene).append("\n");

            // Create the CDS - ends before the gene/mRNA end.  This is arbitary,
            // but follows an example from a B73 gff file
            //Chrom
            anchorBed.append(1).append(delimiter);
            //source
            anchorBed.append(".").append(delimiter);
            //feature
            anchorBed.append("CDS").append(delimiter);
            //start
            anchorBed.append(currPosition+1).append(delimiter);
            //end - slightly smaller than gene and mRNA
            anchorBed.append(endPosition - (endPosition/4)).append(delimiter);
            //score
            anchorBed.append(".").append(delimiter);
            //strand
            anchorBed.append("+").append(delimiter);
            //Frame
            anchorBed.append(".").append(delimiter);
            //attribute
            anchorBed.append("ID=CDS:gene").append(currGene).append("_P001;")
                    .append("Parent=transcript:gene").append(currGene).append("_T001\n");

            currPosition = endPosition + interGeneLengthAfterDeletion;

        }
        return anchorBed.toString();
    }

    private static String createFastqString(String sequence, int lengthOfReads, double depth, Range regionToAvoid) {
        PoissonDistribution poisson=new PoissonDistribution(lengthOfReads*sequencingErrorRate);
        int readNumber = (int) ((depth * sequence.length()) / lengthOfReads);
        String qualityScores = IntStream.range(0, lengthOfReads).mapToObj(i -> "H").collect(Collectors.joining(""));
        StringBuilder fastq = new StringBuilder();
        for (int i = 0; i < readNumber; i++) {
            int startPos = random.nextInt(sequence.length() - lengthOfReads);
            if(regionToAvoid.contains(startPos) || regionToAvoid.contains(startPos+lengthOfReads)) continue;
            fastq.append("@M01032:387:000000000-ANP68:1:1102:20447:" + i + " 1:N:0:CCTAAGAC+GCGTAAGA").append("\n");
            StringBuilder read = new StringBuilder(sequence.substring(startPos, startPos + lengthOfReads));
            int errors=poisson.sample();
 //           System.out.println("errors = " + errors);
            mutateSequence(read,errors);
            fastq.append(read).append("\n");
            fastq.append("+\n");
            fastq.append(qualityScores).append("\n");
        }
        return fastq.toString();
    }

    private static GenotypeTable makeGenotypeTable(Map nameSeqMap) {
        String refSeq = nameSeqMap.get(refGenomeName);
        PositionList positionList = IntStream.range(0, refSeq.length())
                .mapToObj(site -> Position.builder("1", site + 1)
                        .allele(WHICH_ALLELE.Reference, getNucleotideAlleleByte(refSeq.charAt(site)))
                        .allele(WHICH_ALLELE.Alternate, getNucleotideAlleleByte("T"))
                        .build()
                )
                .collect(PositionList.collectValidateOrder());
        GenotypeTableBuilder gtb = GenotypeTableBuilder.getTaxaIncremental(positionList);
        nameSeqMap.forEach((name, seq) -> {
            gtb.addTaxon(new Taxon(name), convertGenotypeStringToDiploidByteArray(seq.toUpperCase()));
        });
        return gtb.build();
    }

    private static Map createGenomes(int numberOfGenes, int lengthOfGenes, int lengthOfInterGenes,
            double proportionOfRefInterGeneDuplicated, int haplotypeDivergence, int intraHaplotypeDivergence,
            double proportionOfInterGeneDeleted, double insertProportion) {
        //Build the reference genome
        //create a repetitive element of a particular size
        int bpOfRepetitive = (int) (lengthOfInterGenes * proportionOfRefInterGeneDuplicated);
        System.out.println("CreateGenomes: bpOfRepetitive = " + bpOfRepetitive);
        String repetitiveElement = createBaseSequence(bpOfRepetitive);
        System.out.println("repetitiveElement = " + repetitiveElement);
        String baseGenome = IntStream.range(0, numberOfGenes)
                .mapToObj(i -> {
                    StringBuilder interSeq = new StringBuilder(createBaseSequence(lengthOfInterGenes));
                    //stick the repetitive element in a random location within the intergenic region
                    if (lengthOfInterGenes > 0) {
                        interSeq.insert(random.nextInt(lengthOfInterGenes - bpOfRepetitive), repetitiveElement).setLength(lengthOfInterGenes);
                    }
                    return createBaseSequence(lengthOfGenes) + interSeq;
                })
                .collect(Collectors.joining(""));
       // System.out.println(createBaseSequence(lengthOfGenes)); // LCJ - uncomment
        //System.out.println("baseGenome = " + baseGenome); // LCJ - uncomment
        Map genomeSequences = genomeNames.keySet().stream()
                .filter(name -> !name.startsWith("Rec"))
                .flatMap(name -> {
                            Map taxaSequenceMap = new HashMap<>();
                            //first diverge the pair of lines for the haplotype
                            taxaSequenceMap.put(name, createMutatedSequence(baseGenome, haplotypeDivergence,numberOfGenes,  lengthOfGenes,
                                    lengthOfInterGenes,  proportionOfInterGeneDeleted, 0));
                            //create rare alleles within the haplotype
                            genomeNames.get(name).forEach(derivedTaxonName -> {
                                System.out.println("Generating sequence for " + derivedTaxonName);
                                taxaSequenceMap.put(derivedTaxonName, createMutatedSequence(taxaSequenceMap.get(name), intraHaplotypeDivergence,
                                        numberOfGenes,  lengthOfGenes, lengthOfInterGenes,  proportionOfInterGeneDeleted, insertProportion));
                            });
                            return taxaSequenceMap.entrySet().stream();
                        }
                ).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
        List potentialParents= Lists.newArrayList(genomeNames.keySet());
        potentialParents.addAll(genomeNames.values());
        for (int i = 1; i < numberOfGenes-1; i++) {
            int breakpoint = i*(lengthOfGenes + lengthOfInterGenes);  //recombine after first intergenic region
            String parent1=potentialParents.get(random.nextInt(potentialParents.size()));
            String parent2=potentialParents.get(random.nextInt(potentialParents.size()));
            String recombinantSequence = genomeSequences.get(parent1).substring(0, breakpoint) + genomeSequences.get(parent2).substring(breakpoint);
            genomeSequences.put("Rec"+parent1+parent2+"gco"+i, recombinantSequence);
        }
        // Uncomment if you really want to see the genomes, but it prints
        // alot to the console, and this makes seeing the sqlite vs postgres
        // results more difficult as these genomes are created once for each run.
       // System.out.println(genomeSequences.toString());

        return genomeSequences;
    }

    private static String createMutatedSequence(String baseGenome, int mutationFrequency, int numberOfGenes, int lengthOfGenes,
            int lengthOfInterGenes, double proportionOfInterGeneDeleted, double insertionProportion) {
        StringBuilder sequence = new StringBuilder(baseGenome);
        int currIndex = 0;
        int maxAnchorIdx = lengthOfInterGenes > 0 ? numberOfGenes * 2 : numberOfGenes;
        for (int anchoridx = 0; anchoridx < maxAnchorIdx; anchoridx++) {
            //even anchoridx are genes, odd interachors.  If no interanchors, all are genes
            if (anchoridx % 2 == 0 || lengthOfInterGenes == 0) { // process anchors
                mutateSequence(sequence, currIndex, lengthOfGenes, mutationFrequency);
                if (insertionProportion > 0) { // only doing insertions for anchors, not inter-anchors.
                    insertSequence(sequence, currIndex, lengthOfGenes, insertionProportion); // LCJ arbitrary insert of 0.2 percent size. Only anchors for derived taxon have insertion
                }                
                currIndex += lengthOfGenes;
            } else { // process inter-anchor
                mutateSequence(sequence, currIndex, lengthOfInterGenes, mutationFrequency);                                
                deleteSequence(sequence, currIndex, lengthOfInterGenes, proportionOfInterGeneDeleted); // only inter-anchors have deletion
                currIndex += lengthOfInterGenes;
            }
        }
        return sequence.toString();
    }

    private static void deleteSequence(StringBuilder sequence, int currIndex, int lengthOfInterGenes, double proportionOfInterGeneDeleted) {
        int sizeOfDeletion = (int) (lengthOfInterGenes * proportionOfInterGeneDeleted);
        int deletionStart = currIndex + random.nextInt(lengthOfInterGenes - sizeOfDeletion);
        for (int i = deletionStart; i < deletionStart + sizeOfDeletion; i++) {
            sequence.setCharAt(i, '-');
        }
    }
    
    // For testing, the + will later be translated to a series of C 
    private static void insertSequence(StringBuilder sequence, int currIndex, int lengthOfGenes, double proportionOfGeneInsert) {
        int sizeOfInsertion = (int) (lengthOfGenes * proportionOfGeneInsert);
        if (sizeOfInsertion > 0) {
            int insertionStart = currIndex + random.nextInt(lengthOfGenes);
            sequence.setCharAt(insertionStart, '+');
        }

    }

    /*
    Base sequence are only ACG
     */
    private static String createBaseSequence(int length) {
        return random.ints(length, 0, 3) // 3 is A C G, values 0,1,2
                .mapToObj(i -> getHaplotypeNucleotide((byte) i))
                .collect(Collectors.joining(""));
    }

    /*
    Base sequence are only ACG
     */
    private static void mutateSequence(StringBuilder sequence, int offset, int length, int frequency) {
        for (int i = 0; i < length / frequency; i++) {
            sequence.setCharAt(offset + random.nextInt(length), 'T');
        }
    }

    /*
    Mutate a sequence with the full range of bp
     */
    private static void mutateSequence(StringBuilder sequence, int numberOfMutations) {
        for (int i = 0; i < numberOfMutations; i++) {
            sequence.setCharAt(random.nextInt(sequence.length()), getNucleotideIUPACChar((byte)random.nextInt(4)));
        }
    }


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy