All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.nmdp.ngs.tools.GenerateReads Maven / Gradle / Ivy

There is a newer version: 1.8.3
Show newest version
/*

    ngs-tools  Next generation sequencing (NGS/HTS) command line tools.
    Copyright (c) 2014-2015 National Marrow Donor Program (NMDP)

    This library is free software; you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published
    by the Free Software Foundation; either version 3 of the License, or (at
    your option) any later version.

    This library is distributed in the hope that it will be useful, but WITHOUT
    ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
    License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this library;  if not, write to the Free Software Foundation,
    Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.

    > http://www.gnu.org/licenses/lgpl.html

*/
package org.nmdp.ngs.tools;

import static com.google.common.base.Preconditions.checkNotNull;

import static org.dishevelled.compress.Readers.reader;
import static org.dishevelled.compress.Writers.writer;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.concurrent.Callable;

import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.distribution.RealDistribution;

import org.apache.commons.math3.random.MersenneTwister;
import org.apache.commons.math3.random.RandomGenerator;

import org.biojava.bio.program.fastq.FastqVariant;
import org.biojava.bio.program.fastq.SangerFastqWriter;

import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;

import org.biojava.bio.seq.io.SeqIOTools;

import org.dishevelled.commandline.ArgumentList;
import org.dishevelled.commandline.CommandLine;
import org.dishevelled.commandline.CommandLineParseException;
import org.dishevelled.commandline.CommandLineParser;
import org.dishevelled.commandline.Switch;
import org.dishevelled.commandline.Usage;

import org.dishevelled.commandline.argument.DoubleArgument;
import org.dishevelled.commandline.argument.FileArgument;
import org.dishevelled.commandline.argument.IntegerArgument;
import org.dishevelled.commandline.argument.StringArgument;

import org.nmdp.ngs.reads.CoverageStrategy;
import org.nmdp.ngs.reads.MutationStrategy;
import org.nmdp.ngs.reads.QualityStrategy;

import org.nmdp.ngs.reads.coverage.MeanCoverageStrategy;
import org.nmdp.ngs.reads.coverage.MinimumCoverageStrategy;

import org.nmdp.ngs.reads.mutation.AmbiguousSubstitutionMutationStrategy;
import org.nmdp.ngs.reads.mutation.CompositeMutationStrategy;
import org.nmdp.ngs.reads.mutation.DeletionMutationStrategy;
import org.nmdp.ngs.reads.mutation.IdentityMutationStrategy;
import org.nmdp.ngs.reads.mutation.IndelMutationStrategy;
import org.nmdp.ngs.reads.mutation.InsertionMutationStrategy;
import org.nmdp.ngs.reads.mutation.SubstitutionMutationStrategy;

import org.nmdp.ngs.reads.quality.RealDistributionQualityStrategy;
import org.nmdp.ngs.reads.quality.ScoreFunctionQualityStrategy;
import org.nmdp.ngs.reads.quality.ScoreFunctions;

/**
 * Generate next generation sequencing (NGS/HTS) reads.
 */
@SuppressWarnings("deprecation")
public final class GenerateReads implements Callable {
    private final File referenceFile;
    private final File readFile;
    private final RandomGenerator random;
    private final RealDistribution length;
    private final QualityStrategy quality;
    private final CoverageStrategy coverage;
    private final double mutationRate;
    private final MutationStrategy mutation;

    private static final double DEFAULT_MEAN_LENGTH = 60.0d;
    private static final double DEFAULT_LENGTH_VARIATION = 10.0d;
    private static final int DEFAULT_MINIMUM_COVERAGE = 20;
    private static final double DEFAULT_MEAN_QUALITY = 25.0d;
    private static final double DEFAULT_QUALITY_VARIATION = 5.0d;
    private static final double DEFAULT_MEAN_QUALITY_WEIGHT = 0.9d;
    private static final double DEFAULT_QUALITY_WEIGHT_VARIATION = 0.1d;
    private static final double DEFAULT_EXTEND_INSERTION_RATE = 0.4d;
    private static final int DEFAULT_MAXIMUM_INSERTION_LENGTH = 8;
    private static final double DEFAULT_INSERTION_RATE = 0.5d;
    private static final double DEFAULT_DELETION_RATE = 0.5d;
    private static final double DEFAULT_SUBSTITUTION_RATE = 0.4d;
    private static final double DEFAULT_INDEL_RATE = 0.6d;
    private static final double DEFAULT_AMBIGUOUS_RATE = 0.0d;
    private static final double DEFAULT_MUTATION_RATE = 0.05d;
    private static final double NO_VARIATION = 1.0E-100;
    static final CoverageStrategy DEFAULT_COVERAGE = new MinimumCoverageStrategy(DEFAULT_MINIMUM_COVERAGE);
    static final MutationStrategy DEFAULT_MUTATION = new IdentityMutationStrategy();
    private static final String USAGE = "ngs-generate-reads [args]";


    /**
     * Generate next generation sequencing (NGS/HTS) reads.
     *
     * @param referenceFile reference file, if any
     * @param readFile read file, if any
     * @param random random generator, must not be null
     * @param length length distribution, must not be null
     * @param quality quality strategy, must not be null
     * @param coverage coverage strategy, must not be null
     * @param mutationRate mutation rate
     * @param mutation mutation strategy, must not be null
     */
    public GenerateReads(final File referenceFile,
                         final File readFile,
                         final RandomGenerator random,
                         final RealDistribution length,
                         final QualityStrategy quality,
                         final CoverageStrategy coverage,
                         final double mutationRate,
                         final MutationStrategy mutation) {

        checkNotNull(random);
        checkNotNull(length);
        checkNotNull(quality);
        checkNotNull(coverage);
        checkNotNull(mutation);
        this.referenceFile = referenceFile;
        this.readFile = readFile;
        this.random = random;
        this.length = length;
        this.quality = quality;
        this.coverage = coverage;
        this.mutationRate = mutationRate;
        this.mutation = mutation;
    }


    @Override
    public Integer call() throws Exception {
        BufferedReader reader = null;
        PrintWriter writer = null;
        try {
            reader = reader(referenceFile);

            SequenceIterator sequences = SeqIOTools.readFastaDNA(reader);
            while (sequences.hasNext()) {
                Sequence sequence = sequences.nextSequence();
                try {
                    writer = writer(readFile, true);
                    new org.nmdp.ngs.reads.GenerateReads(sequence, FastqVariant.FASTQ_SANGER, random, length, quality, coverage, mutationRate, mutation, writer, new SangerFastqWriter()).run();
                }
                catch (IOException e) {
                    e.printStackTrace();
                    System.exit(-1);
                }
                finally {
                    try {
                        writer.flush();
                    }
                    catch (Exception e) {
                        // ignore
                    }
                }
            }

            return 0;
        }
        finally {
            try {
                reader.close();
            }
            catch (Exception e) {
                // ignore
            }
            try {
                writer.close();
            }
            catch (Exception e) {
                // ignore
            }
        }
    }


    /**
     * Main.
     *
     * @param args command line arguments
     */
    public static void main(final String[] args) {
        Switch about = new Switch("a", "about", "display about message");
        Switch help = new Switch("h", "help", "display help message");
        FileArgument referenceFile = new FileArgument("r", "reference", "reference input file, in fasta format, default stdin", false);
        FileArgument readFile = new FileArgument("o", "read", "read output file, in fastq format, default stdout", false);

        DoubleArgument meanLength = new DoubleArgument("l", "mean-length", "mean length, default " + DEFAULT_MEAN_LENGTH, false);
        DoubleArgument lengthVariation = new DoubleArgument("v", "length-variation", "length variation, default " + DEFAULT_LENGTH_VARIATION, false);
        IntegerArgument minimumCoverage = new IntegerArgument("c", "minimum-coverage", "minimum coverage, default " + DEFAULT_MINIMUM_COVERAGE, false);
        IntegerArgument meanCoverage = new IntegerArgument("g", "mean-coverage", "mean coverage", false);
        StringArgument qualityType = new StringArgument("u", "quality", "quality strategy type { illumina, normal }, default normal", false);
        DoubleArgument meanQualityWeight = new DoubleArgument("w", "mean-quality-weight", "mean quality weight, default " + DEFAULT_MEAN_QUALITY_WEIGHT, false);
        DoubleArgument qualityWeightVariation = new DoubleArgument("t", "quality-weight-variation", "quality weight variation, default " + DEFAULT_QUALITY_WEIGHT_VARIATION, false);
        DoubleArgument meanQuality = new DoubleArgument("q", "mean-quality", "mean quality, default " + DEFAULT_MEAN_QUALITY, false);
        DoubleArgument qualityVariation = new DoubleArgument("f", "quality-variation", "quality variation, default " + DEFAULT_QUALITY_VARIATION, false);
        StringArgument mutationType = new StringArgument("m", "mutation", "mutation strategy type { substitution, insertion, deletion, ambiguous, indel, composite }, default identity", false);
        DoubleArgument extendInsertionRate = new DoubleArgument("x", "extend-insertion-rate", "extend insertion rate, default " + DEFAULT_EXTEND_INSERTION_RATE, false);
        IntegerArgument maximumInsertionLength = new IntegerArgument("e", "maximum-insertion-length", "maximum insertion length, default " + DEFAULT_MAXIMUM_INSERTION_LENGTH, false);
        DoubleArgument insertionRate = new DoubleArgument("i", "insertion-rate", "insertion rate, default " + DEFAULT_INSERTION_RATE, false);
        DoubleArgument deletionRate = new DoubleArgument("d", "deletion-rate", "deletion rate, default " + DEFAULT_DELETION_RATE, false);
        DoubleArgument substitutionRate = new DoubleArgument("s", "substitution-rate", "substitution rate, default " + DEFAULT_SUBSTITUTION_RATE, false);
        DoubleArgument indelRate = new DoubleArgument("y", "indel-rate", "indel rate, default " + DEFAULT_INDEL_RATE, false);
        DoubleArgument ambiguousRate = new DoubleArgument("b", "ambiguous-rate", "ambiguous substitution rate, default " + DEFAULT_AMBIGUOUS_RATE, false);
        DoubleArgument mutationRate = new DoubleArgument("n", "mutation-rate", "mutation rate, default " + DEFAULT_MUTATION_RATE, false);
        IntegerArgument seed = new IntegerArgument("z", "seed", "random number seed, default relates to current time", false);

        ArgumentList arguments = new ArgumentList(about, help, referenceFile, readFile, meanLength, lengthVariation, minimumCoverage, meanCoverage,
                                                  qualityType, meanQualityWeight, qualityWeightVariation, meanQuality, qualityVariation,
                                                  mutationType, extendInsertionRate, maximumInsertionLength, insertionRate, deletionRate,
                                                  substitutionRate, indelRate, ambiguousRate, mutationRate, seed);

        CommandLine commandLine = new CommandLine(args);

        GenerateReads generateReads = null;
        try {
            CommandLineParser.parse(commandLine, arguments);
            if (about.wasFound()) {
                About.about(System.out);
                System.exit(0);
            }
            if (help.wasFound()) {
                Usage.usage(USAGE, null, commandLine, arguments, System.out);
                System.exit(0);
            }

            RandomGenerator random = seed.wasFound() ? new MersenneTwister(seed.getValue()) : new MersenneTwister();

            double lv = Math.max(NO_VARIATION, lengthVariation.getValue(DEFAULT_LENGTH_VARIATION));
            RealDistribution length = new NormalDistribution(random, meanLength.getValue(DEFAULT_MEAN_LENGTH), lv, NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);

            CoverageStrategy coverage = DEFAULT_COVERAGE;
            if (minimumCoverage.wasFound()) {
                coverage = new MinimumCoverageStrategy(minimumCoverage.getValue());
            }
            else if (meanCoverage.wasFound()) {
                coverage = new MeanCoverageStrategy(meanCoverage.getValue());
            }

            QualityStrategy quality = null;
            if ("illumina".equals(qualityType.getValue())) {
                RealDistribution realDistribution = new NormalDistribution(random, meanQualityWeight.getValue(DEFAULT_MEAN_QUALITY_WEIGHT), qualityWeightVariation.getValue(DEFAULT_QUALITY_WEIGHT_VARIATION), NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
                quality = new ScoreFunctionQualityStrategy(realDistribution, ScoreFunctions.illumina());
            }
            else {
                RealDistribution realDistribution = new NormalDistribution(random, meanQuality.getValue(DEFAULT_MEAN_QUALITY), qualityVariation.getValue(DEFAULT_QUALITY_VARIATION), NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
                quality = new RealDistributionQualityStrategy(realDistribution);
            }

            MutationStrategy mutation = DEFAULT_MUTATION;
            if (mutationType.wasFound()) {
                if ("substitution".equals(mutationType.getValue())) {
                    mutation = new SubstitutionMutationStrategy(random);
                }
                else if ("ambiguous".equals(mutationType.getValue())) {
                    mutation = new AmbiguousSubstitutionMutationStrategy();
                }
                else if ("insertion".equals(mutationType.getValue())) {
                    mutation = new InsertionMutationStrategy(random, extendInsertionRate.getValue(DEFAULT_EXTEND_INSERTION_RATE), maximumInsertionLength.getValue(DEFAULT_MAXIMUM_INSERTION_LENGTH));
                }
                else if ("deletion".equals(mutationType.getValue())) {
                    mutation = new DeletionMutationStrategy();
                }
                else if ("indel".equals(mutationType.getValue())) {
                    InsertionMutationStrategy insertion = new InsertionMutationStrategy(random, insertionRate.getValue(DEFAULT_INSERTION_RATE), maximumInsertionLength.getValue(DEFAULT_MAXIMUM_INSERTION_LENGTH));
                    DeletionMutationStrategy deletion = new DeletionMutationStrategy();
                    mutation = new IndelMutationStrategy(random, insertion, insertionRate.getValue(DEFAULT_INSERTION_RATE), deletion, deletionRate.getValue(DEFAULT_DELETION_RATE));
                }
                else if ("composite".equals(mutationType.getValue())) {
                    SubstitutionMutationStrategy substitution = new SubstitutionMutationStrategy(random);
                    InsertionMutationStrategy insertion = new InsertionMutationStrategy(random, insertionRate.getValue(DEFAULT_INSERTION_RATE), maximumInsertionLength.getValue(DEFAULT_MAXIMUM_INSERTION_LENGTH));
                    DeletionMutationStrategy deletion = new DeletionMutationStrategy();
                    IndelMutationStrategy indel = new IndelMutationStrategy(random, insertion, insertionRate.getValue(DEFAULT_INSERTION_RATE), deletion, deletionRate.getValue(DEFAULT_DELETION_RATE));
                    AmbiguousSubstitutionMutationStrategy ambiguous = new AmbiguousSubstitutionMutationStrategy();
                    mutation = new CompositeMutationStrategy(random, substitution, substitutionRate.getValue(DEFAULT_SUBSTITUTION_RATE), indel, indelRate.getValue(DEFAULT_INDEL_RATE), ambiguous, ambiguousRate.getValue(DEFAULT_AMBIGUOUS_RATE));
                }
            }

            generateReads = new GenerateReads(referenceFile.getValue(), readFile.getValue(), random, length, quality, coverage, mutationRate.getValue(DEFAULT_MUTATION_RATE), mutation);
        }
        catch (CommandLineParseException e) {
            if (about.wasFound()) {
                About.about(System.out);
                System.exit(0);
            }
            if (help.wasFound()) {
                Usage.usage(USAGE, null, commandLine, arguments, System.out);
                System.exit(0);
            }
            Usage.usage(USAGE, e, commandLine, arguments, System.err);
            System.exit(-1);
        }
        try {
            System.exit(generateReads.call());
        }
        catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy