All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.milaboratory.mitcr.cli.Main Maven / Gradle / Ivy

Go to download

MiTCR is an open source software package aimed at extraction of information on repertoire of T-cell clones from Next Generation Sequencing (NGS) data. It is designed with the knowledge of the critical challenges arising in everyday processing of immunological data.

The newest version!
/*
 * MiTCR 
 *
 * Copyright (c) 2010-2013:
 *     Bolotin Dmitriy     
 *     Chudakov Dmitriy    
 *
 * MiTCR is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program. If not, see .
 */
package com.milaboratory.mitcr.cli;

import cc.redberry.pipe.OutputPort;
import cc.redberry.pipe.util.CountLimitingOutputPort;
import com.milaboratory.core.clone.CloneSetClustered;
import com.milaboratory.core.io.CloneSetIO;
import com.milaboratory.core.segment.DefaultSegmentLibrary;
import com.milaboratory.core.segment.Gene;
import com.milaboratory.core.segment.SegmentLibrary;
import com.milaboratory.core.segment.Species;
import com.milaboratory.core.sequence.quality.QualityFormat;
import com.milaboratory.core.sequencing.io.fastq.SFastqReader;
import com.milaboratory.core.sequencing.read.SSequencingRead;
import com.milaboratory.mitcr.clonegenerator.AccumulatorType;
import com.milaboratory.mitcr.clonegenerator.BasicCloneGeneratorParameters;
import com.milaboratory.mitcr.clonegenerator.LQFilteringOffCloneGeneratorParameters;
import com.milaboratory.mitcr.clonegenerator.LQMappingCloneGeneratorParameters;
import com.milaboratory.mitcr.clsexport.ClsExporter;
import com.milaboratory.mitcr.clusterization.CloneClusterizationType;
import com.milaboratory.mitcr.pipeline.FullPipeline;
import com.milaboratory.mitcr.pipeline.Parameters;
import com.milaboratory.mitcr.qualitystrategy.DummyQualityInterpretationStrategy;
import com.milaboratory.mitcr.qualitystrategy.IlluminaQualityInterpretationStrategy;
import com.milaboratory.mitcr.statistics.AnalysisStatisticsAggregator;
import com.milaboratory.util.CompressionType;
import com.milaboratory.util.SmartProgressReporter;
import com.milaboratory.util.TablePrintStreamAdapter;
import org.apache.commons.cli.*;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import static java.lang.System.err;

public class Main {
    private static final Options options = new Options();
    private static final String PARAMETERS_SET_OPTION = "pset",
            EXPORT_OPTION = "export",
            SPECIES_OPTION = "species",
            GENE_OPTION = "gene",
            ERROR_CORECTION_LEVEL_OPTION = "ec",
            LQ_OPTION = "lq",
            QUALITY_THRESHOLD_OPTION = "quality",
            CLUSTERIZATION_OPTION = "pcrec",
            LIMIT_OPTION = "limit",
            REPORT_OPTION = "report",
            REPORTING_LEVEL_OPTION = "level",
            PHRED33_OPTION = "phred33",
            PHRED64_OPTION = "phred64",
            INCLUDE_CYS_PHE_OPTION = "cysphe",
            PRINT_DEBUG_OPTION = "debug",
            PRINT_VERSION_OPTION = "v",
            PRINT_HELP_OPTION = "h",
            THREADS_OPTION = "t",
            AVERAGE_QUALITY_OPTION = "average",
            COMPRESSED_OPTION = "compressed";
    private static final Map orderingMap = new HashMap<>();

    public static void main(String[] args) {
        int o = 0;

        BuildInformation buildInformation = BuildInformationProvider.get();

        final boolean isProduction = "default".equals(buildInformation.scmBranch); // buildInformation.version != null && buildInformation.version.lastIndexOf("SNAPSHOT") < 0;

        orderingMap.put(PARAMETERS_SET_OPTION, o++);
        orderingMap.put(SPECIES_OPTION, o++);
        orderingMap.put(GENE_OPTION, o++);
        orderingMap.put(ERROR_CORECTION_LEVEL_OPTION, o++);
        orderingMap.put(QUALITY_THRESHOLD_OPTION, o++);
        orderingMap.put(AVERAGE_QUALITY_OPTION, o++);
        orderingMap.put(LQ_OPTION, o++);
        orderingMap.put(CLUSTERIZATION_OPTION, o++);
        orderingMap.put(INCLUDE_CYS_PHE_OPTION, o++);
        orderingMap.put(LIMIT_OPTION, o++);
        orderingMap.put(EXPORT_OPTION, o++);
        orderingMap.put(REPORT_OPTION, o++);
        orderingMap.put(REPORTING_LEVEL_OPTION, o++);
        orderingMap.put(PHRED33_OPTION, o++);
        orderingMap.put(PHRED64_OPTION, o++);
        orderingMap.put(THREADS_OPTION, o++);
        orderingMap.put(COMPRESSED_OPTION, o++);
        orderingMap.put(PRINT_HELP_OPTION, o++);
        orderingMap.put(PRINT_VERSION_OPTION, o++);
        orderingMap.put(PRINT_DEBUG_OPTION, o++);

        options.addOption(OptionBuilder.withArgName("preset name")
                .hasArg().withDescription("preset of pipeline parameters to use")
                .create(PARAMETERS_SET_OPTION));

        options.addOption(OptionBuilder.withArgName("species")
                .hasArg().withDescription("overrides species ['hs' for Homo sapiens, 'mm' for us Mus musculus] " +
                        "(default for built-in presets is 'hs')")
                .create(SPECIES_OPTION));

        options.addOption(OptionBuilder.withArgName("gene")
                .hasArg().withDescription("overrides gene: TRB or TRA (default value for built-in parameter sets is TRB)")
                .create(GENE_OPTION));

        options.addOption(OptionBuilder.withArgName("0|1|2")
                .hasArg().withDescription("overrides error correction level (0 = don't correct errors, 1 = correct sequenecing " +
                        "errors only (see -" + QUALITY_THRESHOLD_OPTION + " and -" + LQ_OPTION + " options for details), " +
                        "2 = also correct PCR errors (see -" + CLUSTERIZATION_OPTION + " option)")
                .create(ERROR_CORECTION_LEVEL_OPTION));


        options.addOption(OptionBuilder.withArgName("value")
                .hasArg().withDescription("overrides quality threshold value for segment alignment and bad quality sequences " +
                        "correction algorithms. 0 tells the program not to process quality information. (default is 25)")
                .create(QUALITY_THRESHOLD_OPTION));

        if (!isProduction)
            options.addOption(OptionBuilder.hasArg(false).withDescription("use this option to output average instead of " +
                    "maximal, quality for CDR3 nucleotide sequences. (Experimental option, use with caution.)")
                    .create(AVERAGE_QUALITY_OPTION));

        options.addOption(OptionBuilder.withArgName("map | drop").hasArg()
                .withDescription("overrides low quality CDR3s processing strategy (drop = filter off, " +
                        "map = map onto clonotypes created from the high quality CDR3s). This option makes no difference if " +
                        "quality threshold (-" + QUALITY_THRESHOLD_OPTION + " option) is set to 0, or error correction " +
                        "level (-" + ERROR_CORECTION_LEVEL_OPTION + ") is 0.")
                .create(LQ_OPTION));

        options.addOption(OptionBuilder.withArgName("smd | ete").hasArg()
                .withDescription("overrides the PCR error correction algorithm: smd = \"save my diversity\", " +
                        "ete = \"eliminate these errors\". Default value for built-in parameters is ete.")
                .create(CLUSTERIZATION_OPTION));


        options.addOption(OptionBuilder.withArgName("0|1")
                .hasArg().withDescription("overrides weather include bounding Cys & Phe into CDR3 sequence")
                .create(INCLUDE_CYS_PHE_OPTION));

        options.addOption(OptionBuilder.withArgName("# of reads")
                .hasArg().withDescription("limits the number of input sequencing reads, use this parameter to " +
                        "normalize several datasets or to have a glance at the data")
                .create(LIMIT_OPTION));

        options.addOption(OptionBuilder.withArgName("new name")
                .hasArg().withDescription("use this option to export presets to a local xml files")
                .create(EXPORT_OPTION));

        options.addOption(OptionBuilder.withArgName("file name")
                .hasArg().withDescription("use this option to write analysis report (summary) to file")
                .create(REPORT_OPTION));

        options.addOption(OptionBuilder.withArgName("1|2|3")
                .hasArg(true).withDescription("output detalization level (1 = simple, 2 = medium, 3 = full, this format " +
                        "could be deserialized using mitcr API). Affects only tab-delimited output. Default value is 3.")
                .create(REPORTING_LEVEL_OPTION));

        options.addOption(OptionBuilder
                .hasArg(false).withDescription("add this option if input file is in old illumina format with 64 byte offset for quality " +
                        "string (MiTCR will try to automatically detect file format if one of the \"-phredXX\" options is not provided)")
                .create(PHRED64_OPTION));

        options.addOption(OptionBuilder
                .hasArg(false).withDescription("add this option if input file is in Phred+33 format for quality values " +
                        "(MiTCR will try to automatically detect file format if one of the \"-phredXX\" options is not provided)")
                .create(PHRED33_OPTION));

        options.addOption(OptionBuilder.withArgName("threads")
                .hasArg().withDescription("specifies the number of CDR3 extraction threads (default = number of available CPU cores)")
                .create(THREADS_OPTION));

        if (!isProduction)
            options.addOption(OptionBuilder.hasArg(false).withDescription("use compressed data structures for storing individual " +
                    "clone segments statistics (from which arises the clone segment information). This option reduces required " +
                    "amount of memory, but introduces small stochastic errors into the algorithm which determines clone " +
                    "segments. (Experimental option, use with caution.)")
                    .create(COMPRESSED_OPTION));

        options.addOption(OptionBuilder.hasArg(false).withDescription("print this message")
                .create(PRINT_HELP_OPTION));

        options.addOption(OptionBuilder
                .hasArg(false).withDescription("print version information")
                .create(PRINT_VERSION_OPTION));

        options.addOption(OptionBuilder
                .hasArg(false).withDescription("print additional information about analysis process")
                .create(PRINT_DEBUG_OPTION));


        PosixParser parser = new PosixParser();

        try {
            long input_limit = -1;
            int threads = Runtime.getRuntime().availableProcessors();
            int reporting_level = 3;
            int ec_level = 2;

            CommandLine cl = parser.parse(options, args, true);
            if (cl.hasOption(PRINT_HELP_OPTION)) {
                printHelp();
                return;
            }

            boolean averageQuality = cl.hasOption(AVERAGE_QUALITY_OPTION), compressedAggregators = cl.hasOption(COMPRESSED_OPTION);

            if (cl.hasOption(PRINT_VERSION_OPTION)) {
                System.out.println("MiTCR by MiLaboratory, version: " + buildInformation.version);
                System.out.println("Branch: " + buildInformation.scmBranch);
                System.out.println("Built: " + buildInformation.buildDate + ", " + buildInformation.jdk + " JDK, " +
                        "build machine: " + buildInformation.builtBy);
                System.out.println("SCM changeset: " + buildInformation.scmChangeset + " (" + buildInformation.scmDate.replace("\"", "") + ")");
                return;
            }

            //Normal execution

            String paramName = cl.getOptionValue(PARAMETERS_SET_OPTION);

            if (paramName == null) {
                err.println("No parameters set is specified.");
                return;
            }


            Parameters params = ParametersIO.getParameters(paramName);

            if (params == null) {
                err.println("No parameters set found with name '" + paramName + "'.");
                return;
            }

            String value;

            if ((value = cl.getOptionValue(THREADS_OPTION)) != null)
                threads = Integer.decode(value);

            if ((value = cl.getOptionValue(REPORTING_LEVEL_OPTION)) != null)
                reporting_level = Integer.decode(value);

            if ((value = cl.getOptionValue(LIMIT_OPTION)) != null)
                input_limit = Long.decode(value);

            if ((value = cl.getOptionValue(GENE_OPTION)) != null)
                params.setGene(Gene.fromXML(value));

            if ((value = cl.getOptionValue(SPECIES_OPTION)) != null)
                params.setSpecies(Species.getFromShortName(value));


            if ((value = cl.getOptionValue(INCLUDE_CYS_PHE_OPTION)) != null) {
                if (value.equals("1"))
                    params.getCDR3ExtractorParameters().setIncludeCysPhe(true);
                else if (value.equals("0"))
                    params.getCDR3ExtractorParameters().setIncludeCysPhe(false);
                else {
                    err.println("Illegal value for -" + INCLUDE_CYS_PHE_OPTION + " parameter.");
                    return;
                }
            }

            if ((value = cl.getOptionValue(ERROR_CORECTION_LEVEL_OPTION)) != null) {
                int v = Integer.decode(value);
                ec_level = v;
                if (v == 0) {
                    params.setCloneGeneratorParameters(new BasicCloneGeneratorParameters());
                    params.setClusterizationType(CloneClusterizationType.None);
                } else if (v == 1) {
                    params.setCloneGeneratorParameters(new LQMappingCloneGeneratorParameters());
                    params.setClusterizationType(CloneClusterizationType.None);
                } else if (v == 2) {
                    params.setCloneGeneratorParameters(new LQMappingCloneGeneratorParameters());
                    params.setClusterizationType(CloneClusterizationType.OneMismatch, .1f);
                } else
                    throw new RuntimeException("This (" + v +
                            ") error correction level is not supported.");
            }


            if ((value = cl.getOptionValue(QUALITY_THRESHOLD_OPTION)) != null) {
                int v = Integer.decode(value);
                if (v == 0)
                    params.setQualityInterpretationStrategy(new DummyQualityInterpretationStrategy());
                else
                    params.setQualityInterpretationStrategy(new IlluminaQualityInterpretationStrategy((byte) v));
            }

            if ((value = cl.getOptionValue(LQ_OPTION)) != null)
                if (ec_level > 0)
                    switch (value) {
                        case "map":
                            params.setCloneGeneratorParameters(new LQMappingCloneGeneratorParameters(
                                    ((BasicCloneGeneratorParameters) params.getCloneGeneratorParameters()).getSegmentInformationAggregationFactor(),
                                    3, true));
                            break;
                        case "drop":
                            params.setCloneGeneratorParameters(new LQFilteringOffCloneGeneratorParameters(
                                    ((BasicCloneGeneratorParameters) params.getCloneGeneratorParameters()).getSegmentInformationAggregationFactor()));
                            break;
                        default:
                            throw new RuntimeException("Wrong value for -" + LQ_OPTION + " option.");
                    }


            if ((value = cl.getOptionValue(CLUSTERIZATION_OPTION)) != null)
                if (ec_level > 1) // == 2
                    switch (value) {
                        case "smd":
                            params.setClusterizationType(CloneClusterizationType.V2D1J2T3Explicit);
                            break;
                        case "ete":
                            params.setClusterizationType(CloneClusterizationType.OneMismatch);
                            break;
                        default:
                            throw new RuntimeException("Wrong value for -" + CLUSTERIZATION_OPTION + " option.");
                    }

            ((BasicCloneGeneratorParameters) params.getCloneGeneratorParameters()).setAccumulatorType(AccumulatorType.get(compressedAggregators, averageQuality));

            if ((value = cl.getOptionValue(EXPORT_OPTION)) != null) {
                //Exporting parameters
                ParametersIO.exportParameters(params, value);
                return;
            }

            String[] offArgs = cl.getArgs();

            if (offArgs.length == 0) {
                err.println("Input file not specified.");
                return;
            } else if (offArgs.length == 1) {
                err.println("Output file not specified.");
                return;
            } else if (offArgs.length > 2) {
                err.println("Unrecognized argument.");
                return;
            }

            String inputFileName = offArgs[0];
            String outputFileName = offArgs[1];


            File input = new File(inputFileName);

            if (!input.exists()) {
                err.println("Input file not found.");
                return;
            }

            //TODO This also done inside SFastqReader constructor
            CompressionType compressionType = CompressionType.None;
            if (inputFileName.endsWith(".gz"))
                compressionType = CompressionType.GZIP;

            QualityFormat format = null; // If variable remains null file format will be detected automatically
            if (cl.hasOption(PHRED33_OPTION))
                format = QualityFormat.Phred33;
            if (cl.hasOption(PHRED64_OPTION))
                if (format == null)
                    format = QualityFormat.Phred64;
                else {
                    err.println("Options: -" + PHRED33_OPTION + " and -" + PHRED64_OPTION + " are mutually exclusive");
                    return;
                }


            SFastqReader reads = format == null ?
                    new SFastqReader(input, compressionType) :
                    new SFastqReader(input, format, compressionType);

            OutputPort inputToPipeline = reads;
            if (input_limit >= 0)
                inputToPipeline = new CountLimitingOutputPort<>(inputToPipeline, input_limit);

            SegmentLibrary library = DefaultSegmentLibrary.load();

            AnalysisStatisticsAggregator statisticsAggregator = new AnalysisStatisticsAggregator();

            FullPipeline pipeline = new FullPipeline(inputToPipeline, params, false, library);
            pipeline.setThreads(threads);
            pipeline.setAnalysisListener(statisticsAggregator);

            new Thread(new SmartProgressReporter(pipeline, err)).start(); // Printing status to the standard error stream

            pipeline.run();

            if (cl.hasOption(PRINT_DEBUG_OPTION)) {
                err.println("Memory = " + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()));
                err.println("Clusterization: " + pipeline.getQC().getReadsClusterized() + "% of reads, " + pipeline.getQC().getClonesClusterized() + " % clones");
            }

            CloneSetClustered cloneSet = pipeline.getResult();

            if ((value = cl.getOptionValue(REPORT_OPTION)) != null) {
                File file = new File(value);
                TablePrintStreamAdapter table;
                if (file.exists())
                    table = new TablePrintStreamAdapter(new FileOutputStream(file, true));
                else {
                    table = new TablePrintStreamAdapter(file);
                    ReportExporter.printHeader(table);
                }
                //CloneSetQualityControl qc = new CloneSetQualityControl(library, params.getSpecies(), params.getGene(), cloneSet);
                ReportExporter.printRow(table, inputFileName, outputFileName, pipeline.getQC(), statisticsAggregator);
                table.close();
            }

            if (outputFileName.endsWith(".cls"))
                ClsExporter.export(pipeline, outputFileName.replace(".cls", "") + " " + new Date().toString(), input.getName(), outputFileName);
            else {
                //Dry run
                if (outputFileName.startsWith("-"))
                    return;

                ExportDetalizationLevel detalization = ExportDetalizationLevel.fromLevel(reporting_level);

                CompressionType compressionType1 = CompressionType.None;
                if (outputFileName.endsWith(".gz"))
                    compressionType1 = CompressionType.GZIP;
                CloneSetIO.exportCloneSet(outputFileName, cloneSet, detalization, params, input.getAbsolutePath(), compressionType1);
            }
        } catch (ParseException | RuntimeException | IOException e) {
            err.println("Error occurred in the analysis pipeline.");
            err.println();
            e.printStackTrace();
            //printHelp();
        }
    }

    public static void printHelp() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setOptionComparator(new Comparator




© 2015 - 2025 Weber Informatics LLC | Privacy Policy