All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.sam.AddOrReplaceReadGroups Maven / Gradle / Ivy

Go to download

A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) data and formats such as SAM/BAM/CRAM and VCF.

There is a newer version: 3.2.0
Show newest version
package picard.sam;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileHeader.SortOrder;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Iso8601Date;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.cmdline.CommandLineProgram;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Assigns all the reads in a file to a single new read-group.
 *
 * 

Summary

* Many tools (Picard and GATK for example) require or assume the presence of at least one RG tag, defining a "read-group" * to which each read can be assigned (as specified in the RG tag in the SAM record). * This tool enables the user to assign all the reads in the {@link #INPUT} to a single new read-group. * For more information about read-groups, see the * GATK Dictionary entry. *
* This tool accepts as INPUT BAM and SAM files or URLs from the * Global Alliance for Genomics and Health (GA4GH). *

Usage example:

*
 * java -jar picard.jar AddOrReplaceReadGroups \
 *       I=input.bam \
 *       O=output.bam \
 *       RGID=4 \
 *       RGLB=lib1 \
 *       RGPL=ILLUMINA \
 *       RGPU=unit1 \
 *       RGSM=20
 * 
*

Caveats

* The value of the tags must adhere (according to the SAM-spec) * with the regex
{@value #READGROUP_ID_REGEX}
(one or more characters from the ASCII range 32 through 126). In * particular <Space> is the only non-printing character allowed. *
* The program enables only the wholesale assignment of all the reads in the {@link #INPUT} to a single read-group. If your file * already has reads assigned to multiple read-groups, the original RG value will be lost. * * @author mdepristo */ @CommandLineProgramProperties( summary = AddOrReplaceReadGroups.USAGE_SUMMARY + AddOrReplaceReadGroups.USAGE_DETAILS, oneLineSummary = AddOrReplaceReadGroups.USAGE_SUMMARY, programGroup = ReadDataManipulationProgramGroup.class) @DocumentedFeature public class AddOrReplaceReadGroups extends CommandLineProgram { static final public String READGROUP_ID_REGEX="^[ -~]+$"; static final String USAGE_SUMMARY = "Assigns all the reads in a file to a single new read-group."; static final String USAGE_DETAILS = "\n\nThis tool accepts INPUT BAM and SAM files or URLs from the Global Alliance for Genomics and Health (GA4GH).\n" + "

Usage example:

" + "\n"+ "java -jar picard.jar AddOrReplaceReadGroups \\\n" + " I=input.bam \\\n" + " O=output.bam \\\n" + " RGID=4 \\\n" + " RGLB=lib1 \\\n" + " RGPL=ILLUMINA \\\n" + " RGPU=unit1 \\\n" + " RGSM=20\n " + "\n" + "

Caveats

\n" + "The value of the tags must adhere (according to the SAM-spec) " + "with the regex '" + READGROUP_ID_REGEX + "' (one or more characters from the ASCII range 32 through 126). " + "In particular <Space> is the only non-printing character allowed.\n" + "\n" + "The program enables only the wholesale assignment of all the reads in the INPUT to a single read-group. If your file " + "already has reads assigned to multiple read-groups, the original RG value will be lost. \n\n" + "For more information about read-groups, see the " + "GATK Dictionary entry."; @Argument(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input file (BAM or SAM or a GA4GH url).") public String INPUT = null; @Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (BAM or SAM).") public File OUTPUT = null; @Argument(shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, optional = true, doc = "Optional sort order to output in. If not supplied OUTPUT is in the same order as INPUT.") public SortOrder SORT_ORDER; @Argument(shortName = "ID", doc = "Read-Group ID") public String RGID = "1"; @Argument(shortName = "LB", doc = "Read-Group library") public String RGLB; @Argument(shortName = "PL", doc = "Read-Group platform (e.g. ILLUMINA, SOLID)") public String RGPL; @Argument(shortName = "PU", doc = "Read-Group platform unit (eg. run barcode)") public String RGPU; @Argument(shortName = "SM", doc = "Read-Group sample name") public String RGSM; @Argument(shortName = "CN", doc = "Read-Group sequencing center name", optional = true) public String RGCN; @Argument(shortName = "DS", doc = "Read-Group description", optional = true) public String RGDS; @Argument(shortName = "DT", doc = "Read-Group run date", optional = true) public Iso8601Date RGDT; @Argument(shortName = "KS", doc = "Read-Group key sequence", optional = true) public String RGKS; @Argument(shortName = "FO", doc = "Read-Group flow order", optional = true) public String RGFO; @Argument(shortName = "PI", doc = "Read-Group predicted insert size", optional = true) public Integer RGPI; @Argument(shortName = "PG", doc = "Read-Group program group", optional = true) public String RGPG; @Argument(shortName = "PM", doc = "Read-Group platform model", optional = true) public String RGPM; private final Log log = Log.getInstance(AddOrReplaceReadGroups.class); protected int doWork() { IOUtil.assertInputIsValid(INPUT); IOUtil.assertFileIsWritable(OUTPUT); final SamReader in = SamReaderFactory.makeDefault() .referenceSequence(REFERENCE_SEQUENCE) .open(SamInputResource.of(INPUT)); // create the read-group we'll be using final SAMReadGroupRecord rg = new SAMReadGroupRecord(RGID); rg.setLibrary(RGLB); rg.setPlatform(RGPL); rg.setSample(RGSM); rg.setPlatformUnit(RGPU); if (RGCN != null) rg.setSequencingCenter(RGCN); if (RGDS != null) rg.setDescription(RGDS); if (RGDT != null) rg.setRunDate(RGDT); if (RGPI != null) rg.setPredictedMedianInsertSize(RGPI); if (RGPG != null) rg.setProgramGroup(RGPG); if (RGPM != null) rg.setPlatformModel(RGPM); if (RGKS != null) rg.setKeySequence(RGKS); if (RGFO != null) rg.setFlowOrder(RGFO); log.info(String.format("Created read-group ID=%s PL=%s LB=%s SM=%s%n", rg.getId(), rg.getPlatform(), rg.getLibrary(), rg.getSample())); // create the new header and output file final SAMFileHeader inHeader = in.getFileHeader(); final SAMFileHeader outHeader = inHeader.clone(); outHeader.setReadGroups(Collections.singletonList(rg)); if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER); final SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), OUTPUT); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord read : in) { read.setAttribute(SAMTag.RG.name(), RGID); outWriter.addAlignment(read); progress.record(read); } // cleanup CloserUtil.close(in); outWriter.close(); return 0; } @Override protected String[] customCommandLineValidation() { final List validationFailures = new ArrayList<>(); checkTagValue("RGID", RGID).ifPresent(validationFailures::add); checkTagValue("RGLB", RGLB).ifPresent(validationFailures::add); checkTagValue("RGPL", RGPL).ifPresent(validationFailures::add); checkTagValue("RGPU", RGPU).ifPresent(validationFailures::add); checkTagValue("RGSM", RGSM).ifPresent(validationFailures::add); checkTagValue("RGCN", RGCN).ifPresent(validationFailures::add); checkTagValue("RGDS", RGDS).ifPresent(validationFailures::add); checkTagValue("RGKS", RGKS).ifPresent(validationFailures::add); checkTagValue("RGFO", RGFO).ifPresent(validationFailures::add); checkTagValue("RGPG", RGPG).ifPresent(validationFailures::add); checkTagValue("RGPM", RGPM).ifPresent(validationFailures::add); if (!validationFailures.isEmpty()) { return validationFailures.toArray(new String[validationFailures.size()]); } return super.customCommandLineValidation(); } private final Pattern pattern = Pattern.compile(READGROUP_ID_REGEX); private Optional checkTagValue(final String tagName, final String value) { if (value == null) { return Optional.empty(); } final Matcher matcher = pattern.matcher(value); if (matcher.matches()) { return Optional.empty(); } else { return Optional.of(String.format("The values of tags in a SAM header must adhere to the regular expression '%s'," + "but the value provided for %s, '%s', doesn't.",READGROUP_ID_REGEX, tagName, value)); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy