All Downloads are FREE. Search and download functionalities are using the official Maven repository.

picard.sam.CreateSequenceDictionary Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package picard.sam;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceDictionaryCodec;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.AsciiWriter;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.Md5CalculatingOutputStream;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Fasta;
import picard.cmdline.StandardOptionDefinitions;

import java.io.*;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no
 * SAMRecords, and the header contains only sequence records.
 */
@CommandLineProgramProperties(
        usage = CreateSequenceDictionary.USAGE_SUMMARY + CreateSequenceDictionary.USAGE_DETAILS,
        usageShort = CreateSequenceDictionary.USAGE_SUMMARY,
        programGroup = Fasta.class
)
public class CreateSequenceDictionary extends CommandLineProgram {
    static final String USAGE_SUMMARY = "Creates a sequence dictionary for a reference sequence.  ";
    static final String USAGE_DETAILS = "This tool creates a sequence dictionary file (with \".dict\" extension) from a reference " +
            "sequence provided in FASTA format, which is required by many processing and analysis tools. The output file contains a " +
            "header but no SAMRecords, and the header contains only sequence records." +
            "

" + "The reference sequence can be gzipped (both .fasta and .fasta.gz are supported)." + "" + "

Usage example:

" + "
" +
            "java -jar picard.jar CreateSequenceDictionary \\ 
" + " R=reference.fasta \\
" + " O=reference.dict" + "" + "
" + "
"; // The following attributes define the command-line arguments private static final Log logger = Log.getInstance(CreateSequenceDictionary.class); @Option(doc = "Input reference fasta or fasta.gz", shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME) public File REFERENCE; @Option(doc = "Output SAM file containing only the sequence dictionary. By default it will use the base name of the input reference with the .dict extension", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional = true) public File OUTPUT; @Option(doc = "Put into AS field of sequence dictionary entry if supplied", optional = true) public String GENOME_ASSEMBLY; @Option(doc = "Put into UR field of sequence dictionary entry. If not supplied, input reference file is used", optional = true) public String URI; @Option(doc = "Put into SP field of sequence dictionary entry", optional = true) public String SPECIES; @Option(doc = "Make sequence name the first word from the > line in the fasta file. " + "By default the entire contents of the > line is used, excluding leading and trailing whitespace.") public boolean TRUNCATE_NAMES_AT_WHITESPACE = true; @Option(doc = "Stop after writing this many sequences. For testing.") public int NUM_SEQUENCES = Integer.MAX_VALUE; private final MessageDigest md5; public CreateSequenceDictionary() { try { md5 = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { throw new PicardException("MD5 algorithm not found", e); } } public static void main(final String[] argv) { System.exit(new CreateSequenceDictionary().instanceMain(argv)); } /** * Read all the sequences from the given reference file, and convert into SAMSequenceRecords * @param referenceFile fasta or fasta.gz * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. */ @Deprecated public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) { final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, TRUNCATE_NAMES_AT_WHITESPACE); ReferenceSequence refSeq; final List ret = new ArrayList<>(); final Set sequenceNames = new HashSet<>(); for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) { if (sequenceNames.contains(refSeq.getName())) { throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName()); } sequenceNames.add(refSeq.getName()); ret.add(makeSequenceRecord(refSeq)); } return new SAMSequenceDictionary(ret); } /** * Use reference filename to create URI to go into header if URI was not passed on cmd line. */ protected String[] customCommandLineValidation() { if (URI == null) { URI = "file:" + REFERENCE.getAbsolutePath(); } if (OUTPUT == null) { OUTPUT = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(REFERENCE); logger.info("Output dictionary will be written in ", OUTPUT); } return null; } /** * Do the work after command line has been parsed. * RuntimeException may be thrown by this method, and are reported appropriately. * * @return program exit status. */ protected int doWork() { if (OUTPUT.exists()) { throw new PicardException(OUTPUT.getAbsolutePath() + " already exists. Delete this file and try again, or specify a different output file."); } // SortingCollection is used to check uniqueness of sequence names final SortingCollection sequenceNames = makeSortingCollection(); try (BufferedWriter writer = makeWriter()) { final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory. getReferenceSequenceFile(REFERENCE, TRUNCATE_NAMES_AT_WHITESPACE); SAMSequenceDictionaryCodec samDictCodec = new SAMSequenceDictionaryCodec(writer); samDictCodec.encodeHeaderLine(false); // read reference sequence one by one and write its metadata for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) { final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); samDictCodec.encodeSequenceRecord(samSequenceRecord); sequenceNames.add(refSeq.getName()); } } catch (FileNotFoundException e) { throw new PicardException("File " + OUTPUT.getAbsolutePath() + " not found"); } catch (IOException e) { throw new PicardException("Can't write to or close output file " + OUTPUT.getAbsolutePath()); } // check uniqueness of sequences names final CloseableIterator iterator = sequenceNames.iterator(); if(!iterator.hasNext()) return 0; String current = iterator.next(); while (iterator.hasNext()) { final String next = iterator.next(); if (current.equals(next)) { OUTPUT.delete(); throw new PicardException("Sequence name " + current + " appears more than once in reference file"); } current = next; } return 0; } private BufferedWriter makeWriter() throws FileNotFoundException { return new BufferedWriter( new AsciiWriter(this.CREATE_MD5_FILE ? new Md5CalculatingOutputStream( new FileOutputStream(OUTPUT, false), new File(OUTPUT.getAbsolutePath() + ".md5") ) : new FileOutputStream(OUTPUT) ) ); } /** * Create one SAMSequenceRecord from a single fasta sequence */ private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) { final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), refSeq.length()); // Compute MD5 of upcased bases final byte[] bases = refSeq.getBases(); for (int i = 0; i < bases.length; ++i) { bases[i] = StringUtil.toUpperCase(bases[i]); } ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); if (GENOME_ASSEMBLY != null) { ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); } ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); if (SPECIES != null) { ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); } return ret; } private String md5Hash(final byte[] bytes) { md5.reset(); md5.update(bytes); String s = new BigInteger(1, md5.digest()).toString(16); if (s.length() != 32) { final String zeros = "00000000000000000000000000000000"; s = zeros.substring(0, 32 - s.length()) + s; } return s; } private SortingCollection makeSortingCollection() { final String name = getClass().getSimpleName(); final File tmpDir = IOUtil.createTempDir(name, null); tmpDir.deleteOnExit(); // 256 byte for one name, and 1/10 part of all memory for this, rough estimate long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10; return SortingCollection.newInstance( String.class, new StringCodec(), String::compareTo, (int) Math.min(maxNamesInRam, Integer.MAX_VALUE), tmpDir ); } private static class StringCodec implements SortingCollection.Codec { private DataInputStream dis; private DataOutputStream dos; public StringCodec clone() { return new StringCodec(); } public void setOutputStream(final OutputStream os) { dos = new DataOutputStream(os); } public void setInputStream(final InputStream is) { dis = new DataInputStream(is); } public void encode(final String str) { try { dos.writeUTF(str); } catch (IOException e) { throw new RuntimeIOException(e); } } public String decode() { try { return dis.readUTF(); } catch (EOFException e) { return null; } catch (IOException e) { throw new PicardException("Exception reading sequence name from temporary file.", e); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy