All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.fasta.ShiftFasta Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.walkers.fasta;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.FastaReferenceWriter;
import htsjdk.samtools.reference.FastaReferenceWriterBuilder;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import picard.cmdline.programgroups.ReferenceProgramGroup;

import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import java.util.ListIterator;

/**
 * Create a fasta with the bases shifted by offset
 *
 * delta1 = offset - 1
 * delta2 = total - delta1
 *
 * To shift forward:
 * if you are given a position in the regular fasta (pos_r) and want the position in the shifted fasta (pos_s):
 * if pos_r > delta1 => pos_s = pos_r - delta1  ==   pos_r - offset +1
 *   otherwise          pos_s = pos_r + delta2  ==   pos_r + total - offset + 1
 *
 * To shift back:
 * if you are given a position in the shifted fasta (pos_s) and want the position in the regular fasta (pos_r):
 * if pos_s > delta2 => pos_r = pos_s - delta2  ==   pos_s - total + offset - 1
 *   otherwise          pos_r = pos_s + delta1  ==   pos_s + offset - 1
 *
 * Example command line:
 * ShiftFasta
 * -R ""         // the reference to shift
 * -O ""           // output; the shifted fasta
 * --shift-back-output "" // output; the shiftback chain file to use when lifting over
 * --shift-offset-list ""    // optional; Specifies the offset to shift for each contig in the reference. If not specified, the offset will be half the length of the contig.
 * --interval-file-name "" // output; base name for output interval files (.intervals and .shifted.intervals) that should be used when calling variants against the unshifted and shifted reference.
 * --line-width 100
 */
@DocumentedFeature
@BetaFeature
@CommandLineProgramProperties(
        summary = "Create a new fasta starting at the shift-offset +1 position and a shift_back chain file that can be used with the Liftover tool. It will shift all contigs by default.",
        oneLineSummary = "Creates a shifted fasta file and shift_back file",
        programGroup = ReferenceProgramGroup.class
)
public class ShiftFasta extends GATKTool {

    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            doc = "Path to write the output fasta to")
    protected String output;

    public static final String SHIFT_BACK_OUTPUT = "shift-back-output";
    @Argument(fullName = SHIFT_BACK_OUTPUT,
            doc = "Path to write the shift_back file to")
    protected String shiftBackOutput;

    public static final String SHIFT_OFFSET_LIST = "shift-offset-list";
    @Argument(fullName = SHIFT_OFFSET_LIST,
            doc="Number of bases to skip in the reference before starting the shifted reference. If the reference contains multiple contigs, a value should be specified for each contig. " +
                "For example, if 300 is specified, the new fasta will start at the 301th base (count starting at 1)." +
                "If not specified, each contig will be shifted by half the number of bases. To skip the shifting of a contig, specify 0 in the list.", optional = true)
    private List shiftOffsets = null;

    public static final String INTERAL_FILE_NAME = "interval-file-name";
    @Argument(fullName = INTERAL_FILE_NAME,
            doc="Base name for interval files. Intervals will be midway between beginning and computed offset. If not specified, no interval files will be written.", optional = true)
    private String intervalFilename;

    public static final String LINE_WIDTH_LONG_NAME = "line-width";
    @Argument(fullName= LINE_WIDTH_LONG_NAME, doc="Maximum length of sequence to write per line", optional=true)
    public int basesPerLine = FastaReferenceWriter.DEFAULT_BASES_PER_LINE;

    private ReferenceDataSource refSource;
    private FastaReferenceWriter refWriter;
    private FileWriter chainFileWriter;
    private FileWriter intervalRegularWriter;
    private FileWriter intervalShiftedWriter;

    private int chainId = 0;

    @Override
    public boolean requiresReference() {
        return true;
    }

    @Override
    public void onTraversalStart() {
        refSource = referenceArguments.getReferencePath() != null ? ReferenceDataSource.of(referenceArguments.getReferencePath()) : null;
        final Path path = IOUtils.getPath(output);
        chainId = 1;
        try {
            refWriter = new FastaReferenceWriterBuilder()
                    .setFastaFile(path)
                    .setBasesPerLine(basesPerLine)
                    .build();
            chainFileWriter = new FileWriter(shiftBackOutput);
            if (intervalFilename != null) {
                intervalRegularWriter = new FileWriter(intervalFilename+ ".intervals");
                intervalShiftedWriter = new FileWriter(intervalFilename + ".shifted.intervals");
            }
        } catch (IOException e) {
            throw new UserException.CouldNotCreateOutputFile("Couldn't create " + output + ", encountered exception: " + e.getMessage(), e);
        }
    }

    public void traverse() {
        SAMSequenceDictionary refDict = refSource.getSequenceDictionary();
        long refLengthLong = refDict.getReferenceLength();
        if (refLengthLong > Integer.MAX_VALUE) {
            // TODO fix this??
            throw new UserException.BadInput("Reference length is too long");
        }
        List contigs = refSource.getSequenceDictionary().getSequences();
        if (shiftOffsets != null && !shiftOffsets.isEmpty() && shiftOffsets.size() != contigs.size()) {
            throw new UserException.BadInput("Shift offset list size " + shiftOffsets.size() + " must equal number of contigs in the reference " + contigs.size());
        }
        final ListIterator shiftOffsetsIt = shiftOffsets != null && !shiftOffsets.isEmpty()  ? shiftOffsets.listIterator() : null;
        refSource.getSequenceDictionary().getSequences().forEach(seq -> shiftContig(seq, shiftOffsetsIt));
    }

    /**
     * This method adds to a new fasta ref file that has been shifted by the amount indicated in the shiftOffsetsIt.
     * This also adds to the supporting files: chainfile, interval list for both the shifted and unshifted fasta files
     * The shift is all done in memory. This is a scaling limitation.
     * @param seq The contig or sequence within the fasta file
     * @param shiftOffsetsIt the iterator at the correct position to get the next offset or null if dividing contig by 2
     */
    protected final void shiftContig(SAMSequenceRecord seq, ListIterator shiftOffsetsIt) {
        final int contigLength = seq.getSequenceLength();
        final String seqName = seq.getSequenceName();
        int shiftOffset = shiftOffsetsIt == null ? contigLength/2 : shiftOffsetsIt.next();
        if (shiftOffset > 0 && shiftOffset < contigLength) {
            byte[] bases = refSource.queryAndPrefetch(new SimpleInterval(seqName, 1, contigLength)).getBases();
            byte[] basesAtEnd = Arrays.copyOfRange(bases, shiftOffset, bases.length);
            byte[] basesAtStart = Arrays.copyOf(bases, shiftOffset);
            int shiftBackOffset = bases.length - shiftOffset;

            addToShiftedReference(refWriter, seqName, basesPerLine, basesAtStart, basesAtEnd);
            addToChainFile(seqName, contigLength, shiftOffset, bases, shiftBackOffset);
            if (intervalFilename != null) {
                addToIntervalFiles(intervalRegularWriter, intervalShiftedWriter, seqName, shiftOffset, contigLength);
            }
        } else {
            logger.info("not shifting config " + seq.getContig() + " because shift offset " + shiftOffset + " is not between 1-" + contigLength );
        }
    }

    private void addToIntervalFiles(FileWriter intervalRegularWriter, FileWriter intervalShiftedWriter, String seqName, int shiftOffset, int contigLength) {
        try {
            int intervalStart = shiftOffset/2;
            int intervalEnd = intervalStart + contigLength/2 - 1;
            int shiftedIntervalStart = intervalStart;
            int shiftedIntervalEnd = intervalEnd + contigLength % 2;
            intervalRegularWriter.append(seqName + ":" + intervalStart + "-" + intervalEnd + "\n");
            intervalShiftedWriter.append(seqName + ":" + shiftedIntervalStart + "-" + shiftedIntervalEnd + "\n");
        } catch (IOException e) {
            throw new UserException("Failed to write interval files due to " + e.getMessage(), e);
        }

    }

    private void addToShiftedReference(FastaReferenceWriter refWriter, String seqName, int basesPerLine, byte[] basesAtStart, byte[] basesAtEnd) {
        try {
            refWriter.startSequence(seqName, basesPerLine);
            // swap the bases
            refWriter.appendBases(basesAtEnd).appendBases(basesAtStart);
        } catch (IOException e) {
            throw new UserException("Failed to write shifted reference due to " + e.getMessage(), e);
        }
    }

    private void addToChainFile(String seqName, int contigLength, int shiftOffset, byte[] bases, int shiftBackOffset) {
        try {
            chainFileWriter.append(createChainString(seqName, shiftBackOffset, contigLength, shiftOffset, bases.length, 0, shiftBackOffset, chainId++));
            chainFileWriter.append("\n" + shiftBackOffset + "\n\n");
            chainFileWriter.append(createChainString(seqName, shiftOffset - 1, contigLength, 0, shiftOffset, shiftBackOffset, bases.length, chainId++));
            chainFileWriter.append("\n" + shiftOffset + "\n\n");
        } catch (IOException e) {
            throw new UserException("Failed to write chainFile due to " + e.getMessage(), e);
        }
    }

    private String createChainString(String name, int score, int length, int start, int end, int shiftBackStart, int shiftBackEnd, int id) {
        String[] items = new String[] { "chain",
                Integer.toString(score),
                name,
                Integer.toString(length),
                "+",
                Integer.toString(shiftBackStart),
                Integer.toString(shiftBackEnd),
                name,
                Integer.toString(length),
                "+",
                Integer.toString(start),
                Integer.toString(end),
                Integer.toString(id)
        };
        return String.join("\t", items);
    }

    @Override
    public Object onTraversalSuccess(){
        return null;
    }

    @Override
    public void closeTool() {
        super.closeTool();
        try{
            if( refWriter != null ) {
                refWriter.close();
            }
        } catch (IOException e) {
            throw new UserException("Failed to write fasta due to " + e.getMessage(), e);
        }
        try{
            if (chainFileWriter != null) {
                chainFileWriter.close();
            }
        } catch (IOException e) {
            throw new UserException("Failed to write chain file due to " + e.getMessage(), e);
        }
        try{
            if (intervalRegularWriter != null) {
                intervalRegularWriter.close();
            }
            if (intervalShiftedWriter != null) {
                intervalShiftedWriter.close();
            }
        } catch (IOException e) {
            throw new UserException("Failed to write intervals due to " + e.getMessage(), e);
        }
    }

    }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy