All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.broadinstitute.hellbender.tools.dragstr.DragstrLocusUtils Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.dragstr;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import it.unimi.dsi.fastutil.ints.Int2LongArrayMap;
import it.unimi.dsi.fastutil.ints.Int2LongMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import org.apache.hadoop.io.IOUtils;
import org.broadinstitute.hellbender.utils.BinaryTableReader;
import org.broadinstitute.hellbender.utils.BinaryTableWriter;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableReader;
import org.broadinstitute.hellbender.utils.tsv.TableWriter;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.List;

public class DragstrLocusUtils {
    /**
     * Minimum distance, in data-file offset units, between two consecutive index entries written for the
     * same chromosome ({@code 1 << 16}, i.e. 64KB). An extra index entry is emitted as soon as the
     * offset has advanced at least this much since the last entry.
     */
    private static final int INDEX_BYTE_INTERVAL = 1 << 16; // every 64KB

    /**
     * Creates a writer that serializes each locus using this tool's own binary record layout.
     * <p>
     * Every record is written, in this order, as: chromosome index ({@code int}), start ({@code long}),
     * period ({@code byte}), length ({@code short}) and mask ({@code long}).
     * </p>
     *
     * @param out destination of the record data.
     * @param indexOut destination of the index entries, handed over unchanged to the 4-argument overload.
     * @param path name of the resource being written, handed over unchanged to the 4-argument overload.
     * @return whatever the 4-argument {@code binaryWriter} overload returns.
     */
    private static BinaryTableWriter<DragstrLocus> binaryWriter(final OutputStream out, final OutputStream indexOut, final String path) {
        // The field order below defines the on-disk layout; readers must consume the fields in the same order.
        final DragstrLocus.WriteAction nativeLayout = (locus, sink) -> {
            sink.writeInt(locus.getChromosomeIndex());
            sink.writeLong(locus.getStart());
            sink.writeByte(locus.getPeriod());
            sink.writeShort(locus.getLength());
            sink.writeLong(locus.getMask());
        };
        return binaryWriter(out, indexOut, path, nativeLayout);
    }

    /**
     * Generates the str_table.bin format produced by DRAGEN. Used for debugging purposes. Please keep around for now.
     * <p>
     * Each record is emitted as a single 16-byte little-endian chunk containing, in this order:
     * the mask narrowed to {@code int}, the chromosome index ({@code int}), the start narrowed to
     * {@code int} minus one, the length ({@code short}), the period ({@code byte}) and the capped
     * repeat count ({@code byte}).
     * </p>
     *
     * @param out destination of the record data.
     * @param indexOut destination of the index entries, handed over unchanged to the underlying writer factory.
     * @param path name of the resource being written, handed over unchanged to the underlying writer factory.
     * @return whatever the underlying writer factory returns.
     */
    @SuppressWarnings("unused")
    public static BinaryTableWriter<DragstrLocus> dragenWriter(final OutputStream out, final OutputStream indexOut, final String path) {
        // A single scratch buffer is reused for every record; it is cleared before each use.
        final ByteBuffer buffer = ByteBuffer.allocate(Integer.BYTES * 3 + Short.BYTES + 2 * Byte.BYTES);
        buffer.order(ByteOrder.LITTLE_ENDIAN);

        return binaryWriter(out, indexOut, path, (record, output) -> {
            buffer.clear();
            buffer.putInt((int) record.getMask());
            buffer.putInt(record.getChromosomeIndex());
            buffer.putInt((int) record.getStart() - 1);
            buffer.putShort(record.getLength());
            buffer.put((byte) record.getPeriod());
            // DRAGEN caps repeat lengths to 20, the default max period.
            // NOTE(review): the comment above talks about a cap of 20 on the repeat count yet the constant used
            // is named DEFAULT_MAX_PERIOD; confirm that this constant really is the intended cap.
            buffer.put((byte) Math.min(DragstrHyperParameters.DEFAULT_MAX_PERIOD, record.getRepeats()));
            // The buffer is exactly one record long, so its whole backing array is written.
            output.write(buffer.array());
        });
    }

    /**
     * Creates a binary writer that delegates the record encoding to {@code wa} and, when an index
     * output is provided, emits index entries as records are written.
     * <p>
     * An index entry consists of the chromosome index ({@code int}), the record start narrowed to
     * {@code int} and the data offset ({@code long}) as reported by {@code offset()} right before the
     * record is written. One entry is emitted for the first record of every chromosome, and another one
     * each time the offset has advanced at least {@code INDEX_BYTE_INTERVAL} since the previous entry.
     * </p>
     *
     * @param out destination of the record data.
     * @param indexOut destination of the index entries; {@code null} disables indexing.
     * @param path name of the resource being written, passed to the {@code BinaryTableWriter} constructor.
     * @param wa the action that encodes a single record.
     * @return never {@code null}.
     */
    private static BinaryTableWriter<DragstrLocus> binaryWriter(final OutputStream out, final OutputStream indexOut, final String path, final DragstrLocus.WriteAction wa) {
        return new BinaryTableWriter<DragstrLocus>(out, path) {

            private final DataOutputStream indexDataOutputStream = indexOut != null ? new DataOutputStream(indexOut) : null;
            // Chromosomes already seen, used to detect input that is not grouped by chromosome.
            private final Int2LongMap chromosomeOffsets = new Int2LongArrayMap();
            private int lastChromosomeIndex = -1;
            private long lastEntryOffset = 0;

            @Override
            protected void writeRecord(final DragstrLocus record, final DataOutput output) throws IOException {
                if (indexDataOutputStream != null) {
                    outputIndexWhenApplies(record);
                }
                wa.write(record, output);
            }

            /**
             * Emits an index entry if the record starts a new chromosome or enough data has been
             * written since the last entry.
             *
             * @throws IllegalStateException if the record's chromosome was already seen earlier and then left.
             */
            private void outputIndexWhenApplies(final DragstrLocus record) throws IOException {
                // The offset is captured before flushing, exactly where the upcoming record will begin.
                final long offset = offset();
                final int chromosomeIndex = record.getChromosomeIndex();
                if (lastChromosomeIndex != chromosomeIndex) {
                    if (chromosomeOffsets.containsKey(chromosomeIndex)) {
                        throw new IllegalStateException("cannot index the output when not sorted; chromosome index " + chromosomeIndex + " appears in more than one piece ");
                    }
                    dataOut.flush();
                    chromosomeOffsets.put(chromosomeIndex, offset);
                    lastChromosomeIndex = chromosomeIndex;
                    appendIndexEntry(record, offset);
                } else if ((offset - lastEntryOffset) >= INDEX_BYTE_INTERVAL) {
                    dataOut.flush();
                    appendIndexEntry(record, offset);
                }
            }

            /**
             * Writes one index entry for the current chromosome and remembers where it points to.
             */
            private void appendIndexEntry(final DragstrLocus record, final long offset) throws IOException {
                indexDataOutputStream.writeInt(lastChromosomeIndex);
                indexDataOutputStream.writeInt((int) record.getStart());
                indexDataOutputStream.writeLong(offset);
                lastEntryOffset = offset;
            }

            @Override
            public void close() throws IOException {
                try {
                    super.close();
                } finally {
                    // Close the index even if closing the data output failed, so that it never leaks.
                    if (indexDataOutputStream != null) {
                        indexDataOutputStream.close();
                    }
                }
            }
        };
    }

    /**
     * Creates a binary locus writer on a file without an accompanying index file.
     * <p>
     * Equivalent to calling {@code binaryWriter(out, null)}.
     * </p>
     *
     * @param out the file to write the records to.
     * @return whatever the two-file overload returns.
     * @throws FileNotFoundException if the output file cannot be opened for writing.
     */
    public static BinaryTableWriter<DragstrLocus> binaryWriter(final File out) throws FileNotFoundException {
        final File noIndexFile = null;
        return binaryWriter(out, noIndexFile);
    }

    /**
     * Creates a binary locus writer on a file, optionally writing an index to a second file.
     * <p>
     * When {@code indexFile} is {@code null} the index entries are sent to a discarding stream
     * rather than being disabled.
     * </p>
     *
     * @param out the file to write the records to.
     * @param indexFile the file to write the index to, or {@code null} if no index file is wanted.
     * @return whatever the stream-based writer factory returns.
     * @throws FileNotFoundException if either file cannot be opened for writing.
     */
    public static BinaryTableWriter<DragstrLocus> binaryWriter(final File out, final File indexFile)
        throws FileNotFoundException
    {
        final OutputStream dataStream = new FileOutputStream(out);
        final OutputStream indexStream;
        try {
            indexStream = indexFile != null ? new FileOutputStream(indexFile) : new IOUtils.NullOutputStream();
        } catch (final FileNotFoundException | RuntimeException ex) {
            // Do not leak the already opened data file when the index cannot be opened.
            try {
                dataStream.close();
            } catch (final IOException closeFailure) {
                ex.addSuppressed(closeFailure);
            }
            throw ex;
        }
        return binaryWriter(dataStream, indexStream, out.toString());
    }

    /**
     * Creates a reader over all the loci stored in a binary file.
     *
     * @param file the file to read from.
     * @return whatever the stream-based {@code binaryReader} overload returns for that file's content.
     * @throws FileNotFoundException if the file cannot be opened for reading.
     */
    public static BinaryTableReader<DragstrLocus> binaryReader(final File file) throws FileNotFoundException {
        final InputStream fileContent = new FileInputStream(file);
        return binaryReader(fileContent);
    }

    /**
     * Creates a reader that decodes loci from a stream of binary records.
     * <p>
     * Each record is consumed, in this order, as: chromosome index ({@code int}), start ({@code long}),
     * period ({@code byte}), length ({@code short}) and mask ({@code long}).
     * </p>
     *
     * @param in the stream to read the records from.
     * @return never {@code null}.
     */
    public static BinaryTableReader<DragstrLocus> binaryReader(final InputStream in) {
        return new BinaryTableReader<DragstrLocus>(in, null) {
            @Override
            protected DragstrLocus readRecord(final DataInput input) throws IOException {
                // The read order below must mirror the order in which the fields were written.
                final int chromosomeIndex = input.readInt();
                final long locusStart = input.readLong();
                final byte locusPeriod = input.readByte();
                final short locusLength = input.readShort();
                final long locusMask = input.readLong();
                return DragstrLocus.make(chromosomeIndex, locusStart, locusPeriod, locusLength, locusMask);
            }
        };
    }

    /**
     * Returns loci whose start base is located within a particular interval.
     * @param path path to the file containing the dragstr-loci.
     * @param index the index for that file pre-loaded in memory.
     * @param chrIdx the target interval contig index.
     * @param start the first base of the target interval 1-based.
     * @param end the last base of the target interval (inclusive).
     * @return never {@code null} but perhaps a reader that returns no records.
     * @throws IOException in case of an underlying IO issue.
     */
    public static BinaryTableReader<DragstrLocus> binaryReader(final String path, final BinaryTableIndex index,
                                                               final int chrIdx, final int start, final int end)
       throws IOException {

        final long offset = index.offset(chrIdx, start, end);
        if (offset < 0) {
            // A negative offset is the index's way of reporting that there is nothing to read.
            return BinaryTableReader.emptyReader();
        }
        final InputStream in = BucketUtils.openFile(path);
        try {
            skipBytesOrFail(in, offset);
        } catch (final IOException | RuntimeException ex) {
            // Do not leak the stream when positioning it fails.
            try {
                in.close();
            } catch (final IOException closeFailure) {
                ex.addSuppressed(closeFailure);
            }
            throw ex;
        }

        return new BinaryTableReader<DragstrLocus>(in, null) {

            @Override
            protected DragstrLocus readRecord(final DataInput input) throws IOException {

                while (true) {
                    final int c = input.readInt();
                    final long s = input.readLong();
                    final byte p = input.readByte();
                    final short l = input.readShort();
                    final long m = input.readLong();
                    if (chrIdx != c) {
                        // Reading starts at an index entry for the requested chromosome, so a record from
                        // any other chromosome means that we ran past the data of interest.
                        // NOTE(review): null is presumably how BinaryTableReader is told there are no more
                        // records -- confirm against that class.
                        return null;
                    } else if (s > end) {
                        // Beyond the requested interval.
                        return null;
                    } else if (s >= start) {
                        return DragstrLocus.make(c, s, p, l, m);
                    }
                    // Otherwise the record starts before the interval and is skipped. Eventually we hit the
                    // end of the stream, another chromosome, or go beyond the requested interval, so this
                    // is not going to loop forever.
                }
            }
        };
    }

    /**
     * Skips exactly {@code count} bytes of a stream, coping with {@link InputStream#skip} legitimately
     * skipping fewer bytes than requested in a single call.
     *
     * @throws IOException if the stream ends before {@code count} bytes could be skipped.
     */
    private static void skipBytesOrFail(final InputStream in, final long count) throws IOException {
        long remaining = count;
        while (remaining > 0) {
            final long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() >= 0) {
                // skip made no progress but the stream is not exhausted; advance one byte by reading.
                remaining--;
            } else {
                throw new IOException("failed to skip the requested number of bytes");
            }
        }
    }

    /**
     * Returns a tab separated text format writer.
     * <p>
     * The columns are, in order: {@code chridx}, {@code chrid}, {@code start}, {@code end},
     * {@code period}, {@code mask}, {@code mask_bin}, {@code length_bp} and {@code length_rp}.
     * </p>
     * @param out the output stream where to write the data to.
     * @param dictionary the dictionary for the reference this loci refer to.
     * @return never {@code null}.
     * @throws IOException iff there is any low-level issue creating the writer.
     */
    public static TableWriter<DragstrLocus> textWriter(final OutputStream out, final SAMSequenceDictionary dictionary) throws IOException {

        return new TableWriter<DragstrLocus>(new OutputStreamWriter(out),
                TableColumnCollection.make("chridx", "chrid", "start", "end", "period", "mask", "mask_bin", "length_bp", "length_rp")) {

            @Override
            protected void composeLine(final DragstrLocus record, final DataLine dataLine) {
                // Values are appended in the same order as the columns declared above.
                dataLine.append(record.getChromosomeIndex())                                            // chridx
                        .append(dictionary.getSequence(record.getChromosomeIndex()).getSequenceName())  // chrid
                        .append(record.getStart())                                                      // start
                        .append(record.getStart() + record.getLength() - 1)                             // end (inclusive)
                        .append(record.getPeriod())                                                     // period
                        .append(record.getMask())                                                       // mask
                        .append(Long.toBinaryString(record.getMask()))                                  // mask_bin
                        .append(record.getLength())                                                     // length_bp
                        .append(record.getLength() / record.getPeriod());                               // length_rp
            }
        };
    }

    /**
     * Reads in DragstrLocus instances from a stream that has the same content as the one generated using the {@link #textWriter}.
     * <p>
     * Only the {@code chrid}, {@code start}, {@code period}, {@code length_bp} and {@code mask} columns
     * are used; the chromosome index is resolved by looking up {@code chrid} in the dictionary.
     * </p>
     *
     * @param in the stream to read the text table from.
     * @param dictionary the dictionary used to translate contig names into contig indexes.
     * @return never {@code null}.
     * @throws IOException if the underlying table reader cannot be created.
     */
    static TableReader<DragstrLocus> textReader(final InputStream in, final SAMSequenceDictionary dictionary) throws IOException {
        return new TableReader<DragstrLocus>(new InputStreamReader(in)) {

            @Override
            protected DragstrLocus createRecord(final DataLine dataLine) {

                final String chr = dataLine.get("chrid");
                final SAMSequenceRecord seq = dictionary.getSequence(chr);
                if (seq == null) {
                    throw new IllegalArgumentException("unknown contig name '" + chr + "' in the input; it is not present in the dictionary");
                }
                final int chridx = seq.getSequenceIndex();
                final long start = dataLine.getLong("start");
                final byte period = dataLine.getByte("period");
                // The writer names the length-in-bases column "length_bp"; there is no "length" column.
                final short length = (short) dataLine.getInt("length_bp");
                // The mask is written as a long, so it must be read back as one.
                final long mask = dataLine.getLong("mask");
                return DragstrLocus.make(chridx, start, period, length, mask);
            }
        };
    }

    public static class BinaryTableIndex {

        private static class Entry {
            public final int chrIdx;
            public final int pos;
            public final long offset;

            private Entry(final int chrIdx, final int pos, final long offset) {
                this.chrIdx = chrIdx;
                this.pos = pos;
                this.offset = offset;
            }

            public static Entry of(final int chrIdx, final int pos, final long offset) {
                return new Entry(chrIdx, pos, offset);
            }
        }

        private Int2ObjectMap> entriesByChrIdx;

        private BinaryTableIndex(final Int2ObjectMap> entries) {
            entriesByChrIdx = entries;
        }

        public long offset(final int chrIdx, final int start, final int end) {
            final List chrEntries = entriesByChrIdx.get(chrIdx);
            if (chrEntries == null || chrEntries.isEmpty()) {
                return -1;
            } else if (chrEntries.get(0).pos > end) {
                return -1;
            } else {
                int i = 0, j = chrEntries.size() - 1;
                while (i < j) {
                    int k = (i + j) / 2;
                    final Entry candidate = chrEntries.get(k);
                    if (candidate.pos < start) {
                        i = Math.min(k + 1, j);
                    } else if (candidate.pos > start) {
                        j = Math.max(k - 1, i);
                    } else {
                        return candidate.offset;
                    }
                }
                while (i > 0 && chrEntries.get(i).pos > start) {
                    i--;
                }
                return chrEntries.get(i).offset;
            }
        }

        public static BinaryTableIndex load(final String path) {
            return load(BucketUtils.openFile(path));
        }

        public static BinaryTableIndex load(final InputStream inputStream) {

            final BinaryTableReader entryReader = new BinaryTableReader(inputStream, null) {
                @Override
                protected Entry readRecord(final DataInput input) throws IOException {
                    final int chrIdx = input.readInt();
                    final int pos = input.readInt();
                    final long offset = input.readLong();
                    return Entry.of(chrIdx, pos, offset);
                }
            };
            final Int2ObjectMap> entriesByChrIdx = new Int2ObjectArrayMap<>();
            entryReader.stream()
                       .forEach(entry -> {
                           List entries = entriesByChrIdx.get(entry.chrIdx);
                           if (entries == null) {
                               entriesByChrIdx.put(entry.chrIdx, entries = new ArrayList<>());
                           }
                           entries.add(entry);
                       });

            return new BinaryTableIndex(entriesByChrIdx);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy