/*
* The MIT License
*
* Copyright (c) 2018 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.tribble.index.tabix;
import htsjdk.samtools.BAMIndexMerger;
import htsjdk.samtools.BinningIndexContent;
import htsjdk.samtools.IndexMerger;
import htsjdk.samtools.LinearIndex;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.tribble.util.LittleEndianOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Merges tabix files for parts of a VCF file that have been concatenated.
 *
 * <p>A partitioned VCF is a directory containing the following files:
 *
 * <ul>
 *   <li>A file named {@code header} containing all header bytes in VCF format.</li>
 *   <li>Zero or more files named {@code part-00000}, {@code part-00001}, ... etc, containing a list of VCF records.</li>
 *   <li>A file named {@code terminator} containing a BGZF end-of-file marker block (only if the VCF is bgzip-compressed).</li>
 * </ul>
 *
 * <p>If the VCF is bgzip-compressed then the header and part files must all be bgzip-compressed.
 *
 * <p>For a compressed VCF, if an index is required, then a tabix index can be generated for each (headerless) part file. These files
 * should be named {@code .part-00000.tbi}, {@code .part-00001.tbi}, ... etc. Note the leading {@code .} to make the files hidden.
 *
 * <p>This format has the following properties:
 *
 * <ul>
 *   <li>Parts and their indexes may be written in parallel, since one part file can be written independently of the others.</li>
 *   <li>A VCF file can be created from a partitioned VCF file by concatenating all the non-hidden files
 *       ({@code header}, {@code part-00000}, {@code part-00001}, ..., {@code terminator}).</li>
 *   <li>A VCF index can be created from a partitioned VCF file by merging all of the hidden files with a {@code .tbi} suffix.
 *       Note that this is not a simple file concatenation operation. See {@link TabixIndexMerger}.</li>
 * </ul>
 */
public class TabixIndexMerger extends IndexMerger {
private TabixFormat formatSpec;
private final List sequenceNames = new ArrayList<>();
private List indexes = new ArrayList<>();
public TabixIndexMerger(final OutputStream out, final long headerLength) {
super(out, headerLength);
}
@Override
public void processIndex(final TabixIndex index, final long partLength) {
this.partLengths.add(partLength);
if (indexes.isEmpty()) {
formatSpec = index.getFormatSpec();
if (index.getSequenceNames() != null) {
sequenceNames.addAll(index.getSequenceNames());
}
}
if (!index.getFormatSpec().equals(formatSpec)) {
throw new IllegalArgumentException(
String.format("Cannot merge tabix files with different formats, %s and %s.", index.getFormatSpec(), formatSpec));
}
if (!sequenceNames.equals(index.getSequenceNames())) {
throw new IllegalArgumentException(
String.format("Cannot merge tabix files with different sequence names, %s and %s.", index.getSequenceNames(), sequenceNames));
}
indexes.add(index);
}
@Override
public void finish(final long dataFileLength) throws IOException {
if (indexes.isEmpty()) {
throw new IllegalArgumentException("Cannot merge zero tabix files");
}
final long[] offsets = partLengths.stream().mapToLong(i -> i).toArray();
Arrays.parallelPrefix(offsets, (a, b) -> a + b); // cumulative offsets
final List mergedBinningIndexContentList = new ArrayList<>();
for (int ref = 0; ref < sequenceNames.size(); ref++) {
final int r = ref;
List binningIndexContentList = indexes.stream().map(index -> index.getIndices()[r]).collect(Collectors.toList());
final BinningIndexContent binningIndexContent = mergeBinningIndexContent(ref, binningIndexContentList, offsets);
mergedBinningIndexContentList.add(binningIndexContent);
}
final TabixIndex tabixIndex = new TabixIndex(formatSpec, sequenceNames, mergedBinningIndexContentList.toArray(new BinningIndexContent[0]));
try (LittleEndianOutputStream los = new LittleEndianOutputStream(new BlockCompressedOutputStream(out, (File) null))) {
tabixIndex.write(los);
}
}
private static BinningIndexContent mergeBinningIndexContent(final int referenceSequence, final List binningIndexContentList, final long[] offsets) {
final List binLists = new ArrayList<>();
final List linearIndexes = new ArrayList<>();
for (BinningIndexContent binningIndexContent : binningIndexContentList) {
binLists.add(binningIndexContent == null ? null : binningIndexContent.getBins());
linearIndexes.add(binningIndexContent == null ? null : binningIndexContent.getLinearIndex());
}
return new BinningIndexContent(referenceSequence, BAMIndexMerger.mergeBins(binLists, offsets), BAMIndexMerger.mergeLinearIndexes(referenceSequence, linearIndexes, offsets));
}
}