/*
*
*
*/
package org.opencb.biodata.tools.variant;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.LineIteratorImpl;
import htsjdk.tribble.readers.LineReader;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.formats.variant.io.VariantReader;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantFileMetadata;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata;
import org.opencb.biodata.tools.variant.converters.avro.VCFHeaderToVariantFileHeaderConverter;
import org.opencb.biodata.tools.variant.converters.avro.VCFHeaderToVariantFileMetadataConverter;
import org.opencb.biodata.tools.variant.converters.avro.VariantContextToVariantConverter;
import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
import org.opencb.commons.utils.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.util.*;
import java.util.function.BiConsumer;
import static java.lang.Math.abs;
/**
* Reads a VCF file using the library HTSJDK.
*
* Optionally, normalizes the variants.
*
* Created on 16/05/16.
*
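* A minimal usage sketch (the input path is illustrative, and building the {@link VariantStudyMetadata} for the
* study/file to be read is assumed to happen elsewhere):
* <pre>{@code
* VariantStudyMetadata metadata = ...; // study/file metadata for the VCF, obtained beforehand
* VariantVcfHtsjdkReader reader = new VariantVcfHtsjdkReader(Paths.get("input.vcf"), metadata);
* reader.open();
* reader.pre();
* List<Variant> batch;
* while (!(batch = reader.read(100)).isEmpty()) {
*     // process the batch of Variant objects
* }
* reader.post();
* reader.close();
* }</pre>
*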
* @author Jacobo Coll <[email protected]>
*/
public class VariantVcfHtsjdkReader implements VariantReader {
private static final String MATEID = "MATEID";
private static final String MATE_CIPOS = "MATE_CIPOS";
private static final int INTERACTING_DISTANCE_THRESHOLD = 50;
private static final String PHASE_SET_TAG = "PS";
private static final String VCF_MISSING_STRING = ".";
private static final int MAXIMUM_ALLOWED_BATCH_SIZE = 1000;
private final Logger logger = LoggerFactory.getLogger(VariantVcfHtsjdkReader.class);
private final Path input;
private InputStream inputStream;
private final VariantStudyMetadata metadata;
private final VariantFileMetadata fileMetadata;
private final VariantNormalizer normalizer;
private FullVcfCodec codec;
private VCFHeader header;
private VariantContextToVariantConverter converter;
private LineIterator lineIterator;
private List<String> headerLines;
private Set<BiConsumer<String, RuntimeException>> malformHandlerSet = new HashSet<>();
private boolean failOnError = false;
private boolean ignorePhaseSet = true;
private boolean combineBreakends = false;
private final boolean closeInputStream; // Do not close inputStream if is provided in constructor. Respect symmetrical open/close
private VariantContext lastVariantContext = null;
private HashMap<String, Variant> breakendMates;
public VariantVcfHtsjdkReader(InputStream inputStream, VariantStudyMetadata metadata) {
this(inputStream, metadata, null);
}
public VariantVcfHtsjdkReader(InputStream inputStream, VariantStudyMetadata metadata, VariantNormalizer normalizer) {
this.input = null;
this.inputStream = Objects.requireNonNull(inputStream);
this.metadata = Objects.requireNonNull(metadata);
this.fileMetadata = new VariantFileMetadata(metadata.getFiles().get(0));
this.normalizer = normalizer;
this.closeInputStream = false; // Do not close input stream
}
public VariantVcfHtsjdkReader(Path input, VariantStudyMetadata metadata) {
this(input, metadata, null);
}
public VariantVcfHtsjdkReader(Path input, VariantStudyMetadata metadata, VariantNormalizer normalizer) {
this.input = Objects.requireNonNull(input);
this.inputStream = null;
this.metadata = Objects.requireNonNull(metadata);
this.fileMetadata = new VariantFileMetadata(metadata.getFiles().get(0));
this.normalizer = normalizer;
this.closeInputStream = true; // Close input stream
}
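/**
* Registers a handler that receives the offending VCF line (or variant) together with the exception whenever a
* malformed line is found or a variant fails normalization. Illustrative usage (the logging action is just an
* example): {@code reader.registerMalformatedVcfHandler((line, e) -> System.err.println("Skipped: " + line))}.
*/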
public VariantVcfHtsjdkReader registerMalformatedVcfHandler(BiConsumer<String, RuntimeException> handler) {
this.malformHandlerSet.add(handler);
return this;
}
public VariantVcfHtsjdkReader setFailOnError(boolean failOnError) {
this.failOnError = failOnError;
return this;
}
public VariantVcfHtsjdkReader setIgnorePhaseSet(boolean ignorePhaseSet) {
this.ignorePhaseSet = ignorePhaseSet;
return this;
}
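/**
* When enabled, mated BND records (linked through the MATEID INFO field) are combined into a single Variant
* carrying the mate position and a MATE_CIPOS entry in the file data; BNDs whose mate is never found are
* emitted as-is once the end of the file is reached.
*/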
public VariantVcfHtsjdkReader setCombineBreakends(boolean combineBreakends) {
this.combineBreakends = combineBreakends;
breakendMates = new HashMap<>();
return this;
}
@Override
public boolean open() {
if (inputStream == null) {
try {
inputStream = FileUtils.newInputStream(input);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
return true;
}
@Override
public boolean pre() {
codec = new FullVcfCodec();
lineIterator = codec.makeSourceFromStream(inputStream);
// Read the header
headerLines = new LinkedList<>();
while (lineIterator.hasNext()) {
String line = lineIterator.peek();
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
headerLines.add(line);
lineIterator.next();
} else {
break;
}
}
// Parse the header
header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new LineReader() {
Iterator<String> iterator = headerLines.iterator();
@Override
public String readLine() throws IOException {
if (iterator.hasNext()) {
return iterator.next();
} else {
return null;
}
}
@Override public void close() {}
}));
VCFHeaderToVariantFileMetadataConverter fileMetadataConverter = new VCFHeaderToVariantFileMetadataConverter();
fileMetadataConverter.convert(header, fileMetadata);
List<String> samples = fileMetadata.getSampleIds();
header = new VCFHeader(header.getMetaDataInInputOrder(), samples);
codec.setVCFHeader(header, codec.getVCFHeaderVersion());
// Create converters
converter = new VariantContextToVariantConverter(metadata.getId(), fileMetadata.getId(), samples);
if (metadata.getIndividuals() == null) {
metadata.setIndividuals(new ArrayList<>(samples.size()));
}
VariantMetadataManager metadataManager = new VariantMetadataManager(metadata);
for (String sample : samples) {
metadataManager.addIndividual(sample, sample, metadata.getId());
}
if (normalizer != null) {
normalizer.configure(fileMetadata.getHeader());
}
return true;
}
@Override
public List<Variant> read(int batchSize) {
List<VariantContext> variantContexts = new ArrayList<>(batchSize);
// Add last variant context from last call to "read"
if (lastVariantContext != null) {
variantContexts.add(lastVariantContext);
}
lastVariantContext = readNextVariantContext();
while (lastVariantContext != null && incompleteBatch(variantContexts, batchSize)) {
variantContexts.add(lastVariantContext);
lastVariantContext = readNextVariantContext();
}
List<Variant> variants = converter.apply(variantContexts);
if (combineBreakends) {
variants = runCombineBreakends(variants);
// Reached end of file, no more variants - Drain unpaired breakends
if (lastVariantContext == null) {
// Singleton BNDs: BNDs that specify a MATEID in the INFO field but whose mate was never found
// in the VCF
Iterator<String> breakendIdIterator = breakendMates.keySet().iterator();
while (breakendIdIterator.hasNext() && variants.size() < batchSize) {
String breakendId = breakendIdIterator.next();
variants.add(breakendMates.get(breakendId));
// Remove through the iterator to avoid a ConcurrentModificationException on the key set
breakendIdIterator.remove();
}
}
}
return normaliseIfAppropriate(variants);
}
private List<Variant> normaliseIfAppropriate(List<Variant> variants) {
// Normalise one by one so that if a variant raises an error while being normalised we can easily report
// which one failed and skip it
List<Variant> finalVariantList;
if (normalizer != null) {
finalVariantList = new ArrayList<>(variants.size());
for (Variant variant : variants) {
try {
finalVariantList.addAll(normalizer.apply(Collections.singletonList(variant)));
} catch (RuntimeException e) {
logger.warn("Error found during variant normalization. Variant: {}. This variant will be skipped "
+ "and process will continue", variant.toString());
logMalformatedLine(variant.toString(), e);
if (failOnError) {
throw e;
}
}
}
} else {
finalVariantList = variants;
}
return finalVariantList;
}
private List<Variant> runCombineBreakends(List<Variant> variants) {
List<Variant> variantListToReturn = new ArrayList<>(variants.size());
for (Variant variant : variants) {
if (StringUtils.isNotBlank(variant.getAlternate())) {
byte[] alternateBytes = variant.getAlternate().getBytes();
// Symbolic allele: CNV, DEL, DUP, INS, INV, BND
if (Allele.wouldBeSymbolicAllele(alternateBytes)) {
// BND
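// e.g. ALT alleles like "G]17:198982]" or "]13:123456]T" (mated breakends) and ".A" or "G." (single
// breakends), as described in the VCF specification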
if (alternateBytes[0] == '.' || alternateBytes[alternateBytes.length - 1] == '.' // single breakend
|| variant.getAlternate().contains("[") // mated breakend
|| variant.getAlternate().contains("]")) {
String mateId = getMateId(variant);
// If a mate is specified (MATEID present), try to pair this BND with it
if (mateId != null) {
String breakendPairId = getBreakendPairId(variant.getNames().get(0), mateId);
// If the mate was previously seen and stored, build a single Variant with the pair info,
// remove the stored mate from breakendMates and continue
// WARNING: assuming BND positions cannot be multiallelic positions - there will always
// be just one alternate allele!
if (breakendMates.putIfAbsent(breakendPairId, variant) != null) {
variantListToReturn.add(combineBreakendPair(variant, breakendMates.get(breakendPairId)));
breakendMates.remove(breakendPairId);
// Mate not seen yet: the variant has just been stored in breakendMates and is not added to the output list
}
// Singleton BND, no mate specified within the INFO field; keep in the variant list as is
} else {
variantListToReturn.add(variant);
}
// Symbolic allele other than BND: CNV, DEL, DUP, INS, INV; keep in the variant list as is
} else {
variantListToReturn.add(variant);
}
// Simple variant: SNV, short insertion, short deletion; keep in the variant list as is
} else {
variantListToReturn.add(variant);
}
}
}
return variantListToReturn;
}
private String getBreakendPairId(String mateId1, String mateId2) {
// The id for the breakend pair will be the two BND Ids alphabetically sorted and concatenated by a '_'
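// e.g. with hypothetical ids "BND_W" and "BND_Y", both mates map to the same pair id "BND_W_BND_Y"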
List<String> ids = Arrays.asList(mateId1, mateId2);
Collections.sort(ids);
return StringUtils.join(ids, "_");
}
private Variant combineBreakendPair(Variant variant, Variant variant1) {
// Set mate position
variant.getSv().getBreakend().getMate().setChromosome(variant1.getChromosome());
variant.getSv().getBreakend().getMate().setPosition(variant1.getStart());
// Check whether the second BND has CIPOS
if (variant1.getSv() != null
&& variant1.getSv().getCiStartLeft() != null
&& variant1.getSv().getCiStartRight() != null) {
Integer ciPositionLeft = variant1.getSv().getCiStartLeft();
Integer ciPositionRight = variant1.getSv().getCiStartRight();
// Get CIPOS from second BND
String ciposString = ciPositionLeft + VCFConstants.INFO_FIELD_ARRAY_SEPARATOR + ciPositionRight;
// Set the CIPOS string of the second BND as part of the file INFO field in the first BND
Map<String, String> fileData = variant.getStudies().get(0).getFiles().get(0).getData();
fileData.put(MATE_CIPOS, ciposString);
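// e.g. with illustrative confidence interval values CiStartLeft=-10 and CiStartRight=10 on the mate,
// the first BND now carries MATE_CIPOS=-10,10 in its file data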
// CIPOS of the second breakend
variant.getSv()
.getBreakend()
.getMate()
.setCiPositionLeft(ciPositionLeft);
variant.getSv()
.getBreakend()
.getMate()
.setCiPositionRight(ciPositionRight);
// Otherwise it is a precise call and only the position is stored (above)
}
return variant;
}
private String getMateId(Variant variant) {
if (variant.getStudies() != null) {
if (!variant.getStudies().isEmpty()) {
if (variant.getStudies().size() > 1) {
throw new RuntimeException("More than one study found for variant " + variant.toString()
+ ". Only one expected. Please, check.");
}
StudyEntry studyEntry = variant.getStudies().get(0);
if (studyEntry.getFiles() != null) {
if (!studyEntry.getFiles().isEmpty()) {
if (studyEntry.getFiles().size() > 1) {
throw new RuntimeException("More than one file found for variant " + variant.toString()
+ ". Only one expected. Please, check.");
}
FileEntry fileEntry = studyEntry.getFiles().get(0);
if (fileEntry.getData() != null
&& StringUtils.isNotBlank(
fileEntry.getData().get(MATEID))) {
return fileEntry.getData().get(MATEID);
}
}
}
}
}
return null;
}
private boolean incompleteBatch(List<VariantContext> variantContexts, int batchSize) {
// batchSize must be > 0
// If batch already reached required batch size, check phase
if (variantContexts.size() >= batchSize) {
// if phase should be ignored the batch is complete
if (ignorePhaseSet) {
return false;
// if phase is to be considered, must check if the phase of the next variant context matches the phase
// of current variant context. In such a case, the batch is incomplete since all variant contexts with
// the same PS must be part of the same batch regardless of the batch size
} else {
if (variantContexts.size() == MAXIMUM_ALLOWED_BATCH_SIZE) {
logger.error("Reached {} variants in one single batch. Truncating batch at variant {}",
variantContexts.size(),
variantContexts.get(variantContexts.size() - 1).toString());
return false;
}
// Assumes variantContexts.size() > 0
VariantContext lastSavedVariantContext = variantContexts.get(variantContexts.size() - 1);
// Rationale for including a distance threshold in here regardless of the phase set:
// Variants within a certain distance might be jointly reported in databases, e.g. MNVs in ClinVar and
// pop frequency datasets. Two variants with different PS are not necessarily in a different
// chromosome copy; the fact of having a different PS simply states that the caller was able to
// determine the phase within two non-overlapping genomic regions, and WITHIN each of those regions,
// the chromosome copy location can be inferred for corresponding alleles. Alleles of variants each
// in a different region (different PS) could actually be located in the same copy.
// Since this logic is included for supporting phased variant annotation use case, not including this
// distance check could mean that two variants in the same chromosome copy, with different PS, being
// reported together by ClinVar (as an MNV, for example) could be missed.
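// Illustrative example (hypothetical positions): variants at positions 1000 and 1030 are fewer than
// INTERACTING_DISTANCE_THRESHOLD (50) bases apart, so the batch is kept open even if their PS values differ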
// Obviously if the phase set is the same the batch is incomplete
return (abs(lastVariantContext.getStart() - lastSavedVariantContext.getStart())
< INTERACTING_DISTANCE_THRESHOLD)
|| samePhaseSet(lastSavedVariantContext, lastVariantContext);
}
}
// batch is always incomplete if the number of read variant contexts < batchSize, regardless of phase of the
// items
return true;
}
private boolean samePhaseSet(VariantContext variantContext, VariantContext variantContext1) {
String phaseSet = getPhaseSet(variantContext);
if (phaseSet != null) {
String phaseSet1 = getPhaseSet(variantContext1);
if (phaseSet1 != null) {
return phaseSet.equals(phaseSet1);
}
}
return false;
}
private String getPhaseSet(VariantContext variantContext) {
htsjdk.variant.variantcontext.Genotype genotype = variantContext.getGenotype(0);
if (genotype != null) {
Object attribute = genotype.getAnyAttribute(PHASE_SET_TAG);
if (attribute != null) {
if (attribute instanceof Collection) {
throw new RuntimeException("Unexpected PS value: Phase set field found containing multiple values. " +
"See: " + variantContext.toString());
} else if (isMissing(attribute.toString())) {
return null;
} else {
return attribute.toString();
}
}
}
// HTSJDK can return null attributes for missing values
return null;
}
private boolean isMissing(String vcfFormatFieldValue) {
return StringUtils.isBlank(vcfFormatFieldValue)
|| vcfFormatFieldValue.equals(VCF_MISSING_STRING);
}
private VariantContext readNextVariantContext() {
String line;
while (lineIterator.hasNext()) {
line = lineIterator.next();
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
try {
return codec.decode(line);
} catch (RuntimeException e) {
logMalformatedLine(line, e);
if (failOnError) {
throw e;
}
}
}
}
return null;
}
private void logMalformatedLine(String line, RuntimeException exception) {
logger.warn(exception.getMessage());
for (BiConsumer<String, RuntimeException> consumer : this.malformHandlerSet) {
consumer.accept(line, exception);
}
}
@Override
public boolean post() {
return true;
}
@Override
public boolean close() {
try {
if (closeInputStream && inputStream != null) {
inputStream.close();
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
return true;
}
@Override
public List<String> getSampleNames() {
return header.getSampleNamesInOrder();
}
@Override
public String getHeader() {
return String.join("\n", headerLines);
}
public VCFHeader getVCFHeader() {
return header;
}
public VCFHeaderVersion getVCFHeaderVersion() {
return codec.getVCFHeaderVersion();
}
@Override
public VariantFileMetadata getVariantFileMetadata() {
return fileMetadata;
}
@Deprecated
public VariantFileMetadata getMetadata() {
return getVariantFileMetadata();
}
}