
org.snpeff.vcf.VcfEffect Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.vcf;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import org.snpeff.interval.BioType;
import org.snpeff.interval.Custom;
import org.snpeff.interval.Exon;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Intergenic;
import org.snpeff.interval.Intron;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Motif;
import org.snpeff.interval.NextProt;
import org.snpeff.interval.ProteinProteinInteractionLocus;
import org.snpeff.interval.ProteinStructuralInteractionLocus;
import org.snpeff.interval.Regulation;
import org.snpeff.interval.Transcript;
import org.snpeff.interval.Variant;
import org.snpeff.snpEffect.EffectType;
import org.snpeff.snpEffect.VariantEffect;
import org.snpeff.snpEffect.VariantEffect.FunctionalClass;
import org.snpeff.util.Gpr;
import org.snpeff.util.Tuple;
/**
* An 'ANN' or 'EFF' entry in a VCF INFO field
* Note: 'EFF' is the old version that has been replaced by the standardized 'ANN' field (2014-12)
* *
* @author pablocingolani
*/
public class VcfEffect {
public static boolean debug = false;
public static String ANN_FIELD_NAMES[] = { //
"ALLELE", "GT", "GENOTYPE", //
"EFFECT", "ANNOTATION", //
"IMPACT", //
"GENE", //
"GENEID", //
"FEATURE", //
"FEATUREID", "TRID", //
"BIOTYPE", //
"RANK", "EXID", //
"HGVS_C", "HGVS_DNA", "CODON", //
"HGVS", "HGVS_P", "HGVS_PROT", "AA", //
"POS_CDNA", "CDNA_POS", //
"LEN_CDNA", "CDNA_LEN", //
"POS_CDS", "CDS_POS", //
"LEN_CDS", "CDS_LEN", //
"POS_AA", "AA_POS", //
"LEN_AA", "AA_LEN", //
"DISTANCE", //
"ERRORS", "WARNINGS", "INFOS", //
};
public static String EFF_FIELD_NAMES[] = { //
"EFFECT", "IMPACT", "FUNCLASS", "CODON", //
"AA", //
"HGVS", //
"AA_LEN", //
"GENE", //
"BIOTYPE", //
"CODING", //
"TRID", //
"RANK", "EXID", //
"GT", "GENOTYPE_NUMBER", "GENOTYPE", //
"ERRORS", "WARNINGS", "INFOS", //
};
EffFormatVersion formatVersion;
String vcfFieldString; // Original 'raw' string from VCF Info field
String vcfFieldStrings[]; // Original 'raw' strings from VCF info field: effectString.split()
String effString;
EffectType effectType;
String effectTypesStr;
List effectTypes;
String effectDetails;
int aaLen, aaPos;
int cdsLen, cdsPos;
int cDnaLen, cDnaPos;
int distance;
int rank, rankMax;
BioType bioType;
String codon, aa, hgvsC, hgvsP;
VariantEffect.Coding coding;
String genotype;
String errorsWarnings;
String geneName, geneId, featureType, featureId, transcriptId, exonId;
VariantEffect.EffectImpact impact;
VariantEffect.FunctionalClass funClass;
VariantEffect variantEffect;
boolean useSequenceOntology;
boolean useHgvs;
boolean useGeneId;
boolean useFirstEffect;
/**
* Get info field name based on format version
*/
public static String infoFieldName(EffFormatVersion formatVersion) {
if (formatVersion == null) return EffFormatVersion.VCF_INFO_ANN_NAME;
return formatVersion.infoFieldName();
}
/**
* Return a string safe to be used in an 'EFF' info field (VCF file)
*/
public static String vcfEffSafe(String str) {
return str.replaceAll("(\\s|\\(|\\)|\\[|\\]|;|,|\\|)+", "_");
}
/**
* Constructor: Guess format version
*/
public VcfEffect(String effectString) {
init();
formatVersion = null; // Force guess
vcfFieldString = effectString;
parse();
}
/**
* Constructor: Force format version
* @param formatVersion : If null, will try to guess it
*/
public VcfEffect(String effectString, EffFormatVersion formatVersion) {
init();
this.formatVersion = formatVersion;
vcfFieldString = effectString;
parse();
}
public VcfEffect(VariantEffect variantEffect, EffFormatVersion formatVersion) {
this(variantEffect, formatVersion, true, false);
}
public VcfEffect(VariantEffect variantEffect, EffFormatVersion formatVersion, boolean useSequenceOntology, boolean useFirstEffect) {
init();
this.formatVersion = formatVersion;
this.variantEffect = variantEffect;
this.useSequenceOntology = useSequenceOntology;
this.useFirstEffect = useFirstEffect;
set(variantEffect);
}
/**
* Add subfield to a buffer
*/
void add(StringBuilder sb, Object obj) {
if (obj != null) sb.append(VcfEntry.vcfInfoEncode(obj.toString()));
sb.append("|");
}
public void addEffectType(EffectType effectType) {
effectTypes.add(effectType);
this.effectType = null;
}
/**
* Create 'ANN' field
*/
String createAnnField() {
StringBuilder effBuff = new StringBuilder();
// Allele
add(effBuff, genotype);
// Add main annotation in Sequence Ontology terms
add(effBuff, effectTypesStr);
// Add effect impact
add(effBuff, impact);
// Gene name
add(effBuff, geneName);
// Gene ID
add(effBuff, geneId);
// Feature type
add(effBuff, featureType);
// Feature ID
add(effBuff, featureId);
// Transcript biotype
add(effBuff, bioType);
// Add exon (or intron) rank info
if (rank >= 0) add(effBuff, rank + "/" + rankMax);
else effBuff.append("|");
// HGVS
add(effBuff, hgvsC);
add(effBuff, hgvsP);
// cDNA position / length
if (cDnaPos >= 0) {
add(effBuff, cDnaPos + "/" + cDnaLen);
} else effBuff.append("|");
// CDS position / length
if (cdsPos >= 0) {
add(effBuff, cdsPos + "/" + cdsLen);
} else effBuff.append("|");
// Protein position / protein length
if (aaPos >= 0) {
add(effBuff, aaPos + "/" + aaLen);
} else effBuff.append("|");
// Distance: Mostly used for non-coding variants
if (distance >= 0) add(effBuff, distance);
else effBuff.append("|");
// Errors or warnings (this is the last thing in the list)
effBuff.append(errorsWarnings);
return effBuff.toString();
}
/**
* Create 'EFF' field
*/
String createEffField() {
StringBuilder effBuff = new StringBuilder();
// Add effect
effBuff.append(effectTypesStr);
effBuff.append("(");
// Add effect impact
effBuff.append(impact);
effBuff.append("|");
// Add functional class
effBuff.append(funClass == FunctionalClass.NONE ? "" : funClass.toString()); // Show only if it is not empty
effBuff.append("|");
// Codon change
if (!codon.isEmpty()) effBuff.append(codon);
else if (distance >= 0) effBuff.append(distance);
effBuff.append("|");
// Add HGVS (amino acid change)
if (useHgvs) {
StringBuilder hgvs = new StringBuilder();
if (hgvsP != null) hgvs.append(VcfEntry.vcfInfoEncode(hgvsP));
if (hgvsC != null) {
if (hgvs.length() > 0) hgvs.append('/');
hgvs.append(VcfEntry.vcfInfoEncode(hgvsC));
}
effBuff.append(hgvs.toString());
} else effBuff.append(aa);
effBuff.append("|");
// Add amino acid length
if (formatVersion != EffFormatVersion.FORMAT_EFF_2) { // This field is not in format version 2
effBuff.append(aaLen >= 0 ? aaLen : "");
effBuff.append("|");
}
// Add gene info
if (variantEffect != null) {
Gene gene = variantEffect.getGene();
Transcript tr = variantEffect.getTranscript();
if (gene != null) {
// Gene name
effBuff.append(VcfEntry.vcfInfoValueSafe(useGeneId ? geneId : geneName));
effBuff.append("|");
// Transcript biotype
if (tr != null) {
if ((tr.getBioType() != null) && (tr.getBioType() != null)) effBuff.append(tr.getBioType());
else effBuff.append(tr.isProteinCoding() ? "protein_coding" : ""); // No biotype? Add protein_coding of we know it is.
}
effBuff.append("|");
// Protein coding gene?
String coding = "";
if (gene.getGenome().hasCodingInfo()) coding = (gene.isProteinCoding() ? VariantEffect.Coding.CODING.toString() : VariantEffect.Coding.NON_CODING.toString());
effBuff.append(coding);
effBuff.append("|");
} else if (variantEffect.isRegulation()) {
Regulation reg = (Regulation) variantEffect.getMarker();
effBuff.append("|" + reg.getRegulationType() + "||");
} else if (variantEffect.isCustom()) {
Marker m = variantEffect.getMarker();
if (m != null) effBuff.append("|" + VcfEntry.vcfInfoValueSafe(m.getId()) + "||");
else effBuff.append("|||");
} else effBuff.append("|||");
} else {
// No variantEffect? Use parsed information
effBuff.append(VcfEntry.vcfInfoValueSafe(useGeneId ? geneId : geneName));
effBuff.append("|");
effBuff.append(bioType);
effBuff.append("|");
effBuff.append(coding);
effBuff.append("|");
}
// Add transcript info
effBuff.append(VcfEntry.vcfInfoValueSafe(transcriptId));
effBuff.append("|");
// Add exon (or intron) rank info
effBuff.append(rank >= 0 ? rank : "");
// Add genotype (or genotype difference) for this effect
if (formatVersion == EffFormatVersion.FORMAT_EFF_4) {
effBuff.append("|");
effBuff.append(genotype);
}
//---
// Errors or warnings (this is the last thing in the list)
//---
if (!errorsWarnings.isEmpty()) {
effBuff.append("|");
effBuff.append(errorsWarnings);
}
effBuff.append(")");
return effBuff.toString();
}
/**
* Create INFO field using either 'ANN' or 'EFF' depending on format version
*/
String createInfoField() {
if (formatVersion == null || formatVersion.isAnn()) return createAnnField();
return createEffField();
}
/**
* Guess effect format version
*/
public EffFormatVersion formatVersion() {
// Already set?
if (formatVersion != null && formatVersion.isFullVersion()) return formatVersion;
// Try to guess format
if (formatVersion == null) formatVersion = formatVersion(vcfFieldString);
// Split strings
if (vcfFieldStrings == null) vcfFieldStrings = split(vcfFieldString);
// Now we can guess specific sub-version within each format
if (formatVersion.isAnn()) {
// Easy guess: So far there is only one version
formatVersion = EffFormatVersion.FORMAT_ANN_1;
} else if (formatVersion.isEff()) {
// On of the 'EFF' formats
int len = vcfFieldStrings.length;
// Error or Warning string is not added under normal situations
String lastField = vcfFieldStrings[len - 2]; // Actually last array item is after the last ')', so we use the previous one
if (lastField.startsWith("ERROR") //
|| lastField.startsWith("WARNING") //
|| lastField.startsWith("INFO") //
) len--;
// Guess format
if (len <= 11) formatVersion = EffFormatVersion.FORMAT_EFF_2;
else if (len <= 12) formatVersion = EffFormatVersion.FORMAT_EFF_3;
else formatVersion = EffFormatVersion.FORMAT_EFF_4;
} else {
throw new RuntimeException("Unimplemented formatVersion '" + formatVersion + "'");
}
return formatVersion;
}
/**
* Guess format 'main' version (either 'ANN' of 'EFF') without trying to guess sub-version
*/
protected EffFormatVersion formatVersion(String effectString) {
// Extract string between left and right parenthesis
int idxLp = effectString.indexOf('(');
// No parenthesis at all? Definitively 'ANN'
if (idxLp < 0) return EffFormatVersion.FORMAT_ANN;
// Probably 'EFF': how many sub fields between parenthesis?
int idxRp = effectString.lastIndexOf(')');
if (idxLp < idxRp) {
String paren = effectString.substring(idxLp + 1, idxRp);
String fields[] = paren.split("\\|", -1);
if (fields.length >= 9) return EffFormatVersion.FORMAT_EFF;
}
// Too few sub-fields: It cannot be 'EFF'
return EffFormatVersion.FORMAT_ANN;
}
public String getAa() {
return aa;
}
public int getAaLen() {
return aaLen;
}
public int getAaPos() {
return aaPos;
}
public String getAllele() {
return genotype;
}
public BioType getBioType() {
return bioType;
}
public int getcDnaLen() {
return cDnaLen;
}
public int getcDnaPos() {
return cDnaPos;
}
public int getCdsLen() {
return cdsLen;
}
public int getCdsPos() {
return cdsPos;
}
public VariantEffect.Coding getCoding() {
return coding;
}
public String getCodon() {
return codon;
}
public int getDistance() {
return distance;
}
public String getEffectDetails() {
return effectDetails;
}
public String getEffectsStr() {
StringBuilder sb = new StringBuilder();
for (EffectType et : effectTypes) {
if (sb.length() > 0) sb.append(formatVersion.separator());
sb.append(et);
}
return sb.toString();
}
public String getEffectsStrSo() {
StringBuilder sb = new StringBuilder();
for (EffectType et : effectTypes) {
if (sb.length() > 0) sb.append(formatVersion.separator());
sb.append(et.toSequenceOntology(formatVersion, null));
}
return sb.toString();
}
public EffectType getEffectType() {
if (effectType != null) return effectType;
if (effectTypes == null || effectTypes.isEmpty()) return EffectType.NONE;
// Pick highest effect type
effectType = EffectType.NONE;
for (EffectType et : effectTypes)
if (et.compareTo(effectType) < 0) effectType = et;
return effectType;
}
public List getEffectTypes() {
return effectTypes;
}
public String getEffectTypesStr() {
return effectTypesStr;
}
public String getEffString() {
return effString;
}
public String getErrorsWarning() {
return errorsWarnings;
}
public String getExonId() {
return exonId;
}
public String getFeatureId() {
return featureId;
}
public String getFeatureType() {
return featureType;
}
/**
* Get a subfield by name
*/
public String getFieldByName(String fieldName) {
switch (fieldName) {
case "ALLELE":
case "GT":
case "GENOTYPE":
case "GENOTYPE_NUMBER":
return genotype;
case "EFFECT":
case "ANNOTATION":
return effString;
case "IMPACT":
return impact != null ? impact.toString() : "";
case "FUNCLASS":
return funClass != null ? funClass.toString() : "";
case "GENE":
return geneName;
case "GENEID":
return geneId;
case "FEATURE":
case "FEATURE_TYPE":
return featureType;
case "FEATUREID":
return featureId;
case "TRID":
return transcriptId;
case "BIOTYPE":
return (bioType == null ? "" : bioType.toString());
case "RANK":
return Integer.toString(rank);
case "EXID":
return exonId;
case "RANK_MAX":
return Integer.toString(rankMax);
case "HGVS_C":
case "HGVS_DNA":
return hgvsC;
case "CODON":
return codon;
case "HGVS":
case "HGVS_P":
case "HGVS_PROT":
return hgvsP;
case "AA":
return aa;
case "POS_CDNA":
case "CDNA_POS":
return Integer.toString(cDnaPos);
case "LEN_CDNA":
case "CDNA_LEN":
return Integer.toString(cDnaLen);
case "POS_CDS":
case "CDS_POS":
return Integer.toString(cdsPos);
case "LEN_CDS":
case "CDS_LEN":
return Integer.toString(cdsLen);
case "POS_AA":
case "AA_POS":
return Integer.toString(aaPos);
case "LEN_AA":
case "AA_LEN":
return Integer.toString(aaLen);
case "CODING":
return coding != null ? coding.toString() : "";
case "DISTANCE":
return Integer.toString(distance);
case "ERRORS":
case "WARNINGS":
case "INFOS":
return errorsWarnings;
default:
throw new RuntimeException("Field '" + fieldName + "' not found.");
}
}
public EffFormatVersion getFormatVersion() {
return formatVersion;
}
public VariantEffect.FunctionalClass getFunClass() {
return funClass;
}
public String getGeneId() {
return geneId;
}
public String getGeneName() {
return geneName;
}
public String getGenotype() {
return genotype;
}
public String getHgvsC() {
return hgvsC;
}
public String getHgvsDna() {
return hgvsC;
}
public String getHgvsP() {
return hgvsP;
}
public String getHgvsProt() {
return hgvsP;
}
public VariantEffect.EffectImpact getImpact() {
return impact;
}
public int getRank() {
return rank;
}
public int getRankMax() {
return rankMax;
}
public String getTranscriptId() {
return transcriptId;
}
/**
* String from VCF file (original, unparsed, string)
*/
public String getVcfFieldString() {
return vcfFieldString;
}
/**
* Get a subfield as an index
*/
public String getVcfFieldString(int index) {
if (index >= vcfFieldStrings.length) return null;
return vcfFieldStrings[index];
}
/**
* Does it have 'effType' ?
*/
public boolean hasEffectType(EffectType effType) {
if (effectTypes == null) return false;
for (EffectType et : effectTypes)
if (et == effType) return true;
return false;
}
void init() {
aaLen = aaPos = cdsLen = cdsPos = cDnaLen = cDnaPos = distance = rank = rankMax = -1;
vcfFieldString = effString = effectTypesStr = effectDetails = codon = aa = hgvsC = hgvsP = genotype = errorsWarnings = geneName = geneId = featureType = featureId = transcriptId = exonId = errorsWarnings = "";
bioType = null;
impact = null;
funClass = FunctionalClass.NONE;
useSequenceOntology = true;
useHgvs = true;
useGeneId = false;
}
/**
* Parse annotations either in 'ANN' or 'EFF' INFO field
*/
void parse() {
// Guess format, if not given
if (formatVersion == null || !formatVersion.isFullVersion()) formatVersion = formatVersion();
// Split strings
vcfFieldStrings = split(vcfFieldString);
// Parse
if (formatVersion.isAnn()) parseAnn();
else parseEff();
}
/**
* Parse 'ANN' field
*/
void parseAnn() {
int index = 0;
// Gentype
genotype = vcfFieldStrings[index++];
// Annotation
effString = vcfFieldStrings[index];
effectTypesStr = vcfFieldStrings[index];
effectTypes = parseEffect(vcfFieldStrings[index]);
index++;
// Impact
impact = VariantEffect.EffectImpact.valueOf(vcfFieldStrings[index++]);
// Gene name
geneName = vcfFieldStrings[index++];
// Gene ID
geneId = vcfFieldStrings[index++];
// Feature type
featureType = vcfFieldStrings[index++];
// Feature ID
featureId = vcfFieldStrings[index++];
if (featureType.equals("transcript")) transcriptId = featureId;
// Biotype
bioType = BioType.parse(vcfFieldStrings[index++]);
// Rank '/' rankMax
Tuple ints = parseSlash(vcfFieldStrings[index++]);
rank = ints.first;
rankMax = ints.second;
// HGVS
hgvsC = VcfEntry.vcfInfoDecode(vcfFieldStrings[index++]);
codon = hgvsC;
hgvsP = VcfEntry.vcfInfoDecode(vcfFieldStrings[index++]);
aa = hgvsP;
// cDna: 'pos / len'
ints = parseSlash(vcfFieldStrings[index++]);
cDnaPos = ints.first;
cDnaLen = ints.second;
// CDS: 'pos / len'
ints = parseSlash(vcfFieldStrings[index++]);
cdsPos = ints.first;
cdsLen = ints.second;
// AA: 'pos / len'
ints = parseSlash(vcfFieldStrings[index++]);
aaPos = ints.first;
aaLen = ints.second;
// Distance
distance = Gpr.parseIntSafe(vcfFieldStrings[index++]);
// Errors , warnings, info
errorsWarnings = vcfFieldStrings[index++];
}
/**
* Parse 'EFF' field
*/
void parseEff() {
try {
// Parse each sub field
int index = 0;
// Effect
effString = vcfFieldStrings[index];
effectTypesStr = vcfFieldStrings[index];
effectTypes = parseEffect(vcfFieldStrings[index]);
effectDetails = parseEffectDetails(vcfFieldStrings[index]); // Effect details: everything between '[' and ']' (e.g. Regulation, Custom, Motif, etc.)
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) impact = VariantEffect.EffectImpact.valueOf(vcfFieldStrings[index]);
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) funClass = VariantEffect.FunctionalClass.valueOf(vcfFieldStrings[index]);
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) codon = vcfFieldStrings[index];
index++;
// Parse 'AA' and HGVS
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) aa = vcfFieldStrings[index];
if (aa.indexOf('/') > 0) {
String f[] = aa.split("/");
// HGVS Protein
if (f.length > 0 && f[0].startsWith("p.")) {
hgvsP = VcfEntry.vcfInfoDecode(f[0]);
}
// HGVS DNA
if ((f.length > 1) && (f[1].startsWith("c.") || f[1].startsWith("n."))) hgvsC = VcfEntry.vcfInfoDecode(f[1]);
} else if (aa.startsWith("c.") || aa.startsWith("n.")) {
hgvsC = aa = VcfEntry.vcfInfoDecode(aa); // Only HGVS DNA
}
index++;
if (formatVersion != EffFormatVersion.FORMAT_EFF_2) {
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) aaLen = Gpr.parseIntSafe(vcfFieldStrings[index]);
else aaLen = 0;
index++;
}
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) geneName = vcfFieldStrings[index];
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) bioType = BioType.parse(vcfFieldStrings[index]);
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) coding = VariantEffect.Coding.valueOf(vcfFieldStrings[index]);
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) transcriptId = vcfFieldStrings[index];
index++;
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) exonId = vcfFieldStrings[index];
index++;
if (formatVersion == EffFormatVersion.FORMAT_EFF_4) {
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) genotype = vcfFieldStrings[index];
else genotype = "";
index++;
}
if ((vcfFieldStrings.length > index) && !vcfFieldStrings[index].isEmpty()) errorsWarnings = vcfFieldStrings[index];
index++;
} catch (Exception e) {
String fields = "";
for (int i = 0; i < vcfFieldStrings.length; i++)
fields += "\t" + i + " : '" + vcfFieldStrings[i] + "'\n";
throw new RuntimeException("Error parsing:\n\t'" + vcfFieldString + "'\n\t EFF formatVersion : " + formatVersion + "\n" + fields, e);
}
}
/**
* Parse effect sub-field
*/
List parseEffect(String eff) {
int idx = eff.indexOf('[');
if (idx > 0) eff = eff.substring(0, idx);
List effs = new LinkedList<>();
if (eff.isEmpty()) return effs;
// Split multiple effectTypes
if (eff.indexOf(EffFormatVersion.EFFECT_TYPE_SEPARATOR_OLD) >= 0) {
// Old version
for (String es : eff.split("\\" + EffFormatVersion.EFFECT_TYPE_SEPARATOR_OLD))
effs.add(EffectType.parse(formatVersion, es));
} else {
// Split effect strings
for (String es : eff.split(formatVersion.separatorSplit()))
effs.add(EffectType.parse(formatVersion, es));
}
return effs;
}
/**
* Parse effect details.
* E.g. NEXT_PROT[amino_acid_modification:Phosphoserine] returns "amino_acid_modification:Phosphoserine"
*/
String parseEffectDetails(String eff) {
int idx = eff.indexOf('[');
if (idx < 0) return "";
return eff.substring(idx + 1, eff.length() - 1);
}
/**
* Parse two integers separated by a slash
*/
Tuple parseSlash(String str) {
int i1 = -1, i2 = -1;
if (str != null && !str.isEmpty()) {
String fields[] = str.split("/");
if (fields.length >= 2) {
// Two numbers separated by a slash
i1 = Gpr.parseIntSafe(fields[0]);
i2 = Gpr.parseIntSafe(fields[1]);
} else {
// Only one number?
i1 = Gpr.parseIntSafe(fields[0]);
}
}
return new Tuple<>(i1, i2);
}
/**
* Set all fields form 'variantEffect'
*/
void set(VariantEffect variantEffect) {
// Allele
Variant variant = variantEffect.getVariant();
Gene gene = variantEffect.getGene();
Marker marker = variantEffect.getMarker();
Transcript tr = variantEffect.getTranscript();
// Genotype
if (variant.getGenotype() != null) genotype = variant.getGenotype();
else if (!variant.isVariant()) genotype = variant.getReference();
else genotype = variant.getAlt();
// else if (var.isNonRef()) genotype = var.getGenotype();
// Effect
effectType = variantEffect.getEffectType();
effectTypes = variantEffect.getEffectTypes();
if (formatVersion.isAnn()) {
effectTypesStr = variantEffect.getEffectTypeString(true, useFirstEffect, formatVersion);
} else {
effectTypesStr = variantEffect.effect(true, false, false, useSequenceOntology, useFirstEffect);
}
// Impact
impact = variantEffect.getEffectImpact();
// Functional class
funClass = variantEffect.getFunctionalClass();
// Gene
List genes = variantEffect.isMultipleGenes() ? variantEffect.getGenes() : null;
if (genes != null) { // Multiple genes
setGeneNameIdMultiple(variantEffect);
} else if (gene != null) {
geneName = gene.getGeneName();
geneId = gene.getId();
} else if (marker instanceof Intergenic) {
geneName = ((Intergenic) marker).getName();
geneId = marker.getId();
} else {
geneName = geneId = "";
}
// Feature type & ID
featureType = featureId = "";
if (marker != null) {
if (marker instanceof Custom) {
// Custom
featureType = marker.getType() + formatVersion.separator() + ((Custom) marker).getLabel();
featureId = marker.getId();
} else if (marker instanceof Regulation) {
// Regulation includes cell type
Regulation reg = (Regulation) marker;
featureType = reg.getType() + formatVersion.separator() + reg.getName() + ":" + reg.getRegulationType();
featureId = marker.getId();
} else if (marker instanceof NextProt) {
featureType = marker.getId();
featureId = ((NextProt) marker).getTranscriptId();
} else if (marker instanceof Motif) {
Motif motif = (Motif) marker;
featureType = motif.getPwmName();
featureId = motif.getPwmId();
} else if (marker instanceof ProteinStructuralInteractionLocus) {
featureType = "interaction";
featureId = marker.getId();
} else if (marker instanceof ProteinProteinInteractionLocus) {
featureType = "interaction";
featureId = marker.getId();
} else if (tr != null) {
featureType = "transcript";
featureId = transcriptId = tr.getId();
// Append version number (this is recommended by HGVS specification)
if (tr.getVersion() != null && !tr.getVersion().isEmpty()) featureId += "." + tr.getVersion();
} else {
featureType = marker.getType().toSequenceOntology(formatVersion, null);
featureId = marker.getId();
}
}
// Biotype
if (tr != null) {
if ((tr.getBioType() != null) && (tr.getBioType() != null)) {
bioType = tr.getBioType();
} else {
// No biotype? Add protein_coding of we know it is.
bioType = BioType.coding(tr.isProteinCoding());
}
} else {
bioType = null;
}
// Find and set rank and rankMax
setRank();
// Codon change
codon = variantEffect.getCodonChangeMax();
// AA change
aa = variantEffect.getAaChange();
// HGVS notation
hgvsC = variantEffect.getHgvsDna();
hgvsP = variantEffect.getHgvsProt();
// cDna position & len (cDNA is the DNA version of mRNA)
if (tr != null) {
cDnaPos = variantEffect.getcDnaPos();
if (cDnaPos >= 0 && formatVersion.isAnn()) cDnaPos++; // 1-based position;
cDnaLen = tr.mRna().length();
} else {
cDnaPos = cDnaLen = -1;
}
// CDS position / length
if (tr != null) {
cdsPos = variantEffect.getCodonNum() * 3 + variantEffect.getCodonIndex();
if (cdsPos >= 0 && formatVersion.isAnn()) cdsPos++; // 1-based position;
cdsLen = variantEffect.getCdsLength();
} else {
cdsPos = cdsLen = -1;
}
// Protein position / protein length
if (tr != null) {
aaPos = variantEffect.getCodonNum();
if (aaPos >= 0 && formatVersion.isAnn()) aaPos++; // 1-based position;
aaLen = variantEffect.getAaLength();
} else {
aaPos = aaLen = -1;
}
// Distance: Mostly used for non-coding variants
distance = variantEffect.getDistance();
if (variantEffect.hasError() || variantEffect.hasWarning()) {
StringBuilder err = new StringBuilder();
// Add errors
if (!variantEffect.getError().isEmpty()) {
err.append(variantEffect.getError());
}
// Add warnings
if (!variantEffect.getWarning().isEmpty()) {
if (err.length() > 0) err.append(formatVersion.separator());
err.append(variantEffect.getWarning());
}
errorsWarnings = err.toString();
}
}
public void setAa(String aa) {
this.aa = aa;
}
public void setAaLen(int aaLen) {
this.aaLen = aaLen;
}
public void setBioType(BioType bioType) {
this.bioType = bioType;
}
public void setCoding(VariantEffect.Coding coding) {
this.coding = coding;
}
public void setCodon(String codon) {
this.codon = codon;
}
public void setEffectDetails(String effectDetails) {
this.effectDetails = effectDetails;
}
public void setEffectType(EffectType effect) {
effectTypes = new LinkedList<>();
addEffectType(effect);
}
public void setExonId(String exonId) {
this.exonId = exonId;
}
public void setFormatVersion(EffFormatVersion formatVersion) {
this.formatVersion = formatVersion;
}
public void setFunClass(VariantEffect.FunctionalClass funClass) {
this.funClass = funClass;
}
public void setGeneId(String geneId) {
this.geneId = geneId;
}
public void setGeneName(String geneName) {
this.geneName = geneName;
}
/**
* Structural variant having multiple genes: Set all geneNames and geneIds
*/
void setGeneNameIdMultiple(VariantEffect variantEffect) {
// Get all genes
List genes = variantEffect.getGenes();
// Sort by geneName
Collections.sort(genes, new Comparator() {
@Override
public int compare(Gene g1, Gene g2) {
return g1.getGeneName().compareTo(g2.getGeneName());
}
});
// Create gene name and geneId strings
StringBuilder geneNames = new StringBuilder();
StringBuilder geneIds = new StringBuilder();
String sep = "";
for (Gene g : genes) {
geneNames.append(sep + g.getGeneName());
geneIds.append(sep + g.getId());
if (sep.isEmpty()) sep = formatVersion.separator();
}
geneName = geneNames.toString();
geneId = geneIds.toString();
}
public void setGenotype(String genotype) {
this.genotype = genotype;
}
public void setImpact(VariantEffect.EffectImpact impact) {
this.impact = impact;
}
/**
* Find and set rank and rank max
*/
void setRank() {
Transcript tr = variantEffect.getTranscript();
// Rank
Exon ex = variantEffect.getExon();
rank = -1;
if (ex != null) {
rank = ex.getRank();
rankMax = tr.numChilds();
return;
}
// Do we have an intron?
Intron intron = variantEffect.getIntron();
if (intron != null) {
rank = intron.getRank();
rankMax = Math.max(0, tr.numChilds() - 1);
return;
}
if (tr == null) return;
// Exon not explicitly set. Try to find it
Variant variant = variantEffect.getVariant();
for (Exon e : tr)
if (e.intersects(variant)) {
rank = e.getRank();
rankMax = tr.numChilds();
return;
}
// Intron not explicitly set. Try to find it.
List introns = tr.introns();
for (Intron in : introns) {
if (in.intersects(variant)) {
rank = in.getRank();
rankMax = introns.size();
return;
}
}
// Nothing found
rank = rankMax = -1;
}
public void setTranscriptId(String transcriptId) {
this.transcriptId = transcriptId;
}
public void setUseFirstEffect(boolean useFirstEffect) {
this.useFirstEffect = useFirstEffect;
}
public void setUseGeneId(boolean useGeneId) {
this.useGeneId = useGeneId;
}
public void setUseHgvs(boolean useHgvs) {
this.useHgvs = useHgvs;
}
/**
* Split a 'effect' string to an array of strings
*/
public String[] split(String eff) {
// ANN format versions
if (formatVersion.isAnn()) {
// Negative number means "use trailing empty as well"
return eff.replace('|', '\t').split("\t", -1);
}
// EFF format version
if (formatVersion.isEff()) {
int idxBr = eff.indexOf('[');
int idxParen = eff.indexOf('(');
String eff0 = null;
if ((idxBr >= 0) && (idxBr < idxParen)) {
int idxRbr = eff.indexOf(']');
eff0 = eff.substring(0, idxRbr + 1);
eff = eff.substring(idxRbr);
}
eff = eff.replace('(', '\t'); // Replace all chars by spaces
eff = eff.replace('|', '\t');
eff = eff.replace(')', '\t');
String effs[] = eff.split("\t", -1); // Negative number means "use trailing empty as well"
if (eff0 != null) effs[0] = eff0;
return effs;
}
throw new RuntimeException("Unimplemented format version '" + formatVersion + "'");
}
@Override
public String toString() {
if (formatVersion == null || formatVersion.isAnn()) return createAnnField();
return createEffField();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy