org.snpeff.nextProt.NextProtParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of SnpEff Show documentation
Variant annotation and effect prediction package.
The newest version!
package org.snpeff.nextProt;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import org.snpeff.codons.CodonTables;
import org.snpeff.collections.AutoHashMap;
import org.snpeff.interval.Gene;
import org.snpeff.interval.Genome;
import org.snpeff.interval.Marker;
import org.snpeff.interval.Markers;
import org.snpeff.interval.NextProt;
import org.snpeff.interval.Transcript;
import org.snpeff.snpEffect.Config;
import org.snpeff.stats.CountByType;
import org.snpeff.util.Gpr;
import org.snpeff.util.GprSeq;
import org.snpeff.util.Timer;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Parse NetxProt XML file and build a database
 *
 * http://www.nextprot.org/
 *
 * @author pablocingolani
 */
public class NextProtParser {

	public static final double HIGHLY_CONSERVED_AA_PERCENT = 0.99;

	public static final int HIGHLY_CONSERVED_AA_COUNT = 30;

	// We don't care about these categories
	public static final String CATAGORY_BLACK_LIST_STR[] = { "" //
			, "expression-info" //
			, "mature-protein" //
			, "mature protein" //
			, "mutagenesis site" //
			, "mutagenesis-site" //
			, "mutagenesis" //
			, "pdb-mapping" //
			, "peptide-mapping" //
			, "retained intron" //
			, "sequence conflict" //
			, "sequence-conflict" //
			, "sequence variant" //
			, "sequence-variant" //
			, "srm-peptide-mapping" //
			, "variant" //
	};

	protected String NODE_NAME_PROTEIN;
	protected String NODE_NAME_GENE;
	protected String NODE_NAME_TRANSCRIPT;
	protected String NODE_NAME_ANNOTATION;
	protected String NODE_NAME_ANNOTATION_LIST;
	protected String NODE_NAME_POSITION;
	protected String NODE_NAME_PROPERTY;
	protected String NODE_NAME_DESCRIPTION;
	protected String NODE_NAME_CVNAME;
	protected String NODE_NAME_SEQUENCE;
	protected String NODE_NAME_XREF;

	protected String ATTR_NAME_UNIQUE_NAME;
	protected String ATTR_NAME_DATABASE;
	protected String ATTR_NAME_ACCESSION;
	protected String ATTR_NAME_ANNOTATION_LIST;
	protected String ATTR_NAME_CATAGORY;
	protected String ATTR_NAME_FIRST;
	protected String ATTR_NAME_LAST;
	protected String ATTR_NAME_ISOFORM_REF;
	protected String ATTR_NAME_PROPERTY_NAME;
	protected String ATTR_NAME_VALUE;

	protected String ATTR_VALUE_ENSEMBL;
	protected String ATTR_VALUE_REFSEQ;
	protected String ATTR_VALUE_NUCLEOTIDE_SEQUENCE_ID;

	boolean debug;
	boolean verbose;
	String trIdFile;
	HashSet categoryBlackList;
	HashMap trIdByUniqueName;
	HashMap trIdMap;
	HashMap sequenceByUniqueName;
	AutoHashMap countAaSequenceByType;
	HashMap trById;
	HashSet proteinDifferences = new HashSet<>();
	HashSet proteinOk = new HashSet<>();
	Markers markers;
	Config config;
	Genome genome;
	int aaErrors;

	public NextProtParser(Config config) {
		this.config = config;
		genome = config.getGenome();
		trIdByUniqueName = new HashMap<>();
		sequenceByUniqueName = new HashMap<>();
		countAaSequenceByType = new AutoHashMap<>(new CountByType());
		trById = new HashMap<>();
		markers = new Markers();

		defineNextProtXmlTerms();

		// Create and populate black list
		categoryBlackList = new HashSet<>();
		for (String cat : CATAGORY_BLACK_LIST_STR)
			categoryBlackList.add(cat);
	}

	void addTr(Transcript tr) {
		String trId = tr.getId();
		trById.put(trId, tr);

		// Find another transcript ID in the map
		String id = trIdMap.get(trId);
		if (id != null) trById.put(id, tr);

		// Remove transcript version (if any)
		if (trId.indexOf('.') > 0) {
			trId = trId.split("\\.")[0];
			id = trIdMap.get(trId);
			if (id != null) trById.put(id, tr);
		}
	}

	void addTranscripts() {
		// Read transcript ids
		readTrIdMap();

		// Build transcript map
		for (Gene gene : config.getSnpEffectPredictor().getGenome().getGenes())
			for (Transcript tr : gene)
				addTr(tr);
	}

	/**
	 * Show annotations counters in a table
	 */
	void analyzeSequenceConservation() {
		if (verbose) Timer.showStdErr("Sequence conservation analysis."//
				+ "\n\tAA sequence length  : " + 1 //
				+ "\n\tMin AA count        : " + HIGHLY_CONSERVED_AA_COUNT //
				+ "\n\tMin AA conservation : " + HIGHLY_CONSERVED_AA_PERCENT //
		);

		ArrayList keys = new ArrayList<>();
		keys.addAll(countAaSequenceByType.keySet());
		Collections.sort(keys);

		// Show title
		StringBuilder title = new StringBuilder();
		for (char aa : GprSeq.AMINO_ACIDS)
			title.append(aa + "\t");
		title.append("\t" + title);
		if (verbose) System.out.println("Amino acid regions:\n\tTotal\tMax count\tAvg len\tConservation\tCatergory\tControlled Vocabulary\t" + title + "\tOther AA sequences:");

		// Show AA counts for each 'key'
		for (String key : keys) {
			long seqLen = 0, totalSeqs = 0, maxCount = 0;
			CountByType cbt = countAaSequenceByType.get(key);
			long total = cbt.sum();

			boolean highlyConservedAaSequence = false;

			StringBuilder sb = new StringBuilder();

			// For each single amino acid "sequence"
			for (char aa : GprSeq.AMINO_ACIDS) {
				long count = cbt.get("" + aa);
				if (count > 0) {
					seqLen += 1 * count;
					totalSeqs += count;
					maxCount = Math.max(maxCount, count);

					sb.append(count);
					double perc = ((double) count) / total;

					// We estimate that if most AA are the same, then changing this AA can cause a high impact in protein coding
					if ((perc > HIGHLY_CONSERVED_AA_PERCENT) && (total >= HIGHLY_CONSERVED_AA_COUNT)) highlyConservedAaSequence = true;
				}
				sb.append("\t");
			}

			// Sequences of more than one AA
			for (String aas : cbt.keySet()) {
				long count = cbt.get(aas);
				double perc = ((double) count) / total;
				if (aas.length() > 1) {
					seqLen += aas.length() * count;
					totalSeqs += count;
					maxCount = Math.max(maxCount, count);

					sb.append(String.format("\t" + aas + ":" + count));
					if ((perc > HIGHLY_CONSERVED_AA_PERCENT) && (total >= HIGHLY_CONSERVED_AA_COUNT)) highlyConservedAaSequence = true;
				}
			}

			long avgLen = seqLen / totalSeqs;

			// Show line
			if (verbose) System.out.println( //
					"\t" + total //
							+ "\t" + maxCount //
							+ "\t" + avgLen //
							+ "\t" + (highlyConservedAaSequence ? "High" : "") //
							+ "\t" + key //
							+ "\t" + sb //
			);

			// Mark highly conserved
			if (highlyConservedAaSequence) {
				int count = 0;
				for (Marker m : markers) {
					NextProt nextProt = (NextProt) m;
					if (m.getId().equals(key)) {
						nextProt.setHighlyConservedAaSequence(true);
						count++;
					}
				}

				if (verbose) Timer.showStdErr("NextProt " + count + " markers type '" + key + "' marked as highly conserved AA sequence");
			}
		}
	}

	/**
	 * Add annotations
	 */
	void countAaSequence(String category, String contrVoc, String description, String sequence) {
		String key = key(category, contrVoc, description);
		CountByType cbt = countAaSequenceByType.getOrCreate(key);
		cbt.inc(sequence);
	}

	protected void defineNextProtXmlTerms() {
		// Define NextProt XML terms
		NODE_NAME_PROTEIN = "protein";
		NODE_NAME_GENE = "gene";
		NODE_NAME_TRANSCRIPT = "transcript";
		NODE_NAME_ANNOTATION = "annotation";
		NODE_NAME_ANNOTATION_LIST = "annotationList";
		NODE_NAME_POSITION = "position";
		NODE_NAME_PROPERTY = "property";
		NODE_NAME_DESCRIPTION = "description";
		NODE_NAME_CVNAME = "cvName";
		NODE_NAME_SEQUENCE = "sequence";
		NODE_NAME_XREF = "xref";

		ATTR_NAME_UNIQUE_NAME = "uniqueName";
		ATTR_NAME_DATABASE = "database";
		ATTR_NAME_ACCESSION = "accession";
		ATTR_NAME_ANNOTATION_LIST = "annotationList";
		ATTR_NAME_CATAGORY = "category";
		ATTR_NAME_FIRST = "first";
		ATTR_NAME_LAST = "last";
		ATTR_NAME_ISOFORM_REF = "isoformRef";
		ATTR_NAME_PROPERTY_NAME = "propertyName";
		ATTR_NAME_VALUE = "value";

		ATTR_VALUE_ENSEMBL = "Ensembl";
		ATTR_VALUE_REFSEQ = "RefSeq";
		ATTR_VALUE_NUCLEOTIDE_SEQUENCE_ID = "'nucleotide sequence ID";
	}

	/**
	 * Show an error message and exit
	 */
	protected void fatalError(String message) {
		System.err.println("Fatal error: " + message);
		System.exit(-1);
	}

	/**
	 * Parse a node
	 */
	ArrayList findNodes(Node node, String nodeName, String nodeValue, String attrName, String attrValue) {
		ArrayList resulstsList = new ArrayList<>();

		while (node != null) {
			boolean found = false;
			short type = node.getNodeType();

			//---
			// Get name & value
			//---
			String name = node.getNodeName();
			String value = node.getNodeValue();
			if (value != null) value = value.replace('\n', ' ').trim();

			//---
			// Get attributes
			//---
			StringBuilder attrSb = new StringBuilder();
			if ((attrName != null) || (attrValue != null)) { // Are attributes required? (don't parse if they are not needed
				NamedNodeMap map = node.getAttributes();
				if (map != null) {
					for (int i = 0; i < map.getLength(); i++) {
						Node attr = map.item(i);
						if (attrSb.length() > 0) attrSb.append(", ");

						String aname = attr.getNodeName();
						String aval = attr.getNodeValue();
						attrSb.append(aname + "=" + aval);

						if (((nodeName == null) || ((name != null) && name.equals(nodeName))) //
								&& ((nodeValue == null) || ((value != null) && value.equals(nodeValue))) //
								&& ((attrName == null) || ((aname != null) && attrName.equals(aname))) //
								&& ((attrValue == null) || ((aval != null) && attrValue.equals(aval))) //
						) found = true;
					}
				}
			} else {
				if (((nodeName == null) || ((name != null) && name.equals(nodeName))) //
						&& ((nodeValue == null) || ((value != null) && value.equals(nodeValue))) //
				) {
					found = true;
				}
			}

			if (found) resulstsList.add(node);

			//---
			// Show node
			//---
			switch (type) {
			case Node.ELEMENT_NODE:
				NodeList nodeList = node.getChildNodes();
				resulstsList.addAll(findNodes(nodeList, nodeName, nodeValue, attrName, attrValue));
				node = node.getNextSibling();
				break;
			case Node.TEXT_NODE:
				node = null;
				break;
			case Node.CDATA_SECTION_NODE:
				node = null;
				break;
			default:
				node = null;
			}
		}

		return resulstsList;
	}

	/**
	 * Parse a node list
	 */
	List findNodes(NodeList nodeList, String nodeName, String nodeValue, String attrName, String attrValue) {
		ArrayList resulstsList = new ArrayList<>();

		for (int temp = 0; temp < nodeList.getLength(); temp++) {
			Node node = nodeList.item(temp);
			resulstsList.addAll(findNodes(node, nodeName, nodeValue, attrName, attrValue));
		}

		return resulstsList;
	}

	/**
	 * Find only one node
	 */
	Node findOneNode(Node node, String nodeName, String nodeValue, String attrName, String attrValue) {
		ArrayList resulstsList = findNodes(node, nodeName, nodeValue, attrName, attrValue);
		if (resulstsList.isEmpty()) return null;
		return resulstsList.get(0);
	}

	/**
	 * Find sequences for a node
	 */
	void findSequences(Node node) {
		// Get sequences
		List seqNodes = findNodes(node, NODE_NAME_SEQUENCE, null, null, null);
		for (Node seq : seqNodes) {
			String seqStr = getText(seq);
			String uniq = getUniqueNameSequence(seq);
			sequenceByUniqueName.put(uniq, seqStr);
		}
	}

	/**
	 * Get uniqueId -> EnsemblId mapping for transcripts
	 * @return true if any was found
	 */
	boolean findTrIds(Node node) {
		boolean added = false;

		// Find Ensembl transcript ID
		List ensemblTrIds = findNodes(node, NODE_NAME_TRANSCRIPT, null, ATTR_NAME_DATABASE, ATTR_VALUE_ENSEMBL);
		for (Node trNode : ensemblTrIds) {
			// Get Ensembl ID
			String trId = getAttribute(trNode, ATTR_NAME_ACCESSION);

			// Get Unique ID
			String trUniqName = getUniqueNameTranscript(trNode);

			// Add to map
			trIdByUniqueName.put(trUniqName, trId);
			added = true;
		}

		return added;
	}

	/**
	 * Get AA end position from position node
	 */
	protected int getAaEnd(Node posNode) {
		String last = getAttribute(posNode, ATTR_NAME_LAST);
		int aaEnd = Gpr.parseIntSafe(last) - 1;
		return aaEnd;
	}

	/**
	 * Get AA start position from position node
	 */
	protected int getAaStart(Node posNode) {
		String first = getAttribute(posNode, ATTR_NAME_FIRST);
		int aaStart = Gpr.parseIntSafe(first) - 1;
		return aaStart;
	}

	/**
	 * Get annotation description from annotation node
	 */
	String getAnnDescription(Node annNode) {
		Node descr = findOneNode(annNode, NODE_NAME_DESCRIPTION, null, null, null);
		String description = getText(descr);
		if (description == null) description = "";
		else if (description.indexOf(';') > 0) description = description.substring(0, description.indexOf(';')); // Cut after semicolon
		return description;
	}

	List getAnnotationCategories(Node node) {
		List annListNodes = findNodes(node, NODE_NAME_ANNOTATION_LIST, null, null, null);
		return annListNodes;
	}

	/**
	 * Get an attribute from a node
	 */
	String getAttribute(Node node, String attrName) {
		if (node == null) return null;

		NamedNodeMap map = node.getAttributes();
		if (map == null) return null;

		Node attrNode = map.getNamedItem(attrName);
		if (attrNode == null) return null;

		return attrNode.getNodeValue();
	}

	/**
	 * Get controlled vocabulary term from annotation node
	 */
	String getControlledVocubulary(Node annNode) {
		// Controlled vocabulary
		Node cv = findOneNode(annNode, NODE_NAME_CVNAME, null, null, null);
		String contrVoc = getText(cv);
		if (contrVoc == null) contrVoc = "";

		contrVoc.indexOf(';');
		String cvs[] = contrVoc.split(";", 2);
		if (cvs.length > 1) return cvs[0];
		return contrVoc;
	}

	/**
	 * Get Ensembl gene ID
	 */
	String getGeneId(Node node, String uniqueName) {
		Node geneNode = findOneNode(node, NODE_NAME_GENE, null, ATTR_NAME_DATABASE, ATTR_VALUE_ENSEMBL);
		return getAttribute(geneNode, ATTR_NAME_ACCESSION);
	}

	protected String getIsoformRefFromPos(Node posNode) {
		Node isoAnn = posNode.getParentNode().getParentNode();
		String isoformRef = getAttribute(isoAnn, ATTR_NAME_ISOFORM_REF);
		return isoformRef;
	}

	public Markers getMarkers() {
		return markers;
	}

	/**
	 * Get text form a node
	 */
	String getText(Node n) {
		if (n == null) return null;
		return n.getTextContent().replace('\n', ' ').trim();
	}

	String getUniqueNameSequence(Node seqNode) {
		Node iso = seqNode.getParentNode();
		String seqUniqName = getAttribute(iso, ATTR_NAME_UNIQUE_NAME);
		return seqUniqName;
	}

	String getUniqueNameTranscript(Node trNode) {
		Node isoMap = trNode.getParentNode();
		String trUniqName = getAttribute(isoMap, ATTR_NAME_UNIQUE_NAME);
		return trUniqName;
	}

	/**
	 * Create a key
	 */
	String key(String category, String contrVoc, String description) {
		category = vcfSafe(category);

		if ((description == null) || description.isEmpty()) description = contrVoc;
		description = vcfSafe(description);

		if (description.isEmpty()) return category;
		return category + ":" + description;
	}

	/**
	 * Get node type as a string
	 */
	String nodeType(short type) {
		switch (type) {
		case Node.ATTRIBUTE_NODE:
			return "ATTRIBUTE_NODE";
		case Node.CDATA_SECTION_NODE:
			return "CDATA_SECTION_NODE";
		case Node.COMMENT_NODE:
			return "COMMENT_NODE";
		case Node.DOCUMENT_FRAGMENT_NODE:
			return "DOCUMENT_FRAGMENT_NODE";
		case Node.DOCUMENT_NODE:
			return "DOCUMENT_NODE";
		case Node.DOCUMENT_POSITION_CONTAINED_BY:
			return "DOCUMENT_POSITION_CONTAINED_BY";
		case Node.DOCUMENT_TYPE_NODE:
			return "DOCUMENT_TYPE_NODE";
		case Node.ELEMENT_NODE:
			return "ELEMENT_NODE";
		case Node.ENTITY_NODE:
			return "ENTITY_NODE";
		case Node.ENTITY_REFERENCE_NODE:
			return "ENTITY_REFERENCE_NODE";
		case Node.NOTATION_NODE:
			return "NOTATION_NODE";
		case Node.PROCESSING_INSTRUCTION_NODE:
			return "PROCESSING_INSTRUCTION_NODE";
		case Node.TEXT_NODE:
			return "TEXT_NODE";
		default:
			throw new RuntimeException("Unknown");
		}
	}

	/**
	 * Parse  XML document
	 */
	public void parse(Node doc) {
		addTranscripts();

		if (verbose) Timer.showStdErr("Parsing XML data.");
		List nodeList = findNodes(doc.getChildNodes(), NODE_NAME_PROTEIN, null, null, null);
		if (verbose) Timer.showStdErr("Found " + nodeList.size() + " protein nodes");

		// Parse each node
		for (Node node : nodeList) {
			if (debug) Gpr.debug("Processing protein node: " + toString(node));
			parseProteinNode(node);
		}

		analyzeSequenceConservation();
	}

	/**
	 * Parse a protein node
	 */
	void parseAnnotation(Node ann, String geneId, String category) {
		if (debug) Gpr.debug("\t\tAnnotation: " + toString(ann) + "\tCategory: " + category);
		String description = getAnnDescription(ann);
		String contrVoc = getControlledVocubulary(ann);

		// Search annotations
		List posNodes = findNodes(ann, NODE_NAME_POSITION, null, null, null);
		for (Node pos : posNodes) {
			if (debug) Gpr.debug("\t\t\tPosition: " + toString(pos));
			// Get first & last position
			int aaStart = getAaStart(pos);
			int aaEnd = getAaStart(pos);
			if (aaStart < 0 || aaEnd < 0) continue;
			int len = aaEnd - aaStart + 1;

			// Get ID
			String isoformRef = getIsoformRefFromPos(pos);

			// Find sequence
			String sequence = sequenceByUniqueName.get(isoformRef);
			String subSeq = "";
			if ((sequence != null) //
					&& (aaStart >= 0) //
					&& (aaEnd >= aaStart) //
					&& (aaEnd < sequence.length()) //
			) subSeq = sequence.substring(aaStart, aaEnd + 1);

			// Check transcript
			TranscriptData trData = transcriptData(isoformRef, aaStart, aaEnd, sequence, subSeq);

			// Create nextProt markers
			if (!trData.ok || (len <= 0)) continue;

			// Create marker
			String id = key(category, contrVoc, description);
			NextProt nextProt = new NextProt(trData.tr, trData.chrPosStart, trData.chrPosEnd, id);
			markers.add(nextProt);
			if (debug) Gpr.debug("Added NextProt entry:" + nextProt //
					+ "\n\tgeneId:" + geneId //
					+ "\n\tisoformRef:" + isoformRef //
					+ "\n\ttrId:" + trData.tr.getId() //
					+ "\n\tcategory:'" + category + "'" //
					+ "\n\tdescription:'" + description + "'" //
					+ "\n\tcontrolled_vocabulary:" + contrVoc //
					+ "\n\taaStart:" + aaStart//
					+ "\n\taaEnd:" + aaEnd //
					+ "\n\taaLen:" + len //
					+ "\n\tchr:" + trData.chrName //
					+ "\n\tstart:" + trData.chrPosStart //
					+ "\n\tend:" + trData.chrPosEnd //
					+ "\n\tsubSeq:" + subSeq //
					+ "\n\tcodon:" + trData.codon //
					+ "\n\taa:" + trData.aa//
			);

			countAaSequence(category, contrVoc, description, subSeq);
		}
	}

	/**
	 * Parse "" XML mark
	 */
	void parseAnnotations(Node node, String geneId) {
		// Find all  XML marks
		List annListNodes = getAnnotationCategories(node);

		// For each  set of nodes
		for (Node annListNode : annListNodes) {
			// Get annotationList category
			String category = getAttribute(annListNode, ATTR_NAME_CATAGORY);

			if (!categoryBlackList.contains(category)) {
				List annNodes = findNodes(annListNode, NODE_NAME_ANNOTATION, null, null, null);

				// Analyze the ones not in the blacklist
				for (Node ann : annNodes)
					parseAnnotation(ann, geneId, category);
			}
		}
	}

	/**
	 * Parse a protein node
	 */
	void parseProteinNode(Node node) {
		String uniqueName = getAttribute(node, ATTR_NAME_UNIQUE_NAME);
		if (debug) Timer.showStdErr("Parsing protein node: " + uniqueName);

		// Find Ensembl gene ID
		String geneId = getGeneId(node, uniqueName);
		if (geneId != null) {
			if (debug) Timer.showStdErr("\tFound matching gene ID: " + geneId);
			// Get transcript IDs
			if (findTrIds(node)) {
				findSequences(node); // Find sequences
				parseAnnotations(node, geneId); // Parse annotation list (XML  mark)
			}
		}
	}

	/**
	 * Read transcript file
	 */
	void readTrIdMap() {
		trIdMap = new HashMap<>();
		if (trIdFile == null) return;
		if (verbose) Timer.showStdErr("Reading transcripts file '" + trIdFile + "'");

		String lines[] = Gpr.readFile(trIdFile).split("\n");
		for (String line : lines) {
			String ids[] = line.split("\t");
			if (ids.length > 1) {
				String ensemblId = ids[0].trim();
				String refSeqId = ids[1].trim();
				trIdMap.put(refSeqId, ensemblId);
			}
		}
	}

	public void setDebug(boolean debug) {
		this.debug = debug;
	}

	public void setTrIdFile(String trIdFile) {
		this.trIdFile = trIdFile;
	}

	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}

	/**
	 * Show a node as a string
	 */
	String toString(Node node) {
		StringBuilder sb = new StringBuilder();

		//---
		// Get name & value
		//---
		String name = node.getNodeName();
		String value = node.getNodeValue();
		if (value != null) value = value.replace('\n', ' ').trim();
		sb.append(name);

		//---
		// Get attributes
		//---
		NamedNodeMap map = node.getAttributes();
		if (map != null) {
			sb.append("( ");
			for (int i = 0; i < map.getLength(); i++) {
				Node attr = map.item(i);
				String aname = attr.getNodeName();
				String aval = attr.getNodeValue();

				if (i > 0) sb.append(", ");
				sb.append(aname + "='" + aval + "'");
			}
			sb.append(" )");
		}

		if (value != null) sb.append(" = '" + value + "'\n");

		return sb.toString();
	}

	/**
	 * Gather data from transcript
	 */
	TranscriptData transcriptData(String isoformRef, int aaStart, int aaEnd, String sequence, String subSeq) {
		String trId = trIdByUniqueName.get(isoformRef);
		TranscriptData trData = new TranscriptData();
		if (trId != null) {
			Transcript tr = trById.get(trId);
			if (tr != null) {
				trData.tr = tr;
				String protein = tr.protein();

				// Remove trailing stop codon ('*')
				if (!protein.isEmpty() && (protein.charAt(protein.length() - 1) == '*')) protein = protein.substring(0, protein.length() - 1);

				// Sanity check: Do protein sequences match?
				if (protein.equals(sequence)) {
					proteinOk.add(trId);

					if ((aaStart >= 0) && (aaEnd >= aaStart)) {
						// Try to map to chromosome position
						int cdsBase2Pos[] = tr.baseNumberCds2Pos();
						int codonStart = aaStart * 3;
						int codonEnd = (aaEnd + 1) * 3 - 1;

						if (codonStart < cdsBase2Pos.length && codonEnd < cdsBase2Pos.length) {
							if (tr.isStrandPlus()) {
								trData.chrPosStart = cdsBase2Pos[codonStart];
								trData.chrPosEnd = cdsBase2Pos[codonEnd];
							} else {
								trData.chrPosStart = cdsBase2Pos[codonEnd];
								trData.chrPosEnd = cdsBase2Pos[codonStart];
							}

							trData.chrName = tr.getChromosomeName();

							// More sanity checks
							trData.codon = tr.cds().substring(codonStart, codonEnd + 1);
							trData.aa = CodonTables.getInstance().aa(trData.codon, genome, trData.chrName);
							if (!subSeq.equals(trData.aa) && verbose) {
								Timer.showStdErr("WARNING: AA differ: " //
										+ "\tUniqueName : " + isoformRef //
										+ "\tEnsembl ID : " + trId //
										+ "\tEnsembl  AA: '" + trData.aa + "'"//
										+ "\tNextProt AA: '" + subSeq + "'"//
										+ "\n");
							} else trData.ok = true; // All sanity checks passed
						}
					}
				} else {
					if (!proteinDifferences.contains(trId) && verbose) Timer.showStdErr("WARNING: Protein sequences differ: " //
							+ "\tUniqueName" + isoformRef //
							+ "\tEnsembl ID: " + trId //
							+ "\n\tEnsembl  (" + protein.length() + "): " + protein //
							+ "\n\tNextProt (" + sequence.length() + "): " + sequence //
							+ "\n");
					proteinDifferences.add(trId);
				}
			}
		}

		return trData;
	}

	String vcfSafe(String str) {
		return str.trim().replaceAll("(,|;|=| |\t)+", "_");
	}
}

class TranscriptData {
	public boolean ok = false;
	public Transcript tr = null;
	public int chrPosStart = -1, chrPosEnd = -1;
	public String chrName = "", codon = "", aa = "";

	@Override
	public String toString() {
		return chrName + ":" + chrPosStart + "-" + chrPosEnd //
				+ ", codon: '" + codon + "'" //
				+ ", aa: '" + aa + "'" //
		;
	}
}