org.biojava.nbio.alignment.io.StockholmStructure Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of biojava-alignment Show documentation
The newest version!
/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
 * should be distributed with the code. If you do not have a copy, see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 * Created on August 13, 2010 Author: Mark Chapman
 */

package org.biojava.nbio.alignment.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.template.AbstractCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Stores all the content of a Stockholm file. <b>N.B.: This structure will undergo several enhancements later on.
 * Do not depend on it in final code, otherwise it will be hard to maintain.</b>
 * <p>
 * In general, a Stockholm file contains the alignment and its mark-up lines, laid out in these sections:
 *
 * <table border="1">
 * <caption>Sections of a Stockholm file</caption>
 * <tr>
 * <td><b>Header Section</b></td>
 * </tr>
 * <tr>
 * <td><b>Reference Section</b></td>
 * </tr>
 * <tr>
 * <td><b>Comment Section</b></td>
 * </tr>
 * <tr>
 * <td><b>Alignment Section</b></td>
 * </tr>
 * </table>
 * <p>
 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br>
 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.
 *
 * <table border="1">
 * <caption>Mark-up line types</caption>
 * <tr>
 * <th>section field</th>
 * <th>preferred location</th>
 * </tr>
 * <tr>
 * <td>#=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt;</td>
 * <td>Above the alignment</td>
 * </tr>
 * <tr>
 * <td>#=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt;</td>
 * <td>Below the alignment</td>
 * </tr>
 * <tr>
 * <td>#=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt;</td>
 * <td>Above the alignment or just below the corresponding sequence</td>
 * </tr>
 * <tr>
 * <td>#=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt;</td>
 * <td>Just below the corresponding sequence</td>
 * </tr>
 * </table>
 *
 * @since 3.0.5
 * @author Amr ALHOSSARY
 * @author Marko Vaz
 *
 */
public class StockholmStructure {

	private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class);

	public static final String PFAM = "PFAM";
	public static final String RFAM = "RFAM";
	private final StockholmFileAnnotation fileAnnotation;
	private final StockholmConsensusAnnotation consAnnotation;
	private final Map sequences;
	private final Map seqsAnnotation;
	private final Map resAnnotation;

	public StockholmStructure() {
		fileAnnotation = new StockholmFileAnnotation();
		consAnnotation = new StockholmConsensusAnnotation();
		sequences = new HashMap<>();
		seqsAnnotation = new HashMap<>();
		resAnnotation = new HashMap<>();
	}

	public StockholmFileAnnotation getFileAnnotation() {
		return fileAnnotation;
	}

	public StockholmConsensusAnnotation getConsAnnotation() {
		return consAnnotation;
	}

	/**
	 * @param seqName
	 * @param seqText
	 */
	public void appendToSequence(String seqName, String seqText) {
		StringBuffer seq = sequences.get(seqName);
		if (seq != null) {
			// add sequence without space
			seq.append(seqText);
		} else {
			seq = new StringBuffer(seqText);
			sequences.put(seqName, seq);
		}
	}

	public Map getSequences() {
		return sequences;
	}

	private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) {
		if (!seqsAnnotation.containsKey(seqName)) {
			seqsAnnotation.put(seqName, new StockholmSequenceAnnotation());
		}
		return seqsAnnotation.get(seqName);
	}

	/**
	 * @param seqName
	 * @param text
	 */
	public void addGSAccessionNumber(String seqName, String text) {
		getSequenceAnnotation(seqName).setAccessionNumber(text);
	}

	public void addGSDescription(String seqName, String text) {
		getSequenceAnnotation(seqName).addToDescription(text);
	}

	/**
	 * @param seqName
	 * @param text
	 */
	public void addGSdbReference(String seqName, String text) {
		getSequenceAnnotation(seqName).addDBReference(text);
	}

	public void addGSOrganismSpecies(String seqName, String text) {
		getSequenceAnnotation(seqName).setOrganism(text);
	}

	public void addGSOrganismClassification(String seqName, String text) {
		getSequenceAnnotation(seqName).setOrganismClassification(text);
	}

	public void addGSLook(String seqName, String text) {
		getSequenceAnnotation(seqName).setLook(text);
	}

	private StockholmResidueAnnotation getResidueAnnotation(String seqName) {
		if (!resAnnotation.containsKey(seqName)) {
			resAnnotation.put(seqName, new StockholmResidueAnnotation());
		}
		return resAnnotation.get(seqName);
	}

	public void addSurfaceAccessibility(String seqName, String text) {
		getResidueAnnotation(seqName).setSurfaceAccessibility(text);
	}

	public void addTransMembrane(String seqName, String text) {
		getResidueAnnotation(seqName).setTransMembrane(text);
	}

	public void addPosteriorProbability(String seqName, String text) {
		getResidueAnnotation(seqName).setPosteriorProbability(text);
	}

	public void addLigandBinding(String seqName, String text) {
		getResidueAnnotation(seqName).setLigandBinding(text);
	}

	public void addActiveSite(String seqName, String text) {
		getResidueAnnotation(seqName).setActiveSite(text);
	}

	public void addASPFamPredicted(String seqName, String text) {
		getResidueAnnotation(seqName).setAsPFamPredicted(text);
	}

	public void addASSwissProt(String seqName, String text) {
		getResidueAnnotation(seqName).setAsSwissProt(text);
	}

	public void addIntron(String seqName, String text) {
		getResidueAnnotation(seqName).setIntron(text);
	}

	public void addSecondaryStructure(String seqName, String text) {
		getResidueAnnotation(seqName).setSecondaryStructure(text);
	}

	/**
	 * used to retrieve sequences from the structure
	 *
	 * @return Biosequences (case sensitive)
	 * @see #getBioSequences(boolean)
	 * @see #getBioSequences(boolean, String)
	 */
	public List> getBioSequences() {
		return getBioSequences(false);
	}

	/**
	 * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM).
	 *
	 * @param ignoreCase
	 *            if true, the function will deal with small letters as if they are capital ones
	 * @param forcedSequenceType
	 *            either null, {@link #PFAM}, or {@link #RFAM}.
	 * @return Biosequences according to the criteria specified
	 * @see #getBioSequences()
	 * @see #getBioSequences(boolean)
	 */
	public List> getBioSequences(boolean ignoreCase,
			String forcedSequenceType) {
		if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) {
			throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType);
		}
		List> seqs = new ArrayList<>();
		for (String sequencename : sequences.keySet()) {
			AbstractSequence seq = null;
			String sequence = sequences.get(sequencename).toString();
			if (ignoreCase) {
				sequence = sequence.toUpperCase();
			}

			try {
			if (forcedSequenceType == null)
				seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence);
			else if (forcedSequenceType.equals(PFAM))
				seq = new ProteinSequence(sequence);
			else
				seq = new RNASequence(sequence);
			} catch (CompoundNotFoundException e) {
				logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename);
				continue;
			}
			String[] seqDetails = splitSeqName(sequencename);
			seq.setDescription(seqDetails[0]);
			seq.setBioBegin((seqDetails[1] == null || "".equals(seqDetails[1].trim()) ? null : Integer.valueOf(seqDetails[1])));
			seq.setBioEnd((seqDetails[2] == null || "".equals(seqDetails[2].trim()) ? null : Integer.valueOf(seqDetails[2])));

			seqs.add(seq);
		}
		return seqs;
	}

	/**
	 * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence
	 * TKRA_BACSU/6-322), this function is used to ignore the small letters case.
	 *
	 * @param ignoreCase
	 * @return
	 * @see #getBioSequences()
	 * @see #getBioSequences(boolean, String)
	 */
	public List> getBioSequences(boolean ignoreCase) {
		return getBioSequences(ignoreCase, null);
	}

	/**
	 * Returns an array with the following sequence related content: name, start, end.
	 *
	 * @param sequenceName
	 *            the sequence from where to extract the content. It is supposed that it follows the following
	 *            convention name/start-end (e.g.: COATB_BPIKE/30-81)
	 * @return array with the following sequence related content: name, start, end.
	 */
	private String[] splitSeqName(String sequenceName) {
		String[] result = new String[3];

		String[] barSplit = sequenceName.split("/");
		if (barSplit.length == 2) {
			result[0] = barSplit[0];
			String[] positions = barSplit[1].split("-");
			if (positions.length == 2) {
				result[1] = positions[0];
				result[2] = positions[1];
			}
		} else {
			result[0] = sequenceName;
			result[1] = null;
			result[2] = null;
		}

		return result;
	}

	@Override
	public String toString() {
		StringBuffer result = new StringBuffer();
		List> bioSeqs = getBioSequences(false);
		int sequenceLength = -1;
		for (AbstractSequence sequence : bioSeqs) {
			String sequenceAsString = sequence.getSequenceAsString();
			sequenceLength = sequenceAsString.length();
			if (sequenceLength > 50) {
				result.append(sequenceAsString.substring(0, 40));
				result.append("...");
				result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength));
			} else {
				result.append(sequenceAsString);
			}
			result.append(" " + sequence.getDescription() + "\n");
		}
		result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns");

		return result.toString();
	}

	public static class DatabaseReference {
		public static final String EXPERT = "EXPERT";
		public static final String MIM = "MIM";
		public static final String PFAMB = "PFAMB";
		public static final String PRINTS = "PRINTS";
		public static final String PROSITE = "PROSITE";
		public static final String PROSITE_PROFILE = "PROSITE_PROFILE";
		public static final String SCOP = "SCOP";
		public static final String PDB = "PDB";
		public static final String SMART = "SMART";
		public static final String URL = "URL";
		public static final String LOAD = "LOAD";
		public static final String HOMSTRAD = "HOMSTRAD";
		public static final String INTERPRO = "INTERPRO";

		private final String database;
		/** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */
		private final String reference;

		public DatabaseReference(String database, String reference) {
			this.database = database;
			this.reference = reference;
		}

		public DatabaseReference(String representativeAnnotationString) {
			int semiColonIndex = representativeAnnotationString.indexOf(';');
			this.database = representativeAnnotationString.substring(0, semiColonIndex);
			this.reference = representativeAnnotationString.substring(semiColonIndex + 1,
					representativeAnnotationString.lastIndexOf(';')).trim();
		}

		@Override
		public String toString() {
			return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';')
					.toString();
		}

		public String getDatabase() {
			return database;
		}

		public String getReference() {
			return reference;
		}
	}
}
section field	preferred location
#=GF <feature> <Generic per-File annotation, free text>	Above the alignment
#=GC <feature> <Generic per-Column annotation, exactly 1 char per column>	Below the alignment
#=GS <seqname> <feature> <Generic per-Sequence annotation, free text>	Above the alignment or just below the corresponding sequence
#=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>	Just below the corresponding sequence