All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.biojava.nbio.alignment.io.StockholmStructure Maven / Gradle / Ivy

The newest version!
/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
 * should be distributed with the code. If you do not have a copy, see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 * Created on August 13, 2010 Author: Mark Chapman
 */

package org.biojava.nbio.alignment.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.template.AbstractCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Stores all the content of a Stockholm file. N.B.: This structure will undergo several enhancements later on.
 * Don't depend on it in a final code, otherwise it will be hard to maintain.
 *
 * In general, Stockholm File contains the alignment mark-up lines.
*
* * * * * * * * * * * * * * *
Header Section
Reference Section
Comment Section
Alignment Section
* * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".
* Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.
* * * * * * * * * * * * * * * * * *
section fieldpreferred location
#=GF <feature> <Generic per-File annotation, free text>Above the alignment
#=GC <feature> <Generic per-Column annotation, exactly 1 char per column>Below the alignment
#=GS <seqname> <feature> <Generic per-Sequence annotation, free text>Above the alignment or just below the corresponding sequence
#=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>Just below the corresponding sequence
* * @since 3.0.5 * @author Amr ALHOSSARY * @author Marko Vaz * */ public class StockholmStructure { private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class); public static final String PFAM = "PFAM"; public static final String RFAM = "RFAM"; private final StockholmFileAnnotation fileAnnotation; private final StockholmConsensusAnnotation consAnnotation; private final Map sequences; private final Map seqsAnnotation; private final Map resAnnotation; public StockholmStructure() { fileAnnotation = new StockholmFileAnnotation(); consAnnotation = new StockholmConsensusAnnotation(); sequences = new HashMap<>(); seqsAnnotation = new HashMap<>(); resAnnotation = new HashMap<>(); } public StockholmFileAnnotation getFileAnnotation() { return fileAnnotation; } public StockholmConsensusAnnotation getConsAnnotation() { return consAnnotation; } /** * @param seqName * @param seqText */ public void appendToSequence(String seqName, String seqText) { StringBuffer seq = sequences.get(seqName); if (seq != null) { // add sequence without space seq.append(seqText); } else { seq = new StringBuffer(seqText); sequences.put(seqName, seq); } } public Map getSequences() { return sequences; } private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) { if (!seqsAnnotation.containsKey(seqName)) { seqsAnnotation.put(seqName, new StockholmSequenceAnnotation()); } return seqsAnnotation.get(seqName); } /** * @param seqName * @param text */ public void addGSAccessionNumber(String seqName, String text) { getSequenceAnnotation(seqName).setAccessionNumber(text); } public void addGSDescription(String seqName, String text) { getSequenceAnnotation(seqName).addToDescription(text); } /** * @param seqName * @param text */ public void addGSdbReference(String seqName, String text) { getSequenceAnnotation(seqName).addDBReference(text); } public void addGSOrganismSpecies(String seqName, String text) { getSequenceAnnotation(seqName).setOrganism(text); } public void 
addGSOrganismClassification(String seqName, String text) { getSequenceAnnotation(seqName).setOrganismClassification(text); } public void addGSLook(String seqName, String text) { getSequenceAnnotation(seqName).setLook(text); } private StockholmResidueAnnotation getResidueAnnotation(String seqName) { if (!resAnnotation.containsKey(seqName)) { resAnnotation.put(seqName, new StockholmResidueAnnotation()); } return resAnnotation.get(seqName); } public void addSurfaceAccessibility(String seqName, String text) { getResidueAnnotation(seqName).setSurfaceAccessibility(text); } public void addTransMembrane(String seqName, String text) { getResidueAnnotation(seqName).setTransMembrane(text); } public void addPosteriorProbability(String seqName, String text) { getResidueAnnotation(seqName).setPosteriorProbability(text); } public void addLigandBinding(String seqName, String text) { getResidueAnnotation(seqName).setLigandBinding(text); } public void addActiveSite(String seqName, String text) { getResidueAnnotation(seqName).setActiveSite(text); } public void addASPFamPredicted(String seqName, String text) { getResidueAnnotation(seqName).setAsPFamPredicted(text); } public void addASSwissProt(String seqName, String text) { getResidueAnnotation(seqName).setAsSwissProt(text); } public void addIntron(String seqName, String text) { getResidueAnnotation(seqName).setIntron(text); } public void addSecondaryStructure(String seqName, String text) { getResidueAnnotation(seqName).setSecondaryStructure(text); } /** * used to retrieve sequences from the structure * * @return Biosequences (case sensitive) * @see #getBioSequences(boolean) * @see #getBioSequences(boolean, String) */ public List> getBioSequences() { return getBioSequences(false); } /** * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM). 
* * @param ignoreCase * if true, the function will deal with small letters as if they are capital ones * @param forcedSequenceType * either null, {@link #PFAM}, or {@link #RFAM}. * @return Biosequences according to the criteria specified * @see #getBioSequences() * @see #getBioSequences(boolean) */ public List> getBioSequences(boolean ignoreCase, String forcedSequenceType) { if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) { throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType); } List> seqs = new ArrayList<>(); for (String sequencename : sequences.keySet()) { AbstractSequence seq = null; String sequence = sequences.get(sequencename).toString(); if (ignoreCase) { sequence = sequence.toUpperCase(); } try { if (forcedSequenceType == null) seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence); else if (forcedSequenceType.equals(PFAM)) seq = new ProteinSequence(sequence); else seq = new RNASequence(sequence); } catch (CompoundNotFoundException e) { logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename); continue; } String[] seqDetails = splitSeqName(sequencename); seq.setDescription(seqDetails[0]); seq.setBioBegin((seqDetails[1] == null || "".equals(seqDetails[1].trim()) ? null : Integer.valueOf(seqDetails[1]))); seq.setBioEnd((seqDetails[2] == null || "".equals(seqDetails[2].trim()) ? null : Integer.valueOf(seqDetails[2]))); seqs.add(seq); } return seqs; } /** * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence * TKRA_BACSU/6-322), this function is used to ignore the small letters case. 
* * @param ignoreCase * @return * @see #getBioSequences() * @see #getBioSequences(boolean, String) */ public List> getBioSequences(boolean ignoreCase) { return getBioSequences(ignoreCase, null); } /** * Returns an array with the following sequence related content: name, start, end. * * @param sequenceName * the sequence from where to extract the content. It is supposed that it follows the following * convention name/start-end (e.g.: COATB_BPIKE/30-81) * @return array with the following sequence related content: name, start, end. */ private String[] splitSeqName(String sequenceName) { String[] result = new String[3]; String[] barSplit = sequenceName.split("/"); if (barSplit.length == 2) { result[0] = barSplit[0]; String[] positions = barSplit[1].split("-"); if (positions.length == 2) { result[1] = positions[0]; result[2] = positions[1]; } } else { result[0] = sequenceName; result[1] = null; result[2] = null; } return result; } @Override public String toString() { StringBuffer result = new StringBuffer(); List> bioSeqs = getBioSequences(false); int sequenceLength = -1; for (AbstractSequence sequence : bioSeqs) { String sequenceAsString = sequence.getSequenceAsString(); sequenceLength = sequenceAsString.length(); if (sequenceLength > 50) { result.append(sequenceAsString.substring(0, 40)); result.append("..."); result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength)); } else { result.append(sequenceAsString); } result.append(" " + sequence.getDescription() + "\n"); } result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns"); return result.toString(); } public static class DatabaseReference { public static final String EXPERT = "EXPERT"; public static final String MIM = "MIM"; public static final String PFAMB = "PFAMB"; public static final String PRINTS = "PRINTS"; public static final String PROSITE = "PROSITE"; public static final String PROSITE_PROFILE = "PROSITE_PROFILE"; public static final String SCOP = 
"SCOP"; public static final String PDB = "PDB"; public static final String SMART = "SMART"; public static final String URL = "URL"; public static final String LOAD = "LOAD"; public static final String HOMSTRAD = "HOMSTRAD"; public static final String INTERPRO = "INTERPRO"; private final String database; /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */ private final String reference; public DatabaseReference(String database, String reference) { this.database = database; this.reference = reference; } public DatabaseReference(String representativeAnnotationString) { int semiColonIndex = representativeAnnotationString.indexOf(';'); this.database = representativeAnnotationString.substring(0, semiColonIndex); this.reference = representativeAnnotationString.substring(semiColonIndex + 1, representativeAnnotationString.lastIndexOf(';')).trim(); } @Override public String toString() { return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';') .toString(); } public String getDatabase() { return database; } public String getReference() { return reference; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy