// org.biojava.nbio.alignment.io.StockholmFileParser (Maven / Gradle / Ivy)
// The newest version!
/*
* BioJava development code
*
* This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
* should be distributed with the code. If you do not have a copy, see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
* Created on August 13, 2010 Author: Mark Chapman
*/
package org.biojava.nbio.alignment.io;
import org.biojava.nbio.alignment.io.StockholmFileAnnotation.StockholmFileAnnotationReference;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
/**
* Stockholm file parser.
* For more information about the format refer to:
*
* - ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/userman.txt
* - ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/USERMAN
* - http://sonnhammer.sbc.su.se/Stockholm.html
*
*
*
* Pfam DESCRIPTION OF FIELDS
*
* Compulsory fields:
* ------------------
*
* AC Accession number: Accession number in form PFxxxxx.version or PBxxxxxx.
* ID Identification: One word name for family.
* DE Definition: Short description of family.
* AU Author: Authors of the entry.
* SE Source of seed: The source suggesting the seed members belong to one family.
* GA Gathering method: Search threshold to build the full alignment.
* TC Trusted Cutoff: Lowest sequence score and domain score of match in the full alignment.
* NC Noise Cutoff: Highest sequence score and domain score of match not in full alignment.
* TP Type: Type of family -- presently Family, Domain, Motif or Repeat.
* SQ Sequence: Number of sequences in alignment.
* // End of alignment.
*
* Optional fields:
* ----------------
*
* DC Database Comment: Comment about database reference.
* DR Database Reference: Reference to external database.
* RC Reference Comment: Comment about literature reference.
* RN Reference Number: Reference Number.
* RM Reference Medline: Eight digit medline UI number.
* RT Reference Title: Reference Title.
* RA Reference Author: Reference Author
* RL Reference Location: Journal location.
* PI Previous identifier: Record of all previous ID lines.
* KW Keywords: Keywords.
* CC Comment: Comments.
* NE Pfam accession: Indicates a nested domain.
* NL Location: Location of nested domains - sequence ID, start and end of insert.
* WK Wikipedia Reference: Reference to wikipedia.
*
* Obsolete fields:
* -----------
* AL Alignment method of seed: The method used to align the seed members.
* AM Alignment Method: The order ls and fs hits are aligned to the model to build the full align.
*
*
*
* @since 3.0.5
* @author Amr ALHOSSARY
* @author Marko Vaz
*
*/
public class StockholmFileParser {
/** Class logger; the parsing code uses it to report unrecognised feature names and read errors. */
private final static Logger logger = LoggerFactory.getLogger(StockholmFileParser.class);
/**
 * Sentinel accepted as the {@code max} argument of the multi-structure parse methods, meaning "read as many
 * structures as possible, without limits".
 */
public static final int INFINITY = -1;
// Two-letter markup type codes. The parser matches them at offset 2 of a line, i.e. right after the "#=" prefix.
/** Per-file markup: {@code #=GF <feature> <Generic per-File annotation, free text>} */
private static final String GENERIC_PER_FILE_ANNOTATION = "GF";
/** Per-column markup: {@code #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>} */
private static final String GENERIC_PER_CONSENSUS_ANNOTATION = "GC";
/** Per-sequence markup: {@code #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>} */
private static final String GENERIC_PER_SEQUENCE_ANNOTATION = "GS";
/** Per-residue markup: {@code #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>} */
private static final String GENERIC_PER_RESIDUE_ANNOTATION = "GR";
// Feature codes that may follow "#=GF" (per-file annotation); dispatched on in handleFileAnnotation.
// COMPULSORY FIELDS
/** Accession number in form PFxxxxx (Pfam) or RFxxxxx (Rfam). */
private static final String GF_ACCESSION_NUMBER = "AC";
/** One word name for family. */
private static final String GF_IDENTIFICATION = "ID";
/** Short description of family. */
private static final String GF_DEFINITION = "DE";
/** Authors of the entry. */
private static final String GF_AUTHOR = "AU";
/**
* Indicates the order that ls and fs matches are aligned to the model to give the full alignment. (OBSOLETE IN
* HMMER3)
*/
private static final String GF_ALIGNMENT_METHOD = "AM";
/** Command line used to generate the model. */
private static final String GF_BUILD_METHOD = "BM";
/** Command line used to perform the search. */
private static final String GF_SEARCH_METHOD = "SM";
/** The source suggesting the seed members belong to one family. */
private static final String GF_SOURCE_SEED = "SE";
/** The source (prediction or publication) of the consensus RNA secondary structure used by Rfam. */
private static final String GF_SOURCE_STRUCTURE = "SS";
/** Search threshold to build the full alignment. */
private static final String GF_GATHERING_THRESHOLD = "GA";
/** Lowest sequence score (and domain score for Pfam) of match in the full alignment. */
private static final String GF_TRUSTED_CUTOFF = "TC";
/** Highest sequence score (and domain score for Pfam) of match not in full alignment. */
private static final String GF_NOISE_CUTOFF = "NC";
/**
* Type of family -- presently Family, Domain, Motif or Repeat for Pfam; a tree with roots Gene, Intron or
* Cis-reg for Rfam.
*/
private static final String GF_TYPE_FIELD = "TP";
/** Number of sequences in alignment, and start of MSA. */
private static final String GF_SEQUENCE = "SQ";
// OPTIONAL FIELDS
/** Comment about database reference. */
private static final String GF_DB_COMMENT = "DC";
/** Reference to external database. */
private static final String GF_DB_REFERENCE = "DR";
/** Comment about literature reference. */
private static final String GF_REFERENCE_COMMENT = "RC";
/** Reference Number. */
private static final String GF_REFERENCE_NUMBER = "RN";
/** Eight digit medline UI number. */
private static final String GF_REFERENCE_MEDLINE = "RM";
/** Reference Title. */
private static final String GF_REFERENCE_TITLE = "RT";
/** Reference Author. */
private static final String GF_REFERENCE_AUTHOR = "RA";
/** Journal Location. NOTE(review): the identifier misspells "LOCATION"; it is referenced by that name elsewhere. */
private static final String GF_REFERENCE_LOCALTION = "RL";
/** Record of all previous ID lines. */
private static final String GF_PREVIOUS_IDS = "PI";
/** Keywords. */
private static final String GF_KEYWORDS = "KW";
/** Comments. */
private static final String GF_COMMENT = "CC";
/** Pfam accession; indicates a nested domain. */
private static final String GF_PFAM_ACCESSION = "NE";
/** Location of nested domains - sequence ID, start and end of insert. */
private static final String GF_LOCATION = "NL";
/** Wikipedia page. */
private static final String GF_WIKIPEDIA_LINK = "WK";
/** Clan accession. */
private static final String GF_CLAN = "CL";
/** Used for listing Clan membership. */
private static final String GF_MEMBERSHIP = "MB";
// FOR EMBEDDING TREES
/** A tree in New Hampshire eXtended format. */
private static final String GF_NEW_HAMPSHIRE = "NH";
/** A unique identifier for the next tree. */
private static final String GF_TREE_ID = "TN";
// OTHER
/**
* A method used to set the bit score threshold based on the ratio of expected false positives to true positives.
* Floating point number between 0 and 1.
*/
private static final String GF_FALSE_DISCOVERY_RATE = "FR";
// #=GS -- feature codes for per-sequence annotation; dispatched on in handleSequenceAnnotation.
/** Accession number of the sequence. */
private static final String GS_ACCESSION_NUMBER = "AC";
/** Description of the sequence. */
private static final String GS_DESCRIPTION = "DE";
/** Database reference for the sequence. */
private static final String GS_DATABASE_REFERENCE = "DR";
/** Organism (species). */
private static final String GS_ORGANISM_SPECIES = "OS";
/** Organism classification. */
private static final String GS_ORGANISM_CLASSIFICATION = "OC";
/** "Look" annotation. */
private static final String GS_LOOK = "LO";
// #=GR -- feature codes for per-residue annotation; dispatched on in handleResidueAnnotation.
// The bracketed text on each constant lists the characters allowed in the annotation string.
/**
* Secondary structure.
* For RNA [.,;<>(){}[]AaBb...],
* For protein [HGIEBTSCX]
*/
private static final String GR_SECONDARY_STRUCTURE = "SS";
/**
* Surface accessibility.
* [0-9X]
* (0=0%-10%; ...; 9=90%-100%)
*/
private static final String GR_SURFACE_ACCESSIBILITY = "SA";
/** Trans-membrane. [Mio] */
private static final String GR_TRANS_MEMBRANE = "TM";
/**
* Posterior probability.
* [0-9*]
* (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)
*/
private static final String GR_POSTERIOR_PROBABILITY = "PP";
/** Ligand binding. [*] */
private static final String GR_LIGAND_BINDING = "LI";
/** Active site. [*] */
private static final String GR_ACTIVE_SITE = "AS";
/** Active site, Pfam predicted. [*] */
private static final String GR_AS_PFAM_PREDICTED = "pAS";
/** Active site, from SwissProt. [*] */
private static final String GR_AS_SWISSPROT = "sAS";
/** Intron. [0-2] */
private static final String GR_INTRON = "IN";
// #=GC -- feature codes for per-column (consensus) annotation; dispatched on in handleConsensusAnnotation.
// Most of them are the corresponding #=GR code with a "_cons" suffix.
/** Sequence consensus. NOTE(review): the identifier misspells "SEQUENCE"; it is referenced by that name elsewhere. */
private static final String GC_SEQUENSE_CONSENSUS = "seq_cons";
/** Consensus secondary structure. */
private static final String GC_SECONDARY_STRUCTURE = "SS_cons";
/** Consensus surface accessibility. */
private static final String GC_SURFACE_ACCESSIBILITY = "SA_cons";
/** Consensus trans-membrane annotation. */
private static final String GC_TRANS_MEMBRANE = "TM_cons";
/** Consensus posterior probability. */
private static final String GC_POSTERIOR_PROBABILITY = "PP_cons";
/** Consensus ligand binding annotation. */
private static final String GC_LIGAND_BINDING = "LI_cons";
/** Consensus active site annotation. */
private static final String GC_ACTIVE_SITE = "AS_cons";
/** Consensus Pfam-predicted active site annotation. */
private static final String GC_AS_PFAM_PREDICTED = "pAS_cons";
/** Consensus SwissProt active site annotation. */
private static final String GC_AS_SWISSPROT = "sAS_cons";
/** Consensus intron annotation. */
private static final String GC_INTRON = "IN_cons";
/**
* Reference annotation. Often the consensus RNA or protein sequence is used as a reference. Any non-gap character
* (e.g. x's) can indicate consensus/conserved/match columns; .'s or -'s indicate insert columns; ~'s indicate
* unaligned insertions. Upper and lower case can be used to discriminate strongly and weakly conserved residues
* respectively.
*/
private static final String GC_REFERENCE_ANNOTATION = "RF";
/**
* Indicates which columns in an alignment should be masked, such that the emission probabilities for match states
* corresponding to those columns will be the background distribution.
*/
private static final String GC_MODEL_MASK = "MM";
/**
 * The structure currently being filled in. It is created when a "# STOCKHOLM" header line is read and reset to
 * {@code null} once {@link #parse(Scanner)} hands the finished structure back.
 */
private StockholmStructure stockholmStructure;
// Remnants of a former line-status state machine, kept commented out by the original authors:
// private boolean endFile = false;
// private static final int STATUS_OUTSIDE_FILE = 0;
// private static final int STATUS_INSIDE_FILE = 10;
// private static final int STATUS_IN_SEQUENCE = 20;
//
// private int status=STATUS_OUTSIDE_FILE;
/**
 * Scanner over {@link #cashedInputStream}; created lazily by {@link #parse(InputStream, int)} and reused across
 * calls so that successive structures can be read from the same stream.
 */
Scanner internalScanner = null;
/**
 * The input stream most recently passed to {@link #parse(InputStream, int)} ("cashed" is a misspelling of
 * "cached"). A different stream on the next call causes {@link #internalScanner} to be recreated.
 */
private InputStream cashedInputStream;
/**
 * Parses a Stockholm file and returns a {@link StockholmStructure} object with the first alignment it contains.
 * <p>
 * This method is meant for one-shot access to a file: the underlying stream is always closed before the method
 * returns, whether parsing succeeds or fails. Any subsequent call to {@link #parseNext(int)} therefore operates
 * on a closed stream and will throw an exception or behave unpredictably.
 *
 * @param filename
 *            location of the file to read; it is opened through {@link InputStreamProvider}
 * @return stockholm file content
 * @throws IOException
 *             when an exception occurred while opening/reading/closing the file
 * @throws ParserException
 *             if unexpected format is encountered
 */
public StockholmStructure parse(String filename) throws IOException {
    // try-with-resources releases the stream even when parsing throws (the original leaked it in that case).
    try (InputStream inStream = new InputStreamProvider().getInputStream(filename)) {
        return parse(inStream);
    }
}
/**
 * Parses a Stockholm file and returns up to {@code max} {@link StockholmStructure} objects read from it.
 * <p>
 * This method deliberately does not close the file after doing its job, to allow for further calls of
 * {@link #parseNext(int)}.
 *
 * @see #parseNext(int)
 *
 * @param filename
 *            file from where to read the content. See {@link InputStreamProvider} for more details.
 * @param max
 *            maximum number of structures to read, {@link #INFINITY} for all.
 * @return a list of {@link StockholmStructure} containing the parsed structures.
 * @throws IOException
 *             when an exception occurred while opening/reading the file.
 * @throws ParserException
 *             if unexpected format is encountered
 */
public List<StockholmStructure> parse(String filename, int max) throws IOException {
    InputStream inStream = new InputStreamProvider().getInputStream(filename);
    // The stream is intentionally left open so that parseNext(int) can continue reading from it.
    return parse(inStream, max);
}
/**
 * Parses an {@link InputStream} and returns the first contained alignment as a {@link StockholmStructure} object.
 * Used mainly for multiple files within the same input stream (e.g. when reading from Pfam flat files).
 * This method leaves the stream open for further calls of {@link #parseNext(int)}.
 *
 * @see #parseNext(int)
 * @param inStream
 *            the {@link InputStream} containing the file to read.
 * @return a {@link StockholmStructure} object representing the first structure in the stream.
 * @throws IOException
 *             in case an I/O exception occurred.
 * @throws ParserException
 *             if the stream contains no structure, or if unexpected format is encountered
 */
public StockholmStructure parse(InputStream inStream) throws IOException {
    List<StockholmStructure> structures = parse(inStream, 1);
    if (structures.isEmpty()) {
        // Previously surfaced as a bare IndexOutOfBoundsException from get(0).
        throw new ParserException("No Stockholm structure found in the input stream");
    }
    return structures.get(0);
}
/**
 * Parses an {@link InputStream} and returns at most {@code max} structures contained in it.
 * <p>
 * This method leaves the stream open for further calls of {@link #parse(InputStream, int)} (same function) or
 * {@link #parseNext(int)}. Passing the same stream again continues where the previous call stopped; passing a
 * different stream discards the internal scanner and starts reading the new stream.
 *
 * @see #parseNext(int)
 * @param inStream
 *            the stream to parse
 * @param max
 *            maximum number of structures to try to parse, {@link #INFINITY} to try to obtain as many as possible.
 * @return a {@link List} of {@link StockholmStructure} objects. If there are no more structures, an empty list is
 *         returned.
 * @throws IOException
 *             in case an I/O exception occurred.
 * @throws IllegalArgumentException
 *             if {@code max} is negative and not {@link #INFINITY}
 */
public List<StockholmStructure> parse(InputStream inStream, int max) throws IOException {
    if (max < INFINITY) {
        throw new IllegalArgumentException("max can't be -ve value " + max);
    }
    if (inStream != this.cashedInputStream) {
        // A new stream invalidates the scanner that was wrapping the previous one.
        this.cashedInputStream = inStream;
        this.internalScanner = null;
    }
    if (internalScanner == null) {
        internalScanner = new Scanner(inStream);
    }
    List<StockholmStructure> structures = new ArrayList<>();
    int remaining = max;
    // INFINITY must keep the loop running until the input is exhausted; the original condition
    // (max != INFINITY && ...) skipped the loop entirely in that case and always returned an empty list.
    while (remaining == INFINITY || remaining-- > 0) {
        StockholmStructure structure = parse(internalScanner);
        if (structure == null) {
            break; // no more structures in the stream
        }
        structures.add(structure);
    }
    return structures;
}
/**
 * Tries to parse and return at most {@code max} further structures from the last used file or input stream.
 * Call either {@link #parse(InputStream)}, {@link #parse(InputStream, int)}, or {@link #parse(String, int)}
 * before calling this function, so that there is a stream to continue reading from.
 *
 * @param max
 *            maximum number of structures to try to parse, or {@link #INFINITY}
 * @return the structures that could be read; empty if there are no more
 * @throws IOException
 *             in case an I/O exception occurred.
 * @throws IllegalStateException
 *             if no input has been opened by an earlier parse call
 */
public List<StockholmStructure> parseNext(int max) throws IOException {
    if (this.cashedInputStream == null && this.internalScanner == null) {
        // Without this guard the failure would be an uninformative NullPointerException from new Scanner(null).
        throw new IllegalStateException("parseNext called before any input was opened; call parse(...) first");
    }
    return parse(this.cashedInputStream, max);
}
/**
 * Parses the next Stockholm structure available from the given scanner and returns it as a
 * {@link StockholmStructure}. This method returns just after reaching the end-of-structure delimiter line
 * ("//"), leaving any remaining empty lines unconsumed.
 * <p>
 * Line handling:
 * <ul>
 * <li>blank lines are skipped;</li>
 * <li>a line starting with "# STOCKHOLM" opens a new structure;</li>
 * <li>lines starting with "#=GF", "#=GC", "#=GS" or "#=GR" are passed to the matching annotation handler; other
 * "#=G" lines are ignored;</li>
 * <li>any other line starting with "#" is treated as a comment and skipped;</li>
 * <li>everything else is treated as a sequence line.</li>
 * </ul>
 *
 * @param scanner
 *            from where to read the file content; when {@code null} the internal scanner is used
 * @return the parsed structure, or {@code null} if there are no more structures
 * @throws IOException
 *             if the source underlying the scanner reported a read error
 * @throws ParserException
 *             if a header, annotation or sequence line is malformed, or content appears before the header
 * @throws IllegalArgumentException
 *             if {@code scanner} is {@code null} and no internal scanner exists
 * @throws RuntimeException
 *             if the sequences of the parsed structure have different lengths
 */
StockholmStructure parse(Scanner scanner) throws IOException {
    if (scanner == null) {
        if (internalScanner == null) {
            throw new IllegalArgumentException("No Scanner defined");
        }
        scanner = internalScanner;
    }
    while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            continue; // ignore empty lines
        }
        if ("//".equals(trimmed)) {
            break; // end of the current structure
        }
        if (line.startsWith("# STOCKHOLM")) {
            startStructure(line);
        } else if (line.startsWith("#=G")) {
            parseMarkupLine(line);
        } else if (line.startsWith("#")) {
            // Plain comment line: it carries no alignment data and must not be mistaken for a sequence.
            continue;
        } else {
            // Most probably this line corresponds to a sequence. Something like:
            // O83071/192-246 MTCRAQLIAVPRASSLAEAIACAQKMRVSRVPVYERS
            requireStructure(line);
            handleSequenceLine(line);
        }
    }
    StockholmStructure structure = this.stockholmStructure;
    this.stockholmStructure = null;
    // Scanner swallows read errors and simply reports "no more lines"; surface them instead of silently
    // returning a truncated structure.
    IOException readFailure = scanner.ioException();
    if (readFailure != null) {
        throw new IOException("Error parsing Stockholm file", readFailure);
    }
    if (structure != null) {
        // All rows of an alignment must have the same number of columns.
        int length = -1;
        Map<String, StringBuffer> sequences = structure.getSequences();
        for (StringBuffer sequence : sequences.values()) {
            if (length == -1) {
                length = sequence.length();
            } else if (length != sequence.length()) {
                throw new RuntimeException("Sequences have different lengths");
            }
        }
    }
    return structure;
}

/** Opens a new structure from a "# STOCKHOLM &lt;version&gt;" header line, recording format and version. */
private void startStructure(String line) {
    String[] header = line.split("\\s+");
    if (header.length < 3) {
        throw new ParserException("Malformed Stockholm header line: [" + line + "]");
    }
    this.stockholmStructure = new StockholmStructure();
    this.stockholmStructure.getFileAnnotation().setFormat(header[1]);
    this.stockholmStructure.getFileAnnotation().setVersion(header[2]);
}

/** Splits a "#=G?" markup line into its fields and forwards them to the handler for its markup type. */
private void parseMarkupLine(String line) {
    final int firstFieldStart = 5; // length of the "#=Gx " prefix
    boolean perFile = line.startsWith(GENERIC_PER_FILE_ANNOTATION, 2);
    boolean perColumn = line.startsWith(GENERIC_PER_CONSENSUS_ANNOTATION, 2);
    boolean perSequence = line.startsWith(GENERIC_PER_SEQUENCE_ANNOTATION, 2);
    boolean perResidue = line.startsWith(GENERIC_PER_RESIDUE_ANNOTATION, 2);
    if (!(perFile || perColumn || perSequence || perResidue)) {
        return; // unrecognised markup type: ignored
    }
    requireStructure(line);
    if (line.length() <= firstFieldStart) {
        throw malformedMarkup(line);
    }
    if (perFile || perColumn) {
        // #=GF <feature> <value> / #=GC <feature> <value>; a missing value is read as the empty string.
        int featureEnd = tokenEnd(line, firstFieldStart);
        String featureName = line.substring(firstFieldStart, featureEnd);
        String value = line.substring(featureEnd).trim();
        if (perFile) {
            handleFileAnnotation(featureName, value);
        } else {
            handleConsensusAnnotation(featureName, value);
        }
        return;
    }
    // #=GS <seqname> <feature> <value> / #=GR <seqname> <feature> <value>
    int seqNameEnd = line.indexOf(' ', firstFieldStart);
    int featureStart = seqNameEnd < 0 ? line.length() : skipBlanks(line, seqNameEnd + 1);
    if (featureStart >= line.length()) {
        throw malformedMarkup(line); // no feature name after the sequence name
    }
    String seqName = line.substring(firstFieldStart, seqNameEnd);
    int featureEnd = tokenEnd(line, featureStart);
    String featureName = line.substring(featureStart, featureEnd);
    String value = line.substring(featureEnd).trim();
    if (perSequence) {
        handleSequenceAnnotation(seqName, featureName, value);
    } else {
        handleResidueAnnotation(seqName, featureName, value);
    }
}

/** Fails with a {@link ParserException} when content shows up before any "# STOCKHOLM" header was read. */
private void requireStructure(String line) {
    if (this.stockholmStructure == null) {
        throw new ParserException("Content found before the '# STOCKHOLM' header line: [" + line + "]");
    }
}

/** Builds the exception reported for a markup line that lacks a mandatory field. */
private static ParserException malformedMarkup(String line) {
    return new ParserException("Malformed Stockholm markup line: [" + line + "]");
}

/** Returns the index of the first space at or after {@code from}, or the line length when there is none. */
private static int tokenEnd(String line, int from) {
    int space = line.indexOf(' ', from);
    return space < 0 ? line.length() : space;
}

/** Returns the index of the first character greater than ' ' at or after {@code from}, or the line length. */
private static int skipBlanks(String line, int from) {
    int index = from;
    while (index < line.length() && line.charAt(index) <= ' ') {
        index++;
    }
    return index;
}
/**
 * Handles a line that corresponds to a sequence and appends its residues to the named sequence of the current
 * structure.
 * e.g.: COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA
 * N.B.: This function can't tolerate sequences with intrinsic white space.
 *
 * @param line
 *            the line to be parsed
 * @throws ParserException
 *             if the line does not consist of exactly a sequence name and a sequence
 */
private void handleSequenceLine(String line) {
    // Trim first: String.split keeps a leading empty token, so an indented but otherwise valid line would
    // otherwise be counted as three fields and rejected.
    String[] lineContent = line.trim().split("\\s+");
    if (lineContent.length != 2) {
        throw new ParserException("Could not split sequence line into sequence name and sequence:\n" + line);
    }
    stockholmStructure.appendToSequence(lineContent[0], lineContent[1]);
}
/**
 * Stores one per-file annotation, i.e. the content of a line of the form
 * {@code #=GF <feature> <Generic per-File annotation, free text>}, in the file annotation of the structure
 * being built. Unknown feature names are only logged.
 *
 * @param featureName
 *            the two-letter feature code
 * @param value
 *            the annotation text following the feature code
 */
private void handleFileAnnotation(String featureName, String value) {
    switch (featureName) {
    case GF_ACCESSION_NUMBER:
        stockholmStructure.getFileAnnotation().setGFAccessionNumber(value);
        break;
    case GF_IDENTIFICATION:
        stockholmStructure.getFileAnnotation().setGFIdentification(value);
        break;
    case GF_DB_REFERENCE:
        stockholmStructure.getFileAnnotation().addDBReference(value);
        break;
    case GF_DEFINITION:
        stockholmStructure.getFileAnnotation().setGFDefinition(value);
        break;
    case GF_AUTHOR:
        stockholmStructure.getFileAnnotation().setGFAuthors(value);
        break;
    case GF_ALIGNMENT_METHOD:
        stockholmStructure.getFileAnnotation().setAlignmentMethod(value);
        break;
    case GF_BUILD_METHOD:
        stockholmStructure.getFileAnnotation().addGFBuildMethod(value);
        break;
    case GF_SEARCH_METHOD:
        stockholmStructure.getFileAnnotation().setGFSearchMethod(value);
        break;
    case GF_SOURCE_SEED:
        stockholmStructure.getFileAnnotation().setGFSourceSeed(value);
        break;
    case GF_SOURCE_STRUCTURE:
        stockholmStructure.getFileAnnotation().setGFSourceStructure(value);
        break;
    case GF_GATHERING_THRESHOLD:
        stockholmStructure.getFileAnnotation().setGFGatheringThreshs(value);
        break;
    case GF_TRUSTED_CUTOFF:
        stockholmStructure.getFileAnnotation().setGFTrustedCutoffs(value);
        break;
    case GF_NOISE_CUTOFF:
        stockholmStructure.getFileAnnotation().setGFNoiseCutoffs(value);
        break;
    case GF_TYPE_FIELD:
        stockholmStructure.getFileAnnotation().setGFTypeField(value);
        break;
    case GF_PREVIOUS_IDS:
        stockholmStructure.getFileAnnotation().setGFPreviousIDs(value);
        break;
    case GF_SEQUENCE:
        stockholmStructure.getFileAnnotation().setGFNumSequences(value);
        break;
    case GF_DB_COMMENT:
        stockholmStructure.getFileAnnotation().setGFDBComment(value);
        break;
    case GF_REFERENCE_COMMENT:
        stockholmStructure.getFileAnnotation().setGFRefComment(value);
        break;
    case GF_REFERENCE_NUMBER: {
        // An RN line opens a new literature reference; the following RM/RT/RA/RL lines fill it in.
        StockholmFileAnnotationReference newReference = new StockholmFileAnnotationReference();
        stockholmStructure.getFileAnnotation().getReferences().add(newReference);
        break;
    }
    case GF_REFERENCE_MEDLINE:
        stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefMedline(value);
        break;
    case GF_REFERENCE_TITLE:
        stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefTitle(value);
        break;
    case GF_REFERENCE_AUTHOR:
        stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefAuthor(value);
        break;
    case GF_REFERENCE_LOCALTION:
        stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefLocation(value);
        break;
    case GF_KEYWORDS:
        stockholmStructure.getFileAnnotation().setGFKeywords(value);
        break;
    case GF_COMMENT:
        stockholmStructure.getFileAnnotation().addToGFComment(value);
        break;
    case GF_PFAM_ACCESSION:
        stockholmStructure.getFileAnnotation().setGFPfamAccession(value);
        break;
    case GF_LOCATION:
        stockholmStructure.getFileAnnotation().setGFLocation(value);
        break;
    case GF_WIKIPEDIA_LINK:
        stockholmStructure.getFileAnnotation().setGFWikipediaLink(value);
        break;
    case GF_CLAN:
        stockholmStructure.getFileAnnotation().setGFClan(value);
        break;
    case GF_MEMBERSHIP:
        stockholmStructure.getFileAnnotation().setGFMembership(value);
        break;
    case GF_NEW_HAMPSHIRE:
        stockholmStructure.getFileAnnotation().addGFNewHampshire(value);
        break;
    case GF_TREE_ID:
        stockholmStructure.getFileAnnotation().addGFTreeID(value);
        break;
    case GF_FALSE_DISCOVERY_RATE:
        stockholmStructure.getFileAnnotation().addGFFalseDiscoveryRate(value);
        break;
    default:
        // Not one of the feature codes this parser knows about.
        logger.warn("Unknown File Feature [{}].\nPlease contact the Biojava team.", featureName);
        break;
    }
}
/**
 * Stores one per-column annotation, usually a single line of the form
 * {@code #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>}, in the consensus
 * annotation of the structure being built. Unknown feature names are only logged.
 *
 * @param featureName
 *            the feature name, e.g. {@code SS_cons}
 * @param value
 *            the annotation string following the feature name
 */
private void handleConsensusAnnotation(String featureName, String value) {
    switch (featureName) {
    case GC_SECONDARY_STRUCTURE:
        stockholmStructure.getConsAnnotation().setSecondaryStructure(value);
        break;
    case GC_SEQUENSE_CONSENSUS:
        stockholmStructure.getConsAnnotation().setSequenceConsensus(value);
        break;
    case GC_SURFACE_ACCESSIBILITY:
        stockholmStructure.getConsAnnotation().setSurfaceAccessibility(value);
        break;
    case GC_TRANS_MEMBRANE:
        stockholmStructure.getConsAnnotation().setTransMembrane(value);
        break;
    case GC_POSTERIOR_PROBABILITY:
        stockholmStructure.getConsAnnotation().setPosteriorProbability(value);
        break;
    case GC_LIGAND_BINDING:
        stockholmStructure.getConsAnnotation().setLigandBinding(value);
        break;
    case GC_ACTIVE_SITE:
        stockholmStructure.getConsAnnotation().setActiveSite(value);
        break;
    case GC_AS_PFAM_PREDICTED:
        stockholmStructure.getConsAnnotation().setAsPFamPredicted(value);
        break;
    case GC_AS_SWISSPROT:
        stockholmStructure.getConsAnnotation().setAsSwissProt(value);
        break;
    case GC_INTRON:
        stockholmStructure.getConsAnnotation().setIntron(value);
        break;
    case GC_REFERENCE_ANNOTATION:
        stockholmStructure.getConsAnnotation().setReferenceAnnotation(value);
        break;
    case GC_MODEL_MASK:
        stockholmStructure.getConsAnnotation().setModelMask(value);
        break;
    default:
        // Not one of the feature names this parser knows about.
        logger.warn("Unknown Consensus Feature [{}].\nPlease contact the Biojava team.", featureName);
        break;
    }
}
/**
 * Stores one per-sequence annotation, i.e. the content of a line of the form
 * {@code #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>}, in the structure being built.
 * Unknown feature names are only logged.
 *
 * @param seqName
 *            name of the sequence the annotation belongs to
 * @param featureName
 *            the two-letter feature code
 * @param value
 *            the annotation text following the feature code
 */
private void handleSequenceAnnotation(String seqName, String featureName, String value) {
    switch (featureName) {
    case GS_ACCESSION_NUMBER:
        stockholmStructure.addGSAccessionNumber(seqName, value);
        break;
    case GS_DESCRIPTION:
        stockholmStructure.addGSDescription(seqName, value);
        break;
    case GS_DATABASE_REFERENCE:
        stockholmStructure.addGSdbReference(seqName, value);
        break;
    case GS_ORGANISM_SPECIES:
        stockholmStructure.addGSOrganismSpecies(seqName, value);
        break;
    case GS_ORGANISM_CLASSIFICATION:
        stockholmStructure.addGSOrganismClassification(seqName, value);
        break;
    case GS_LOOK:
        stockholmStructure.addGSLook(seqName, value);
        break;
    default:
        // Not one of the feature codes this parser knows about.
        logger.warn("Unknown Sequence Feature [{}].\nPlease contact the Biojava team.", featureName);
        break;
    }
}
/**
 * Stores one per-residue annotation, i.e. the content of a line of the form
 * {@code #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>}, in the
 * structure being built. Unknown feature names are only logged.
 *
 * @param seqName
 *            name of the sequence the annotation belongs to
 * @param featureName
 *            the feature code, e.g. {@code SS} or {@code pAS}
 * @param value
 *            the annotation string following the feature code
 */
private void handleResidueAnnotation(String seqName, String featureName, String value) {
    switch (featureName) {
    case GR_SURFACE_ACCESSIBILITY:
        stockholmStructure.addSurfaceAccessibility(seqName, value);
        break;
    case GR_TRANS_MEMBRANE:
        stockholmStructure.addTransMembrane(seqName, value);
        break;
    case GR_POSTERIOR_PROBABILITY:
        stockholmStructure.addPosteriorProbability(seqName, value);
        break;
    case GR_LIGAND_BINDING:
        stockholmStructure.addLigandBinding(seqName, value);
        break;
    case GR_ACTIVE_SITE:
        stockholmStructure.addActiveSite(seqName, value);
        break;
    case GR_AS_PFAM_PREDICTED:
        stockholmStructure.addASPFamPredicted(seqName, value);
        break;
    case GR_AS_SWISSPROT:
        stockholmStructure.addASSwissProt(seqName, value);
        break;
    case GR_INTRON:
        stockholmStructure.addIntron(seqName, value);
        break;
    case GR_SECONDARY_STRUCTURE:
        stockholmStructure.addSecondaryStructure(seqName, value);
        break;
    default:
        // Not one of the feature codes this parser knows about.
        logger.warn("Unknown Residue Feature [{}].\nPlease contact the Biojava team.", featureName);
        break;
    }
}
}