// org.biojava.nbio.genome.parsers.gff.Feature (Maven / Gradle / Ivy artifact page header)
/*
* BioJava development code
*
* This code may be freely distributed and modified under the
* terms of the GNU Lesser General Public Licence. This should
* be distributed with the code. If you do not have a copy,
* see:
*
* http://www.gnu.org/copyleft/lesser.html
*
* Copyright for this code is held jointly by the individual
* authors. These should be listed in @author doc comments.
*
* For more information on the BioJava project and its aims,
* or to join the biojava-l mailing list, visit the home page
* at:
*
* http://www.biojava.org/
*
*/
package org.biojava.nbio.genome.parsers.gff;
import java.util.HashMap;
/**
* A Feature corresponds to a single row in a GFF file.
*
* @author Hanno Hinsch
*/
public class Feature implements FeatureI {

    private Location mLocation;
    private String mSeqname;
    private String mSource;
    private String mType;
    private double mScore;      // written as "." in a GFF file when there is none
    private int mFrame;         // 0, 1, 2; -1 is rendered as "." by toString()
    private String mAttributes; // raw attribute column, never null after construction

    // Left as a raw type on purpose: the element types are dictated by
    // FeatureI.userData(), which is declared outside this file.
    private HashMap mUserMap;

    /**
     * Parsed view of the raw attribute string: attribute key mapped to attribute value.
     * Filled once per instance by {@link #initAttributeHashMap()}.
     */
    HashMap<String, String> attributeHashMap = new HashMap<String, String>();

    /**
     * Get the sequence name. (GFF field 1). Note that feature objects have
     * no link or reference to the actual sequence object to which
     * they refer; they are completely uncoupled.
     *
     * @return Sequence name.
     */
    @Override
    public String seqname() {
        return mSeqname;
    }

    /**
     * Get source (aka method). (GFF field 2). This is often the name of
     * the program or procedure that created the features.
     *
     * @return Source field.
     */
    public String source() {
        return mSource;
    }

    /**
     * Get feature type, such as "exon" or "CDS". (GFF field 3).
     *
     * @return Feature type.
     */
    @Override
    public String type() {
        return mType;
    }

    /**
     * Get location of feature. Note that feature objects have
     * no link or reference to the actual sequence object to which
     * they refer; they are completely uncoupled.
     *
     * @return Location of feature.
     */
    @Override
    public Location location() {
        return mLocation;
    }

    /**
     * Get score. (GFF field 6). The meaning of the score varies from file to file.
     *
     * @return Score value.
     */
    public double score() {
        return mScore;
    }

    /**
     * Get frame (aka phase). (GFF field 8). Specifies the offset of the
     * first nucleotide of the first in-frame codon, assuming this feature
     * is a dna/rna sequence that codes for a protein. If you intend to use
     * this field, you probably want to look it up on the web first.
     *
     * @return The frame (0, 1, 2). A value of -1 is printed as "." by {@link #toString()}.
     */
    public int frame() {
        return mFrame;
    }

    /**
     * Get the string of key/value attributes. (GFF field 9). The format and
     * meaning of this field varies from flavor to flavor of GFF/GTF. This method
     * simply returns the whole string. Other methods in this class make assumptions
     * about its format and provide additional utility.
     *
     * @return The attribute string.
     */
    public String attributes() {
        return mAttributes;
    }

    /** Not available: a Feature must be built from data fields or from another Feature. */
    @SuppressWarnings("unused")
    private Feature() {
    }

    /**
     * Make a copy of the specified feature. The mappings in the userData() HashMap
     * are copied, so each feature has independent user data. Note, however, that the
     * actual objects in the HashMap are shared (not copied), so a change to such an
     * object may affect multiple features. The attribute map is rebuilt from the
     * attribute string rather than copied.
     *
     * @param feature Feature to clone.
     */
    public Feature(Feature feature) {
        mSeqname = feature.mSeqname;
        mSource = feature.mSource;
        mType = feature.mType;
        mLocation = feature.mLocation;
        mScore = feature.mScore;
        mFrame = feature.mFrame;
        mAttributes = feature.mAttributes;
        initAttributeHashMap();
        mUserMap = new HashMap(feature.mUserMap);
    }

    /**
     * Construct a new Feature from raw data (usually a GFF row).
     *
     * @param seqname The sequence name field (field 1).
     * @param source The source or method field (field 2).
     * @param type The type of feature field (field 3).
     * @param location The location of the feature. (calculated from GFF start, end and strand fields).
     * @param score The score field (field 6). Must not be null, because it is unboxed to a double.
     * @param frame The frame or phase field (field 8).
     * @param attributes A string of key/value pairs separated by semicolons (field 9).
     *                   A null value is treated as an empty attribute string.
     * @throws NullPointerException if score is null.
     */
    public Feature(String seqname, String source, String type, Location location, Double score, int frame, String attributes) {
        mSeqname = seqname;
        mSource = source;
        mType = type;
        mLocation = location;
        mScore = score;
        mFrame = frame;
        // Normalise null so that parsing, group() and getAttributeOld() cannot hit a null string.
        mAttributes = (attributes == null) ? "" : attributes;
        initAttributeHashMap();
        mUserMap = new HashMap();
    }

    /**
     * Get HashMap of user data. Each Feature object has a Java HashMap object
     * which can be used to annotate the Feature. This class does not use or interpret
     * the keys or values.
     *
     * If a Feature is constructed from data fields, the initial HashMap has no mappings (is empty).
     * If a Feature is constructed from another Feature, a copy of the mappings is made.
     * Note that the Objects in the copied mapping are shared, even though the mapping itself
     * is copied (not shared). Thus removing or adding a mapping to one Feature will not affect the
     * other, but changing an Object which is part of an established mapping may affect both Features.
     *
     * @return The user HashMap (the live map, not a copy).
     */
    @Override
    public HashMap userData() {
        return mUserMap;
    }

    /**
     * Parse the raw attribute string into {@link #attributeHashMap}.
     *
     * The string is cut at every semicolon. Within each piece the key and value are
     * separated at the first '=' (GFF3 style) or, when the piece contains no '=', at
     * the first space (GTF style). Double quotes are removed from the value and both
     * key and value are trimmed. A piece without a separator is stored with an empty
     * value. Blank pieces, such as the one after a trailing semicolon, are ignored.
     *
     * Known limitation: a semicolon inside a quoted value still splits the value.
     */
    private void initAttributeHashMap() {
        if (mAttributes == null) {
            return;
        }
        for (String piece : mAttributes.split(";")) {
            String attribute = piece.trim();
            if (attribute.isEmpty()) {
                continue; // nothing between two semicolons, or after the last one
            }
            // gtf uses space and gff3 uses =
            int separator = attribute.indexOf('=');
            if (separator == -1) {
                separator = attribute.indexOf(' ');
            }
            String key = attribute;
            String value = ""; // an attribute field could be empty
            if (separator != -1) {
                key = attribute.substring(0, separator).trim();
                // Only the first separator splits, so values keep embedded spaces and '=' signs.
                value = attribute.substring(separator + 1).replace("\"", "").trim();
            }
            attributeHashMap.put(key, value);
        }
    }

    /**
     * Get value of specified attribute key. Returns null if the attribute key does not exist.
     * A key that is present without a value yields the empty string.
     * Keys are case-sensitive. Assumes attributes are correctly formatted in GFF style.
     * Known bug: a semicolon within a quoted value will cause parse failure.
     *
     * @param key The key.
     * @return The corresponding value. Null if the key is not defined.
     */
    @Override
    public String getAttribute(String key) {
        return attributeHashMap.get(key);
    }

    /**
     * Earlier attribute lookup that scans the raw attribute string on every call
     * instead of using the parsed map. It only recognises pieces of the form
     * "key value;" (key and value separated by a single space, pieces separated by
     * a semicolon followed by one character), and a piece is only found if it is
     * terminated by a semicolon. Surrounding double quotes are removed from the value.
     *
     * @param key The key.
     * @return The corresponding value, or null if no matching piece is found.
     */
    public String getAttributeOld(String key) {
        int start = 0;
        int end = mAttributes.indexOf(';');
        while (0 < end) {
            // find the first word (up to space) in this piece and see if it is the key
            int space = mAttributes.indexOf(' ', start);
            if (0 < space && space < end && mAttributes.substring(start, space).equals(key)) {
                int valueStart = space + 1;
                // Needs at least two characters so that a lone quote is not treated
                // as both the opening and the closing quote.
                boolean quoted = end - valueStart >= 2
                        && mAttributes.charAt(valueStart) == '"'
                        && mAttributes.charAt(end - 1) == '"';
                return quoted
                        ? mAttributes.substring(valueStart + 1, end - 1)
                        : mAttributes.substring(valueStart, end);
            }
            start = end + 2; // skip required semicolon and single space
            end = mAttributes.indexOf(';', start);
        }
        return null;
    }

    /**
     * Check whether the attribute key is present.
     *
     * @param key The key.
     * @return True if the parsed attributes contain the key.
     */
    @Override
    public boolean hasAttribute(String key) {
        return attributeHashMap.containsKey(key);
    }

    /**
     * Check whether the attribute key is present with exactly the given value.
     *
     * @param key The key.
     * @param value The value to compare with (case-sensitive).
     * @return True if the key exists and its value equals the given value.
     */
    @Override
    public boolean hasAttribute(String key, String value) {
        String data = getAttribute(key);
        return data != null && data.equals(value);
    }

    /**
     * Get the first item (everything before first semicolon, if it has one)
     * in the attribute field, which is assumed to
     * be a group identifier. This is appropriate for GFF1 files and variants. It is not
     * appropriate for GTF and GFF2 files, although they may use a named attribute key,
     * such as "gene_id" or "transcript_id", for grouping.
     *
     * @return The group id. Everything before the first semicolon in the attributes string (minus surrounding whitespace).
     */
    @Override
    public String group() {
        int i = mAttributes.indexOf(';');
        return (i < 0) ? mAttributes.trim() : mAttributes.substring(0, i).trim();
    }

    /**
     * Render the feature as one tab-separated line: sequence name, source, type,
     * location start, location end, score, frame ("." when the frame is -1) and the
     * raw attribute string. No strand column is written.
     *
     * @return The tab-separated representation.
     */
    @Override
    public String toString() {
        StringBuilder s = new StringBuilder();
        s.append(mSeqname).append('\t');
        s.append(mSource).append('\t');
        s.append(mType).append('\t');
        s.append(mLocation.start()).append('\t');
        s.append(mLocation.end()).append('\t');
        s.append(Double.toString(mScore)).append('\t');
        if (mFrame == -1) {
            s.append(".\t");
        } else {
            s.append(mFrame).append('\t');
        }
        s.append(mAttributes);
        return s.toString();
    }

    /**
     * Former ad-hoc test entry point. It no longer does anything and is kept only
     * so that existing references to it keep compiling.
     *
     * @param args Ignored.
     * @throws Exception Never thrown; kept for signature compatibility.
     * @deprecated Does nothing.
     */
    @Deprecated
    public static void main(String args[])
            throws Exception {
        // intentionally empty
    }

    /**
     * Get the parsed attributes.
     *
     * @return The live map of attribute key to attribute value (not a copy).
     */
    @Override
    public HashMap<String, String> getAttributes() {
        return attributeHashMap;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy