net.maizegenetics.dna.map.GenomeFeatureBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
The newest version!
package net.maizegenetics.dna.map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONObject;
import java.util.HashMap;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by jgw87 on 7/2/14.
* Builder class to create a GenomeFeature. All annoations are stored in a HashMap. Any annotation can be added through
* the addAnnotation() method, but the more common fields have their own convenience methods. Only the feature's own ID
* is required; all other annotations are optional
*/
public class GenomeFeatureBuilder {
private static final Logger myLogger = LogManager.getLogger(GenomeFeatureBuilder.class);
//Variables to store the information on the feature
private HashMap myannotations = null;
/**
* Generic constructor which does nothing special
*/
public GenomeFeatureBuilder() {
myannotations = new HashMap();
}
/**
* Constructor to build a new feature off of an existing one.
*
* @param feature The genome feature to copy
*/
public GenomeFeatureBuilder(GenomeFeature feature) {
this.myannotations = feature.annotations(); //the annotations() method returns a shallow copy, so should be safe
}
/**
* Public accessor method to get a new GenomeFeatureBuilder based off an existing GenomeFeature
* TODO: Get this matching other getInstance methods better; doesn't seem to fit, so commented out for now
* //@param feature
* //@return
*/
/*public static GenomeFeatureBuilder getInstance(GenomeFeature feature){
return new GenomeFeatureBuilder(feature);
}*/
public GenomeFeature build() {
validateData();
return new GenomeFeature(myannotations);
}
private void validateData() {
//Test that feature has the required fields
if ( (!myannotations.containsKey("id")) || myannotations.get("id") == "NA") {
throw new UnsupportedOperationException("GenomeFeatureBuilder: Cannot build a feature without a personal identifier (field 'id')");
}
//Test if start or stop is negative
if(myannotations.containsKey("start") && myannotations.containsKey("stop")) {
int mystart = Integer.parseInt(myannotations.get("start"));
int mystop = Integer.parseInt(myannotations.get("stop"));
if (mystart < 0) {
throw new UnsupportedOperationException("GenomeFeatureBuilder: Start coordinate is negative for " +
myannotations.get("id") + ": " + mystart + " (possibly unassigned?)");
}
if (mystop < 0) {
throw new UnsupportedOperationException("GenomeFeatureBuilder: Stop coordinate is negative for " +
myannotations.get("id") + ": " + mystop + " (possibly unassigned?)");
}
//Test that start is less than stop
if (mystart > mystop) {
throw new UnsupportedOperationException("GenomeFeatureBuilder: Start coordinate is greater than stop " +
"coordinate for " + myannotations.get("id") + ": " + mystart + " vs " + mystop);
}
}
}
public GenomeFeatureBuilder id(String id) {
return addAnnotation("id", id);
}
public GenomeFeatureBuilder type(String type) {
return addAnnotation("type", type);
}
public GenomeFeatureBuilder parentId(String parentId) {
return addAnnotation("parent_id", parentId);
}
public GenomeFeatureBuilder chromosome(Chromosome chr) {
return addAnnotation("chromosome", chr.getName());
}
public GenomeFeatureBuilder chromosome(String chr) {
return addAnnotation("chromosome", chr);
}
public GenomeFeatureBuilder chromosome(int chr) {
return addAnnotation("chromosome", "" + chr);
}
public GenomeFeatureBuilder start(int start) {
return addAnnotation("start", "" + start);
}
public GenomeFeatureBuilder start(String start) {
return addAnnotation("start", start);
}
public GenomeFeatureBuilder stop(int stop) {
return addAnnotation("stop", "" + stop);
}
public GenomeFeatureBuilder stop(String stop) {
return addAnnotation("stop", stop);
}
public GenomeFeatureBuilder position(String position) {
return addAnnotation("position", position);
}
public GenomeFeatureBuilder position(int position) {
return addAnnotation("position", "" + position);
}
public GenomeFeatureBuilder addAnnotation(String key, String value) {
key = synonymizeKeys(key);
if("".equals(value)){ //Convert empty strings to 'NA'
value="NA";
}
myannotations.put(key, value); //All annotations kept in the hash
//If the key is "position", convert to identical start-stop coordinates as well.
if (key == "position") {
this.start(value);
this.stop(value);
}
return this;
}
/**
* Method that takes common synonyms of annotation types and standardizes them according to the following rules:
* (1) Make lowercase
* (2) Standardize according to following rules. (Any not on this list are returned as just lowercased)
* name, id -> id
* chr, chrom, chromosome -> chromosome
* stop, end -> stop
* parentid, parent_id, parent -> parent_id
* pos, position -> position
*
* @param key The key to standardize
* @return
*/
public static String synonymizeKeys(String key) {
key = key.toLowerCase(Locale.ENGLISH);
switch (key) {
case "name":
case "id":
return "id";
case "chr":
case "chrom":
case "chromosome":
return "chromosome";
case "end":
case "stop":
return "stop";
case "parent":
case "parentid":
case "parent_id":
return "parent_id";
case "pos":
case "position":
return "position";
default:
return key;
}
}
/**
* Load all annotations from a hashmap. Keys become the annotations, and values the annotation value. Each key-value
* pair is added individually (instead of using a putAll() method) to allow for key standardization, etc.
* @return This builder, with the new data loaded from the hashmap
*/
public GenomeFeatureBuilder loadAll(HashMap newAnnotations){
for(String key: newAnnotations.keySet()){
addAnnotation(key, newAnnotations.get(key));
}
return this;
}
/**
* Create a GenomeFeature from a line of GFF file. This method is modified from the BioJava source code for the same
* purpose, in biojava3-genome/src/main/java/org/biojava3/genome/GFF3Reader.java
*
* @param line A single line from a GFF file as a string
* @return This builder, with the data loaded from the line
*/
public GenomeFeatureBuilder parseGffLine(String line) {
//Field identifiers for GFF format
int gffSeq = 0, gffSource = 1, gffFeatureType = 2, gffStart = 3, gffStop = 4, gffScore = 5, gffStrand = 6, gffFrame = 7, gffAttributes = 8;
//Get all the easy data stored in its own fields
String[] tokens = line.split("\t");
this.chromosome(tokens[gffSeq].trim());
this.type(tokens[gffFeatureType].trim());
this.start(tokens[gffStart]);
this.stop(tokens[gffStop]);
addAnnotation("strand", tokens[gffStrand].trim());
//Extract the parent from the attributes field
String parentID = getParentFromGffAttributes(tokens[gffAttributes]);
this.parentId(parentID);
//Extract the unique identifier for this feature. If none, build one from available info
String myID = getFeatureIdFromGffAttributes(tokens[gffAttributes]);
if (myID == null) {
myID = myannotations.get("type") + "_" + myannotations.get("chromosome") + "_" + myannotations.get("start") + "_" + myannotations.get("stop");
}
this.id(myID);
return this;
}
/**
* Parse a GFF attribute field to identify the parent of the current GenomeFeature. Tricky b/c of the different ways it
* can be represented. There's a hierarchy of accepted answers, with 'parent_id', 'Parent=', 'transcript_id', and
* 'gene_id' taken in that order. If nothing is found, returns an empty string ("")
*
* @param attributes The string from the attribute field of the GFF file
* @return The parent's ID string
*/
public static String getParentFromGffAttributes(String attributes) {
//Match the appropriate string with regular expressions
Matcher matcher;
//Pattern for 'parent_id "GRMZM2G005232"'
matcher = Pattern.compile("parent_id \"(\\w+)\"").matcher(attributes);
if (matcher.find()) {
return matcher.group(1);
}
//Pattern for 'Parent=GRMZM2G005232' and 'Parent=gene:GRMZM2G005232'
matcher = Pattern.compile("Parent=(\\w+:){0,1}(\\w+)").matcher(attributes);
if (matcher.find()) {
return matcher.group(2);
}
//Pattern for 'gene_id "GRMZM2G005232"'
matcher = Pattern.compile("transcript_id \"(\\w+)\"").matcher(attributes);
if (matcher.find()) {
return matcher.group(1);
}
//Pattern for 'transcript_id "GRMZM2G005232_T01"'
matcher = Pattern.compile("gene_id \"(\\w+)\"").matcher(attributes);
if (matcher.find()) {
return matcher.group(1);
}
return "";
}
/**
* Parse a GFF attribute field to identify the name of the current GenomeFeature. Looks for 'ID=' and 'Name=' fields
*
* @param attributes The string from the attribute field of the GFF file. REturns null if not found
* @return The feature's ID string
*/
public static String getFeatureIdFromGffAttributes(String attributes) {
//Match the appropriate string with regular expressions
Matcher matcher;
//Pattern for 'ID=GRMZM2G005232' and 'ID=gene:GRMZM2G005232'
matcher = Pattern.compile("(Name|ID)=(\\w+:){0,1}(\\w+)").matcher(attributes);
if (matcher.find()) {
return matcher.group(3);
}
return null;
}
public GenomeFeatureBuilder parseJsonObject (JSONObject featureData){
HashMap jsonHash = new HashMap<>();
for (String key : (Set) featureData.keySet()) {
addAnnotation(key, featureData.get(key).toString());
}
return this;
}
}