net.maizegenetics.dna.map.GenomeFeatureMapBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
The newest version!
package net.maizegenetics.dna.map;
import com.google.common.collect.*;
import net.maizegenetics.util.DirectedGraph;
import net.maizegenetics.util.Graph;
import net.maizegenetics.util.GraphBuilder;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;
/**
* Created by jgw87 on 7/2/14.
* A Builder class to create a GenomeFeatureMap to identify genomic features. Can build piecemeal or read in from a file
* For now, not implementing functionality to make a new builder from an existing map.
*/
//TODO: Add functionality for reading in a precompiled map
public class GenomeFeatureMapBuilder {
private static final Logger myLogger = LogManager.getLogger(GenomeFeatureMapBuilder.class);
//Graph of all the genome features connecting to each other
DirectedGraph featureTree = null;
//Lookups to identify GenomeFeatures by their name, location, and type (exon, gene, etc)
private HashMap nameLookup = new HashMap<>();
private HashMap>> locationLookup = new HashMap<>(); // Complex.
private Multimap typeLookup = HashMultimap.create();
//Helper variables used to store information to build the feature map
HashSet chromosomes = new HashSet<>();
public GenomeFeatureMap build(){
buildGenomeTree();
buildLocationLookup();
return new GenomeFeatureMap(nameLookup, typeLookup, locationLookup, featureTree);
}
/**
* Build the genome feature tree, linking everything to its parent. If something has no parent, it's assigned to the
* chromosome (and if no chromosome, then the genome, but that's not very useful). It also checks that the genome itself
* and chromosomes exist as their own {@link GenomeFeature}s, and creates them if needed.
*/
private void buildGenomeTree(){
addRootFeaturesIfNeeded();
//Go through each feature and add parent IDs for those that don't have them (link to chromosome or genome)
GenomeFeature genome = getGenome();
ArrayList toReplace = new ArrayList();
for(GenomeFeature feature: nameLookup.values()){
if(feature.id() == getGenome().id()){ //Don't assign the genome as its own parent
continue;
}
if(feature.parentId().equals("NA") || !nameLookup.containsKey(feature.parentId())){
GenomeFeatureBuilder replacement = new GenomeFeatureBuilder(feature);
if(feature.chromosome().equals("NA")){
replacement.parentId(genome.id());
}else{
replacement.parentId(feature.chromosome());
}
toReplace.add(replacement.build());
}
}
for(GenomeFeature f:toReplace){
replaceFeature(f);
}
//Build the actual genome graph based on parent information
GraphBuilder builder = new GraphBuilder(Graph.GraphType.DIRECTED);
for(GenomeFeature feature: nameLookup.values()){
if(feature.id() == getGenome().id()){ //Don't assign the genome as its own parent
continue;
}
if(!nameLookup.containsKey(feature.parentId())){
myLogger.warn("WARNING! Unable to find parent " + feature.parentId() + " for feature " + feature.id());
continue;
}
GenomeFeature parent = nameLookup.get(feature.parentId());
builder.addEdge(parent, feature);
}
featureTree = (DirectedGraph) builder.build();
}
/**
* Add the genome itself and each chromosome as an overarching GenomeFeatures if they haven't been added already
*/
private void addRootFeaturesIfNeeded(){
//Check if a feature names "genome" has been added. If so, confirm there's only one. If not, create it.
GenomeFeature genome;
if(typeLookup.keySet().contains("genome")){
genome = getGenome();
}else{
genome = new GenomeFeatureBuilder().id("GENOME").type("genome").build();
addFeature(genome);
}
//Go through and check/create chromosomes, assigning their parent to the genome if need be.
ArrayList toReplace = new ArrayList();
ArrayList toAdd = new ArrayList();
for(String c: chromosomes){
//If chromosome already entered, check that parent is assigned to genome
if(nameLookup.containsKey(c)){
GenomeFeature mychrom = nameLookup.get(c);
//No parent assigned -> Assign to the genome
if(mychrom.parentId().equals("NA")){
GenomeFeature newChrom = new GenomeFeatureBuilder(mychrom).parentId(genome.id()).build();
toReplace.add(newChrom);
//parent assigned, but to different feature -> throw error
}else if(!mychrom.parentId().equals(genome.id())){
throw new UnsupportedOperationException("Error: Chromosome " + mychrom.id() + " assigned with parent "
+ mychrom.parentId() + " that does not match the recognized genome " + genome.id());
}
}else{ //Chromosome not entered yet -> make a new one
System.out.println("Making chromosome " + c + " from scratch");
GenomeFeature newChrom = new GenomeFeatureBuilder().id(c).parentId(genome.id()).chromosome(c).type("chromosome").build();
toAdd.add(newChrom);
}
}
//Actually add & replace the relevant GenomeFeatures; Done here to avoid ConcurrentModificationExceptions
for(GenomeFeature f: toAdd){
addFeature(f);
}
for(GenomeFeature f: toReplace){
replaceFeature(f);
}
}
private GenomeFeature getGenome(){
if(typeLookup.keySet().contains("genome")) {
Collection genomes = typeLookup.get("genome");
//If somehow can't pull feature out, throw an error
if (genomes.size() == 0) {
throw new UnsupportedOperationException("Error: Cannot find feature with type 'genome' even though supposedly loaded");
}
//If more than one feature annotated as "genome", throw an error
if (genomes.size() > 1) {
StringBuilder multiples = new StringBuilder();
for (GenomeFeature g : genomes) {
multiples.append("\n\t" + g.id());
}
throw new UnsupportedOperationException("Error: Attempt to build a GenomeFeatureMap with more than one feature annotated as type 'genome':" + multiples);
}
return(genomes.iterator().next()); //Take first (and by this point, only) feature
}else{
myLogger.warn("Warning! Attempting to retrieve the root genome when it hasn't been created yet");
return null;
}
}
private void buildLocationLookup(){
//Initialize each chromosome as a new Rangemap
for(String c: chromosomes){
RangeMap> newmap = TreeRangeMap.create();
locationLookup.put(c, newmap);
}
for(GenomeFeature feature: nameLookup.values()){
RangeMap mymap = locationLookup.get(feature.chromosome());
addFeatureToRangemap(mymap, feature);
}
}
/**
* Add a GenomeFeature to a RangeMap, stacking it on top of any existing features instead of overwriting them
* (how RangeMap behaves natively)
* @param masterMap Rangemap object to be modified
* @param feature GenomeFeature to be added. Only start and stop position are checked
*/
public static void addFeatureToRangemap(
RangeMap> masterMap, GenomeFeature feature){
//Check that range is valid (all >=0). Features with invalid ranges (e.g., default chromosomes) are ignored
if(feature.start() <0 || feature.stop() < 0){
return;
}
//Make ranges closed-open b/c otherwise results in a lot of 1-bp redundant ranges where adjacent features were added
Range featureRange = Range.closedOpen(feature.start(), feature.stop() + 1);
//First, extract out subrange and save any annotations already there. (Requires making a copy so later modification of
// masterMap doesn't change things
ArrayList> rangeList = new ArrayList<>();
ArrayList> hashList = new ArrayList<>();
Map, HashSet> subranges = masterMap.subRangeMap(featureRange).asMapOfRanges();
for(Range r: subranges.keySet()){
rangeList.add(Range.closedOpen(r.lowerEndpoint(), r.upperEndpoint()));
hashList.add(new HashSet(subranges.get(r)));
}
//Next, set entire range equal to the new feature to cover any areas not covered by the existing ranges
HashSet newset = new HashSet<>();
newset.add(feature);
masterMap.put(featureRange, newset);
//Take the extracted, existing annotations and add back in, adding this feature to the list since they overwrite
// everything already there
for(int i=0; i tempset = hashList.get(i);
Range temprange = rangeList.get(i);
tempset.add(feature); //Add the feature on top of existing ones
masterMap.put(temprange, tempset);
}
}
/**
* Adds a GenomeFeature to the map. If the feature's unique ID has already been loaded, it throws an UnsupportedOperationException
* @param feature The GenomeFeature to be added
* @return This builder
*/
public GenomeFeatureMapBuilder addFeature(GenomeFeature feature){
String ID=feature.id();
if(nameLookup.containsKey(ID)){
throw new UnsupportedOperationException("Error: Attempt to add a GenomeFeature whose unique ID is already loaded: "
+ ID);
}
return this.addOrReplaceFeature(feature); //Done to avoid code duplication
}
/**
* Replaces the GenomeFeature with a specified ID with a new one. If unique ID isn't already in the map, it throws
* an UnsupportedOperationException.
* @param feature The GenomeFeature to be added
* @return This builder
*/
public GenomeFeatureMapBuilder replaceFeature(GenomeFeature feature){
String ID=feature.id();
if(!nameLookup.containsKey(ID)){
throw new UnsupportedOperationException("Error: Attempt to replace a GenomeFeature whose unique ID has not been loaded yet: "
+ ID);
}
return this.addOrReplaceFeature(feature); //Done to avoid code duplication
}
/**
* Adds a GenomeFeature to the map, regardless of whether or not it's already been added. This method throws no
* warnings if you'll overwrite existing data, so use it with caution.
* @param feature The GenomeFeature to be added
* @return This builder
*/
public GenomeFeatureMapBuilder addOrReplaceFeature(GenomeFeature feature){
nameLookup.put(feature.id(), feature);
typeLookup.put(feature.type(), feature);
if(!feature.chromosome().equals("NA")){
chromosomes.add(feature.chromosome());
}
return this;
}
/**
* Load in data from a GFF (Gene Feature Format) file. Since GFF files have only a loose standard for the 9th column,
* this involves several ad-hoc heuristics about what things to look for. As such, it is not the preferred way to
* read in annotations. (That is JSON or tab-delimited format.)
*
* This method does not build the map, so you can string multiple calls together (if, for example, you have
* different annotations in different files)
*
* @param filename
*/
public GenomeFeatureMapBuilder addFromGffFile(String filename){
myLogger.warn("GenomeFeatureMapBuilder - Loading genome annotations from GFF file. Will try to parse annotations " +
"field as best as possible. (JSON or tab-delimited formats are preferred.)");
try {
BufferedReader reader = Utils.getBufferedReader(filename);
String line=reader.readLine();
while(line != null ){
if(line.startsWith("#")){ //Skip comment lines
line=reader.readLine();
continue;
}
GenomeFeature newFeature = new GenomeFeatureBuilder().parseGffLine(line).build();
addFeature(newFeature);
line=reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return this;
}
/**
* Load in data from a JSON-formatted file. JSON format is defined at http://www.json.org/, and consists of structured
* key-value pairs. For genome features, the key is the name of an attribute and the value is (obviously) its value.
* (For example: "chromosome":1). Note that if you have more than one feature per file (the normal case), all but the
* last closing brace ('}') should be followed by a comma, and the whole group should be within square braces ('[...]'
* That is, the first character of the file should be '[' and the last should be ']'). This makes it a properly-formatted
* JSON array.
*
* Common attributes (keys) are listed below. Although only the "id" attribute is required, a feature is pretty
* useless without some sort of positional information (chromosome, start/stop, etc.).
* "id": Unique identifier for this feature. Repeated identifiers throw an error. (Also accepts "name".) Required.
* "chrom": Which chromosome it occurs on (Also accepts "chr" or "chromosome")
* "start": Start position on the chromosome
* "stop": Stop position on the chromosome (Also accepts "end")
* "position": Postion on chromosome (in place of "start" and "stop" for features that are a single nucleotide)
* "parent_id": What other named feature this descends from (eg, Gene -> Transcript -> Exon). If none given, this
* will default to the chromosome (or the genome if chromosome isn't supplied)
*
* This method does not build the map, so you can string multiple calls together (if, for example, you have
* different annotations in different files).
* @param filename
*/
public GenomeFeatureMapBuilder addFromJsonFile(String filename) {
JSONParser jparse = new JSONParser();
BufferedReader reader = Utils.getBufferedReader(filename);
try {
JSONArray jarray = (JSONArray) jparse.parse(reader);
Iterator iter = jarray.iterator();
while(iter.hasNext()){
JSONObject json = (JSONObject) iter.next();
GenomeFeature newFeature = new GenomeFeatureBuilder().parseJsonObject(json).build();
addFeature(newFeature);
}
reader.close();
} catch (IOException e) {
myLogger.error("Error loading data from JSON file " + filename);
e.printStackTrace();
} catch (ParseException e) {
myLogger.error("Error parsing information in JSON file " + filename);
e.printStackTrace();
}
return this;
}
//Alternate implementation with manually reading in the JSON data; not recommended
/*public GenomeFeatureMapBuilder addFromJsonFile(String filename) {
JSONParser jparse = new JSONParser();
BufferedReader reader = Utils.getBufferedReader(filename);
try {
String inline = reader.readLine();
String tempJson = "";
while(inline != null){
tempJson += inline;
//If has closing brace (mark of end of object), then parse
while(tempJson.contains("}")){ //While loop in case multiple objects on one line
//Subtract out JSON object
int start=tempJson.indexOf('{');
int end=tempJson.indexOf('}');
String json = tempJson.substring(start, end+1);
System.out.println("jsonAsString:" + tempJson);
System.out.println("\ttake out:" + json);
tempJson = tempJson.substring(end + 1, tempJson.length()); //Remove everything
System.out.println("\tleft:" + tempJson);
//Add data as feature
JSONObject featureData = (JSONObject) jparse.parse(json);
GenomeFeature newFeature = new GenomeFeatureBuilder().parseJsonObject(featureData).build();
addFeature(newFeature);
}
inline = reader.readLine();
}
} catch (IOException e) {
myLogger.error("Error loading data from JSON file " + filename);
e.printStackTrace();
} catch (ParseException e) {
myLogger.error("Error parsing information in JSON file " + filename);
e.printStackTrace();
}
return this;
}*/
/**
* Load in data from a flat, tab-delimited text file. The first row should be a header identifying what attribute is
* in each column, and each subsequent row should correspond to a single feature. Columns that don't apply to a
* given feature should use "NA" or left empty. Common attributes (columns) are listed below. Although only the "id"
* attribute is required, a feature is pretty useless without some sort of positional information (chromosome,
* start/stop, etc.).
* "id": Unique identifier for this feature. Repeated identifiers throw an error (Also accepts "name".) Required.
* "chrom": Which chromosome it occurs on (Also accepts "chr" or "chromosome")
* "start": Start position on the chromosome
* "stop": Stop position on the chromosome (Also accepts "end")
* "position": Postion on chromosome (in place of "start" and "stop" for features that are a single nucleotide)
* "parent_id": What other named feature this descends from (eg, Gene -> Transcript -> Exon). If none given, this
* will default to the chromosome (or the genome if chromosome isn't supplied)
*
* This method does not build the map, so you can string multiple calls together (if, for example, you have
* different annotations in different files).
* @param filename
*/
public GenomeFeatureMapBuilder addFromFlatFile(String filename) {
try {
BufferedReader reader = Utils.getBufferedReader(filename);
String line=reader.readLine();
String[] header = null;
int n=1; //Keep track of line numbers
while(line != null ){
n++;
if(line.startsWith("#")){ //Skip comment lines
line=reader.readLine();
continue;
}
String[] tokens = line.split("\t");
if(header == null){ //Save header data
header=tokens;
line=reader.readLine();
continue;
}
//Check that number of fields is correct
if(tokens.length != header.length){
myLogger.error("Error: line " + n + " has a different number of fields (" + tokens.length + ") than the header (" + header.length + ")");
}
//Load everything into a hapmap
HashMap data = new HashMap<>();
for(int i=0; i