net.maizegenetics.dna.map.GenomeFeatureMapBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.
The newest version!
package net.maizegenetics.dna.map;

import com.google.common.collect.*;
import net.maizegenetics.util.DirectedGraph;
import net.maizegenetics.util.Graph;
import net.maizegenetics.util.GraphBuilder;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;

/**
 * Created by jgw87 on 7/2/14.
 * A Builder class to create a GenomeFeatureMap to identify genomic features. Can build piecemeal or read in from a file
 * For now, not implementing functionality to make a new builder from an existing map.
 */
//TODO: Add functionality for reading in a precompiled map
public class GenomeFeatureMapBuilder {

    private static final Logger myLogger = LogManager.getLogger(GenomeFeatureMapBuilder.class);

    //Graph of all the genome features connecting to each other
    DirectedGraph featureTree = null;

    //Lookups to identify GenomeFeatures by their name, location, and type (exon, gene, etc)
    private HashMap nameLookup = new HashMap<>();
    private HashMap>> locationLookup = new HashMap<>(); // Complex.
    private Multimap typeLookup = HashMultimap.create();

    //Helper variables used to store information to build the feature map
    HashSet chromosomes = new HashSet<>();


    public GenomeFeatureMap build(){
        buildGenomeTree();
        buildLocationLookup();
        return new GenomeFeatureMap(nameLookup, typeLookup, locationLookup, featureTree);
    }

    /**
     * Build the genome feature tree, linking everything to its parent. If something has no parent, it's assigned to the
     * chromosome (and if no chromosome, then the genome, but that's not very useful). It also checks that the genome itself
     * and chromosomes exist as their own {@link GenomeFeature}s, and creates them if needed.
     */
    private void buildGenomeTree(){
        addRootFeaturesIfNeeded();

        //Go through each feature and add parent IDs for those that don't have them (link to chromosome or genome)
        GenomeFeature genome = getGenome();
        ArrayList toReplace = new ArrayList();
        for(GenomeFeature feature: nameLookup.values()){
            if(feature.id() == getGenome().id()){   //Don't assign the genome as its own parent
                continue;
            }
            if(feature.parentId().equals("NA") || !nameLookup.containsKey(feature.parentId())){
                GenomeFeatureBuilder replacement = new GenomeFeatureBuilder(feature);
                if(feature.chromosome().equals("NA")){
                    replacement.parentId(genome.id());
                }else{
                    replacement.parentId(feature.chromosome());
                }
                toReplace.add(replacement.build());
            }
        }
        for(GenomeFeature f:toReplace){
            replaceFeature(f);
        }

        //Build the actual genome graph based on parent information
        GraphBuilder builder = new GraphBuilder(Graph.GraphType.DIRECTED);
        for(GenomeFeature feature: nameLookup.values()){
            if(feature.id() == getGenome().id()){   //Don't assign the genome as its own parent
                continue;
            }
            if(!nameLookup.containsKey(feature.parentId())){
                myLogger.warn("WARNING! Unable to find parent " + feature.parentId() + " for feature " + feature.id());
                continue;
            }
            GenomeFeature parent = nameLookup.get(feature.parentId());
            builder.addEdge(parent, feature);
        }
        featureTree = (DirectedGraph) builder.build();
    }

    /**
     * Add the genome itself and each chromosome as an overarching GenomeFeatures if they haven't been added already
     */
    private void addRootFeaturesIfNeeded(){

        //Check if a feature names "genome" has been added. If so, confirm there's only one. If not, create it.
        GenomeFeature genome;
        if(typeLookup.keySet().contains("genome")){
            genome = getGenome();
        }else{
            genome = new GenomeFeatureBuilder().id("GENOME").type("genome").build();
            addFeature(genome);
        }

        //Go through and check/create chromosomes, assigning their parent to the genome if need be.
        ArrayList toReplace = new ArrayList();
        ArrayList toAdd = new ArrayList();
        for(String c: chromosomes){
            //If chromosome already entered, check that parent is assigned to genome
            if(nameLookup.containsKey(c)){
                GenomeFeature mychrom = nameLookup.get(c);
                //No parent assigned -> Assign to the genome
                if(mychrom.parentId().equals("NA")){
                    GenomeFeature newChrom = new GenomeFeatureBuilder(mychrom).parentId(genome.id()).build();
                    toReplace.add(newChrom);
                //parent assigned, but to different feature -> throw error
                }else if(!mychrom.parentId().equals(genome.id())){
                    throw new UnsupportedOperationException("Error: Chromosome " + mychrom.id() + " assigned with parent "
                            + mychrom.parentId() + " that does not match the recognized genome " + genome.id());
                }
            }else{ //Chromosome not entered yet -> make a new one
                System.out.println("Making chromosome " + c + " from scratch");
                GenomeFeature newChrom = new GenomeFeatureBuilder().id(c).parentId(genome.id()).chromosome(c).type("chromosome").build();
                toAdd.add(newChrom);
            }
        }

        //Actually add & replace the relevant GenomeFeatures; Done here to avoid ConcurrentModificationExceptions
        for(GenomeFeature f: toAdd){
            addFeature(f);
        }
        for(GenomeFeature f: toReplace){
            replaceFeature(f);
        }
    }

    private GenomeFeature getGenome(){
        if(typeLookup.keySet().contains("genome")) {
            Collection genomes = typeLookup.get("genome");
            //If somehow can't pull feature out, throw an error
            if (genomes.size() == 0) {
                throw new UnsupportedOperationException("Error: Cannot find feature with type 'genome' even though supposedly loaded");
            }

            //If more than one feature annotated as "genome", throw an error
            if (genomes.size() > 1) {
                StringBuilder multiples = new StringBuilder();
                for (GenomeFeature g : genomes) {
                    multiples.append("\n\t" + g.id());
                }
                throw new UnsupportedOperationException("Error: Attempt to build a GenomeFeatureMap with more than one feature annotated as type 'genome':" + multiples);
            }
            return(genomes.iterator().next()); //Take first (and by this point, only) feature
        }else{
            myLogger.warn("Warning! Attempting to retrieve the root genome when it hasn't been created yet");
            return null;
        }
    }

    private void buildLocationLookup(){
        //Initialize each chromosome as a new Rangemap
        for(String c: chromosomes){
            RangeMap> newmap = TreeRangeMap.create();
            locationLookup.put(c, newmap);
        }

        for(GenomeFeature feature: nameLookup.values()){
            RangeMap mymap = locationLookup.get(feature.chromosome());
            addFeatureToRangemap(mymap, feature);
        }
    }

    /**
     * Add a GenomeFeature to a RangeMap, stacking it on top of any existing features instead of overwriting them
     * (how RangeMap behaves natively)
     * @param masterMap Rangemap object to be modified
     * @param feature GenomeFeature to be added. Only start and stop position are checked
     */
    public static void addFeatureToRangemap(
            RangeMap>  masterMap, GenomeFeature feature){

        //Check that range is valid (all >=0). Features with invalid ranges (e.g., default chromosomes) are ignored
        if(feature.start() <0 || feature.stop() < 0){
            return;
        }

        //Make ranges closed-open b/c otherwise results in a lot of 1-bp redundant ranges where adjacent features were added
        Range featureRange = Range.closedOpen(feature.start(), feature.stop() + 1);

        //First, extract out subrange and save any annotations already there. (Requires making a copy so later modification of
        // masterMap doesn't change things
        ArrayList> rangeList = new ArrayList<>();
        ArrayList> hashList = new ArrayList<>();
        Map, HashSet> subranges = masterMap.subRangeMap(featureRange).asMapOfRanges();
        for(Range r: subranges.keySet()){
            rangeList.add(Range.closedOpen(r.lowerEndpoint(), r.upperEndpoint()));
            hashList.add(new HashSet(subranges.get(r)));
        }

        //Next, set entire range equal to the new feature to cover any areas not covered by the existing ranges
        HashSet newset = new HashSet<>();
        newset.add(feature);
        masterMap.put(featureRange, newset);

        //Take the extracted, existing annotations and add back in, adding this feature to the list since they overwrite
        // everything already there
        for(int i=0; i tempset = hashList.get(i);
            Range temprange = rangeList.get(i);
            tempset.add(feature); //Add the feature on top of existing ones
            masterMap.put(temprange, tempset);
        }
    }

    /**
     * Adds a GenomeFeature to the map. If the feature's unique ID has already been loaded, it throws an UnsupportedOperationException
     * @param feature The GenomeFeature to be added
     * @return This builder
     */
    public GenomeFeatureMapBuilder addFeature(GenomeFeature feature){
        String ID=feature.id();
        if(nameLookup.containsKey(ID)){
            throw new UnsupportedOperationException("Error: Attempt to add a GenomeFeature whose unique ID is already loaded: "
            + ID);
        }
        return this.addOrReplaceFeature(feature);   //Done to avoid code duplication
    }

    /**
     * Replaces the GenomeFeature with a specified ID with a new one. If unique ID isn't already in the map, it throws
     * an UnsupportedOperationException.
     * @param feature The GenomeFeature to be added
     * @return This builder
     */
    public GenomeFeatureMapBuilder replaceFeature(GenomeFeature feature){
        String ID=feature.id();
        if(!nameLookup.containsKey(ID)){
            throw new UnsupportedOperationException("Error: Attempt to replace a GenomeFeature whose unique ID has not been loaded yet: "
                    + ID);
        }
        return this.addOrReplaceFeature(feature);   //Done to avoid code duplication
    }

    /**
     * Adds a GenomeFeature to the map, regardless of whether or not it's already been added. This method throws no
     * warnings if you'll overwrite existing data, so use it with caution.
     * @param feature The GenomeFeature to be added
     * @return This builder
     */
    public GenomeFeatureMapBuilder addOrReplaceFeature(GenomeFeature feature){
        nameLookup.put(feature.id(), feature);
        typeLookup.put(feature.type(), feature);
        if(!feature.chromosome().equals("NA")){
            chromosomes.add(feature.chromosome());
        }
        return this;
    }

    /**
     * Load in data from a GFF (Gene Feature Format) file. Since GFF files have only a loose standard for the 9th column,
     * this involves several ad-hoc heuristics about what things to look for. As such, it is not the preferred way to
     * read in annotations. (That is JSON or tab-delimited format.)
     *
     * This method does not build the map, so you can string multiple calls together (if, for example, you have
     * different annotations in different files)
     *
     * @param filename
     */
    public GenomeFeatureMapBuilder addFromGffFile(String filename){
        myLogger.warn("GenomeFeatureMapBuilder - Loading genome annotations from GFF file. Will try to parse annotations " +
                "field as best as possible. (JSON or tab-delimited formats are preferred.)");
        try {
            BufferedReader reader = Utils.getBufferedReader(filename);
            String line=reader.readLine();
            while(line != null ){
                if(line.startsWith("#")){    //Skip comment lines
                    line=reader.readLine();
                    continue;
                }
                GenomeFeature newFeature = new GenomeFeatureBuilder().parseGffLine(line).build();
                addFeature(newFeature);
                line=reader.readLine();
            }
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return this;
    }


    /**
     * Load in data from a JSON-formatted file. JSON format is defined at http://www.json.org/, and consists of structured
     * key-value pairs. For genome features, the key is the name of an attribute and the value is (obviously) its value.
     * (For example: "chromosome":1). Note that if you have more than one feature per file (the normal case), all but the
     * last closing brace ('}') should be followed by a comma, and the whole group should be within square braces ('[...]'
     * That is, the first character of the file should be '[' and the last should be ']'). This makes it a properly-formatted
     * JSON array.
     *
     * Common attributes (keys) are listed below. Although only the "id" attribute is required, a feature is pretty
     * useless without some sort of positional information (chromosome, start/stop, etc.).
     *   "id":       Unique identifier for this feature. Repeated identifiers throw an error. (Also accepts "name".) Required.
     *   "chrom":    Which chromosome it occurs on (Also accepts "chr" or "chromosome")
     *   "start":    Start position on the chromosome
     *   "stop":     Stop position on the chromosome (Also accepts "end")
     *   "position": Postion on chromosome (in place of "start" and "stop" for features that are a single nucleotide)
     *   "parent_id":   What other named feature this descends from (eg, Gene -> Transcript -> Exon). If none given, this
     *                   will default to the chromosome (or the genome if chromosome isn't supplied)
     *
     * This method does not build the map, so you can string multiple calls together (if, for example, you have
     * different annotations in different files).
     * @param filename
     */
    public GenomeFeatureMapBuilder addFromJsonFile(String filename) {
        JSONParser jparse = new JSONParser();
        BufferedReader reader = Utils.getBufferedReader(filename);
        try {
            JSONArray jarray = (JSONArray) jparse.parse(reader);
            Iterator iter = jarray.iterator();
            while(iter.hasNext()){
                JSONObject json = (JSONObject) iter.next();
                GenomeFeature newFeature = new GenomeFeatureBuilder().parseJsonObject(json).build();
                addFeature(newFeature);
            }
            reader.close();
        } catch (IOException e) {
            myLogger.error("Error loading data from JSON file " + filename);
            e.printStackTrace();
        } catch (ParseException e) {
            myLogger.error("Error parsing information in JSON file " + filename);
            e.printStackTrace();
        }

        return this;
    }

    //Alternate implementation with manually reading in the JSON data; not recommended
    /*public GenomeFeatureMapBuilder addFromJsonFile(String filename) {
        JSONParser jparse = new JSONParser();
        BufferedReader reader = Utils.getBufferedReader(filename);
        try {
            String inline = reader.readLine();
            String tempJson = "";
            while(inline != null){
                tempJson += inline;
                //If has closing brace (mark of end of object), then parse
                while(tempJson.contains("}")){  //While loop in case multiple objects on one line
                    //Subtract out JSON object
                    int start=tempJson.indexOf('{');
                    int end=tempJson.indexOf('}');
                    String json = tempJson.substring(start, end+1);
                    System.out.println("jsonAsString:" + tempJson);
                    System.out.println("\ttake out:" + json);
                    tempJson = tempJson.substring(end + 1, tempJson.length());    //Remove everything
                    System.out.println("\tleft:" + tempJson);

                    //Add data as feature
                    JSONObject featureData = (JSONObject) jparse.parse(json);
                    GenomeFeature newFeature = new GenomeFeatureBuilder().parseJsonObject(featureData).build();
                    addFeature(newFeature);
                }
                inline = reader.readLine();
            }
        } catch (IOException e) {
            myLogger.error("Error loading data from JSON file " + filename);
            e.printStackTrace();
        } catch (ParseException e) {
            myLogger.error("Error parsing information in JSON file " + filename);
            e.printStackTrace();
        }

        return this;
    }*/

    /**
     * Load in data from a flat, tab-delimited text file. The first row should be a header identifying what attribute is
     * in each column, and each subsequent row should correspond to a single feature. Columns that don't apply to a
     * given feature should use "NA" or left empty. Common attributes (columns) are listed below. Although only the "id"
     * attribute is required, a feature is pretty useless without some sort of positional information (chromosome,
     * start/stop, etc.).
     *   "id":       Unique identifier for this feature. Repeated identifiers throw an error (Also accepts "name".) Required.
     *   "chrom":    Which chromosome it occurs on (Also accepts "chr" or "chromosome")
     *   "start":    Start position on the chromosome
     *   "stop":     Stop position on the chromosome (Also accepts "end")
     *   "position": Postion on chromosome (in place of "start" and "stop" for features that are a single nucleotide)
     *   "parent_id":   What other named feature this descends from (eg, Gene -> Transcript -> Exon). If none given, this
     *                   will default to the chromosome (or the genome if chromosome isn't supplied)
     *
     * This method does not build the map, so you can string multiple calls together (if, for example, you have
     * different annotations in different files).
     * @param filename
     */
    public GenomeFeatureMapBuilder addFromFlatFile(String filename) {
        try {
            BufferedReader reader = Utils.getBufferedReader(filename);
            String line=reader.readLine();
            String[] header = null;
            int n=1;    //Keep track of line numbers
            while(line != null ){
                n++;
                if(line.startsWith("#")){    //Skip comment lines
                    line=reader.readLine();
                    continue;
                }
                String[] tokens = line.split("\t");

                if(header == null){ //Save header data
                    header=tokens;
                    line=reader.readLine();
                    continue;
                }

                //Check that number of fields is correct
                if(tokens.length != header.length){
                    myLogger.error("Error: line " + n + " has a different number of fields (" + tokens.length + ") than the header (" + header.length + ")");
                }

                //Load everything into a hapmap
                HashMap data = new HashMap<>();
                for(int i=0; i