// net.maizegenetics.dna.map.GenomeFeatureMap Maven / Gradle / Ivy

package net.maizegenetics.dna.map;

import com.google.common.collect.BoundType;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import net.maizegenetics.util.DirectedGraph;
import net.maizegenetics.util.Utils;
import org.apache.commons.lang.StringUtils;
import org.json.simple.JSONObject;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.*;

/**
 * Created by jgw87 on 7/2/14.
 * A map to hold genome features for lookup by name and by location. The features themselves are hierarchical and so
 * can be traced up and down the tree.
 * 
 * As methods are added to return based on different filters, try to unify the results. That is, all filter-like methods
 * should return the same sort of data structure, such as a HashSet of GenomeFeatures. It may be worthwhile to create
 * a custom GenomeFeatureSet class to be able to string such operations together (mygenes = mygenes.ofType("exon").onChrom(1).inRange(1, 10000);),
 * although whether such filters are needed for the bulk of this class's purpose (matching SNPs to genome annotations) is yet
 * to be seen.
 * 
 * This class shouldn't be instantiated directly; obtain instances from the GenomeFeatureMapBuilder class's .build() method instead.
 */
//TODO: Add functionality to write out a full constructed map to a file of some sort
public class GenomeFeatureMap {


    //Graph of all the genome features, rooted at the genome itself
    DirectedGraph featureTree = null;

    //Lookups to identify GenomeFeatures by their name, type, and location
    private HashMap nameLookup = new HashMap<>();
    private Multimap typeLookup = null;
    private HashMap>> locationLookup = null;

    /**
     * Creates a GenomeFeatureMap from pre-made data structures. This should ONLY be called from
     * the {@link GenomeFeatureMapBuilder} class. The supplied structures are stored as given; no copies are made.
     *
     * @param nameLookup     Lookup table of unique IDs -> {@link GenomeFeature} objects
     * @param typeLookup     Lookup table to retrieve {@link GenomeFeature}s by their type (exon, UTR, etc)
     * @param locationLookup Lookup table to retrieve {@link GenomeFeature}s by their genomic location, keyed by chromosome name
     * @param featureTree    The graph of genomic features. Should be a directed graph, with the first two levels being genome and chromosome
     */
    GenomeFeatureMap(HashMap<String, GenomeFeature> nameLookup, Multimap<String, GenomeFeature> typeLookup,
                     HashMap<String, RangeMap<Integer, HashSet<GenomeFeature>>> locationLookup, DirectedGraph featureTree) {
        this.typeLookup = typeLookup;
        this.nameLookup = nameLookup;
        this.locationLookup = locationLookup;
        this.featureTree = featureTree;
    }

    /**
     * Look up a single {@link GenomeFeature} by its ID.
     * @param id The ID of the feature to retrieve
     * @return The feature stored under that ID in the name lookup, or null if the lookup has no entry for it
     */
    public GenomeFeature getFeatureFromId(String id) {
        GenomeFeature match = nameLookup.get(id);
        return match;
    }

    /**
     * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location. Takes chromsome as a String
     * for ones like "Pt", "scaffold487", etc.
     * @param chrom The chromosome name
     * @param start Beginning physical position
     * @param end End physical position
     * @return A {@link HashSet} of GenomeFeatures
     */
    public HashSet getFeaturesInRange(String chrom, int start, int end) {
        Range myrange = Range.closed(start, end); //'Closed' = inclusive, so closed(1,3) = 1,2,3 and closed(1,1) = 1
        Map, HashSet> chromMap = locationLookup.get(chrom).subRangeMap(myrange).asMapOfRanges();
        HashSet featureSet = new HashSet<>();
        for(Range r: chromMap.keySet()){
            featureSet.addAll(chromMap.get(r));
        }
        return featureSet;
    }

    /**
     * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location. Delegates to
     * {@link #getFeaturesInRange(int, int, int)} with the position as both start and end.
     * @param chrom Chromosome number (should be the same as its name)
     * @param position Physical position (base pair)
     * @return A HashSet of GenomeFeatures
     */
    public HashSet<GenomeFeature> getFeaturesAtLocation(int chrom, int position) {
        return getFeaturesInRange(chrom, position, position);
    }

    /**
     * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location. Takes chromosome as a String
     * for ones like "Pt", "scaffold487", etc. Delegates to {@link #getFeaturesInRange(String, int, int)} with the
     * position as both start and end.
     * @param chrom Chromosome name
     * @param position Physical position (base pair)
     * @return A HashSet of GenomeFeatures
     */
    public HashSet<GenomeFeature> getFeaturesAtLocation(String chrom, int position) {
        return getFeaturesInRange(chrom, position, position);
    }

    /**
     * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location. Takes chromosome as a number,
     * which is converted to its decimal String form and passed to {@link #getFeaturesInRange(String, int, int)}.
     * @param chrom The chromosome number (should be the same as its name)
     * @param start Beginning physical position
     * @param end End physical position
     * @return A HashSet of GenomeFeatures
     */
    public HashSet<GenomeFeature> getFeaturesInRange(int chrom, int start, int end) {
        return getFeaturesInRange(String.valueOf(chrom), start, end);
    }

    /**
     * Get all {@link GenomeFeature}s of a specified type
     * @param type The type of feature to get
     * @return A new {@link HashSet} of GenomeFeatures, copied from the type lookup so changes to it do not affect this map
     */
    public HashSet<GenomeFeature> getFeaturesOfType(String type){
        return new HashSet<>(typeLookup.get(type));
    }

    /**
     * Write just the location lookup to a tab-delimited file. This is mostly to check that your locations loaded properly,
     * as there is no way to read them back in.
     *
     * @param filename The output file to be written to
     */
    public void writeLocationLookupToFile(String filename) {
        try {
            BufferedWriter writer = Utils.getBufferedWriter(filename);
            writer.append("chrom\tstart\tstop\tfeatures\n");
            String[] chroms = locationLookup.keySet().toArray(new String[0]);
            Arrays.sort(chroms);
            for (String chrom : chroms) {
                Map, HashSet> itermap = locationLookup.get(chrom).asMapOfRanges();
                for (Range r : itermap.keySet()) {
                    int start = r.lowerEndpoint();
                    int stop = r.upperEndpoint();
                    //Adjust start-stop positions if ranges are open on that end (= up to but not including that number)
                    if (r.lowerBoundType() == BoundType.OPEN) {
                        start++;
                    }
                    if (r.upperBoundType() == BoundType.OPEN) {
                        stop--;
                    }
                    writer.append(chrom + "\t" + start + "\t" + stop + "\t");
                    //List of features
                    HashSet features = itermap.get(r);
                    for (GenomeFeature f : features) {
                        writer.append(f.id() + ";");
                    }
                    writer.append("\n");
                }
            }
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    /**
     * Write the map data as a JSON file (which can be read in by {@link GenomeFeatureMapBuilder}). Core attributes
     * (unique ID, chromosome, start, stop, and parent ID) are output to all features. Any additional attributes are
     * output only for those features that have them. Attributes are output in alphabetical order. Since the attribute
     * name has to be output for every feature, this can waste space if all your features have the same attributes. In
     * that case a tab-delimited flat file or precompiled binary file is probably a better choice.
     *
     * @param filename The output file to be written to
     */
    public void writeMapAsJsonFile(String filename) {
        try {
            BufferedWriter writer = Utils.getBufferedWriter(filename);
            //Output starts with an opening square bracket on its own line
            writer.append("[\n");
            String[] sortedNames = nameLookup.keySet().toArray(new String[0]);  //Sorted array so ordering is consistent
            Arrays.sort(sortedNames);

            //Go through with a for loop instead of foreach to know when hit last element
            //NOTE(review): the line below looks truncated - the loop header stops after "i" and runs straight into a
            //  HashSet declaration. The rest of this JSON loop, the end of this method, and the header of a following
            //  method appear to be missing. Everything after this line takes a 'filename' and writes tab-delimited
            //  text, so it presumably belongs to a separate flat-file writer. Restore from the original source - TODO confirm
            for (int i=0; i attributes = new HashSet<>();
        //Collect every annotation key used by any feature in the name lookup
        for(GenomeFeature feature: nameLookup.values()){
            attributes.addAll(feature.annotations().keySet());
        }

        //Put column heads in alphabetical order (except for "id"; that goes first and _should_ always be included)
        ArrayList header = new ArrayList<>();
        if(attributes.contains("id")){
            attributes.remove("id");
            header.add("id");
        }
        String[] tempNames = attributes.toArray(new String[0]);
        Arrays.sort(tempNames);
        for(String s: tempNames){
            header.add(s);
        }

        //Write out file
        try {
            BufferedWriter writer = Utils.getBufferedWriter(filename);

            //Header line
            String firstline = StringUtils.join(header, "\t") + "\n";
            writer.append(firstline);

            //Bulk of data
            //One line per feature, in sorted ID order, with one tab-separated value per header column
            String[] names = nameLookup.keySet().toArray(new String[0]);
            Arrays.sort(names);
            for(String id: names){
                GenomeFeature feature = nameLookup.get(id);
                ArrayList data = new ArrayList<>();
                for(String column: header){
                    data.add(feature.getAnnotation(column));
                }
                writer.append(StringUtils.join(data, "\t") + "\n");
            }
            writer.close();
        }catch(IOException e){
            //An I/O failure is printed to standard error and not rethrown
            e.printStackTrace();
        }
    }

    /**
     * Intended to write the map data as a precompiled binary that can be read back in by {@link GenomeFeatureMapBuilder}.
     * Unlike JSON or flatfile format, this file is not meant to be human-readable and is simply a representation of the
     * Java data structures saved onto a disk. This should make it very compact and fast to read back in. This is meant
     * to be the preferred format for long-term storage of a {@link GenomeFeatureMap} since there is (almost) no risk of
     * someone accidentally modifying the file.
     *
     * NOTE: This method is not implemented yet. Its body is empty, so calling it currently writes nothing.
     *
     * @param filename The output file to be written to (currently unused)
     */
    //TODO: Test and use this
    //TODO: Implement Serializable interface in order to write all these things out
    public void writeMapAsBinaryFile(String filename) {

    }
}