All downloads are free. Search and download functionality uses the official Maven repository.

net.maizegenetics.dna.map.GenomeFeatureMap Maven / Gradle / Ivy

package net.maizegenetics.dna.map;

import com.google.common.collect.BoundType;
import com.google.common.collect.Multimap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import net.maizegenetics.util.DirectedGraph;
import net.maizegenetics.util.Utils;
import org.apache.commons.lang.StringUtils;
import org.json.simple.JSONObject;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.*;

/**
 * Created by jgw87 on 7/2/14.
 * A map to hold genome features for lookup by name and by location. The features themselves are hierarchical and so
 * can be traced up and down the tree.
 * 

* As methods are added to return based on different filters, try to unify the results. That is, all filter-like methods * should return the same sort of data structure, such as a HashSet of GenomeFeatures. It may be worthwhile to create * a custom GenomeFeatureSet class to be able to string such operations together (mygenes = mygenes.ofType("exon").onChrom(1).inRange(1, 10000);), * although whether such filters are needed for the bulk of this class's purpose (matching SNPs to genome annotations) is yet * to be seen. *

* This class shouldn't be created directly, but should instead use the GenomeFeatureMapBuilder class .build() method */ //TODO: Add functionality to write out a full constructed map to a file of some sort public class GenomeFeatureMap { //Graph of all the genome features, rooted at the genome itself DirectedGraph featureTree = null; //Lookups to identify GenomeFeatures by their name, type, and location private HashMap nameLookup = new HashMap<>(); private Multimap typeLookup = null; private HashMap>> locationLookup = null; /** * Default constructor for creating a GenomeFeatureMap from pre-made data structures. This should ONLY be called from * the {@link GenomeFeatureMapBuilder} class * * @param nameLookup Lookup table of unique IDs -> {@link GenomeFeature} objects * @param locationLookup Lookup table to retrieve {@link GenomeFeature}s by their genomic location * @param typeLookup Lookup table to retrieve {@link GenomeFeature}s by their type (exon, UTR, etc) * @param featureTree The graph of genomic features. Should be a directed graph, with the first two levels being genome and chromosome */ GenomeFeatureMap(HashMap nameLookup, Multimap typeLookup, HashMap>> locationLookup, DirectedGraph featureTree) { this.typeLookup = typeLookup; this.nameLookup = nameLookup; this.locationLookup = locationLookup; this.featureTree = featureTree; } public GenomeFeature getFeatureFromId(String id) { return nameLookup.get(id); } /** * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location. Takes chromsome as a String * for ones like "Pt", "scaffold487", etc. 
* @param chrom The chromosome name * @param start Beginning physical position * @param end End physical position * @return A {@link HashSet} of GenomeFeatures */ public HashSet getFeaturesInRange(String chrom, int start, int end) { Range myrange = Range.closed(start, end); //'Closed' = inclusive, so closed(1,3) = 1,2,3 and closed(1,1) = 1 Map, HashSet> chromMap = locationLookup.get(chrom).subRangeMap(myrange).asMapOfRanges(); HashSet featureSet = new HashSet<>(); for(Range r: chromMap.keySet()){ featureSet.addAll(chromMap.get(r)); } return featureSet; } /** * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location * @param chrom Chromosome number (should be the same as its name) * @param position Physical position (base pair) * @return A HashSet of GenomeFeatures */ public HashSet getFeaturesAtLocation(int chrom, int position) { return getFeaturesInRange(chrom, position, position); } /** * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location Takes chromsome as a String * for ones like "Pt", "scaffold487", etc. * @param chrom Chromosome name * @param position Physical position (base pair) * @return A HashSet of GenomeFeatures */ public HashSet getFeaturesAtLocation(String chrom, int position) { return getFeaturesInRange(chrom, position, position); } /** * Get a {@link HashSet} of {@link GenomeFeature}s at a specified genome location. Takes chromsome as a String * for ones like "Pt", "scaffold487", etc. 
* @param chrom The chromosome number (should be the same as its name) * @param start Beginning physical position * @param end End physical position * @return A HashSet of GenomeFeatures */ public HashSet getFeaturesInRange(int chrom, int start, int end) { return getFeaturesInRange("" + chrom, start, end); } /** Get all {@link GenomeFeature}s of a specified type * @param type The type of feature to get * @return A {@link HashSet} of GenomeFeatures */ public HashSet getFeaturesOfType(String type){ HashSet featureSet = new HashSet<>(); featureSet.addAll(typeLookup.get(type)); return featureSet; } /** * Write just the location lookup to a tab-delimited file. This is mostly to check that your locations loaded properly, * as there is no way to read them back in. * * @param filename The output file to be written to */ public void writeLocationLookupToFile(String filename) { try { BufferedWriter writer = Utils.getBufferedWriter(filename); writer.append("chrom\tstart\tstop\tfeatures\n"); String[] chroms = locationLookup.keySet().toArray(new String[0]); Arrays.sort(chroms); for (String chrom : chroms) { Map, HashSet> itermap = locationLookup.get(chrom).asMapOfRanges(); for (Range r : itermap.keySet()) { int start = r.lowerEndpoint(); int stop = r.upperEndpoint(); //Adjust start-stop positions if ranges are open on that end (= up to but not including that number) if (r.lowerBoundType() == BoundType.OPEN) { start++; } if (r.upperBoundType() == BoundType.OPEN) { stop--; } writer.append(chrom + "\t" + start + "\t" + stop + "\t"); //List of features HashSet features = itermap.get(r); for (GenomeFeature f : features) { writer.append(f.id() + ";"); } writer.append("\n"); } } writer.close(); } catch (IOException e) { e.printStackTrace(); } } /** * Write the map data as a JSON file (which can be read in by {@link GenomeFeatureMapBuilder}). Core attributes * (unique ID, chromosome, start, stop, and parent ID) are output to all features. 
Any additional attributes are * output only for those features that have them. Attributes are output in alphabetical order. Since the attribute * name has to be output for every feature, this can waste space if all your features have the same attributes. In * that case a tab-delimited flat file or precompiled binary file is probably a better choice. * * @param filename The output file to be written to */ public void writeMapAsJsonFile(String filename) { try { BufferedWriter writer = Utils.getBufferedWriter(filename); writer.append("[\n"); String[] sortedNames = nameLookup.keySet().toArray(new String[0]); //Sorted array so ordering is consistent Arrays.sort(sortedNames); //Go through with a for loop instead of foreach to know when hit last element for (int i=0; i attributes = new HashSet<>(); for(GenomeFeature feature: nameLookup.values()){ attributes.addAll(feature.annotations().keySet()); } //Put column heads in alphabetical order (except for "id"; that goes first and _should_ always be included) ArrayList header = new ArrayList<>(); if(attributes.contains("id")){ attributes.remove("id"); header.add("id"); } String[] tempNames = attributes.toArray(new String[0]); Arrays.sort(tempNames); for(String s: tempNames){ header.add(s); } //Write out file try { BufferedWriter writer = Utils.getBufferedWriter(filename); //Header line String firstline = StringUtils.join(header, "\t") + "\n"; writer.append(firstline); //Bulk of data String[] names = nameLookup.keySet().toArray(new String[0]); Arrays.sort(names); for(String id: names){ GenomeFeature feature = nameLookup.get(id); ArrayList data = new ArrayList<>(); for(String column: header){ data.add(feature.getAnnotation(column)); } writer.append(StringUtils.join(data, "\t") + "\n"); } writer.close(); }catch(IOException e){ e.printStackTrace(); } } /** * Write the map data as a precompiled binary that can be read back in by {@link GenomeFeatureMapBuilder}). 
Unlike * JSON or flatfile format, this file is not human-readable and is simply a a representation of the Java data * structures saved onto a disk. This makes it very compact and fast to read back in. This is the preferred format * for long-term storage of a {@link GenomeFeatureMap} since there is (almost) no risk of someone accidentally modifying * the file. * * @param filename The output file to be written to */ //TODO: Test and use this //TODO: Implement Serializable interface in order to write all these things out public void writeMapAsBinaryFile(String filename) { } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy