net.maizegenetics.pangenome.processAssemblyGenomes.Mummer4DoonerBZStats Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeMap;
import com.google.common.collect.TreeRangeSet;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.db_loading.DBLoadingUtils;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* This method takes a coords file, the genome fastas, ranges to be covered
* Prints out tab-delimited file of metrics related to the region.
*
* THe output is tab-delimited, rows are anchors.
* columns are
* anchor-coordinates,
* covered coordinates (list, semicolon separated)
* percent covered.
*
* @author lcj34
*
*/
public class Mummer4DoonerBZStats extends AbstractPlugin{
private static final Logger myLogger = Logger.getLogger(Mummer4DoonerBZStats.class);
private static final Pattern tab = Pattern.compile("\t");
private PluginParameter coordsFile = new PluginParameter.Builder("ref", null, String.class).guiName("Mummer Coords File").required(true).inFile()
.description("Output file created by Mummer show-coords ").build();
private PluginParameter intervalsFile = new PluginParameter.Builder("intervalsFile", null, String.class).guiName("Anchor Intervals File").required(false).inFile()
.description("Anchor Intervals file to be used when intervals are different than DB, e.g. when using just a small region ").build();
private PluginParameter mummerParams = new PluginParameter.Builder("mummerParams", null, String.class).guiName("Mummer Parameters").required(true)
.description("Mummer parameters used").build();
private PluginParameter prefix = new PluginParameter.Builder("prefix", null, String.class).guiName("Output File refix").required(true)
.description("Name to prefix to output results file").build();
private PluginParameter outputDir = new PluginParameter.Builder("outputDir", null, String.class).guiName("Output Directory").required(true).outDir()
.description("Output directory including trailing / for writing files").build();
private PluginParameter configFile = new PluginParameter.Builder("configFile", null, String.class).guiName("DB Config File").required(true).inFile()
.description("File containing lines with data for host=, user=, password= and DB=, DBtype= used for db connection").build();
private PluginParameter refChrom = new PluginParameter.Builder("refChrom", null, String.class).guiName("Reference Chromosome Name").required(true)
.description("Name of reference chromsome as stored in the database. This is the chromosome whose anchors will be pulled.").build();
public Mummer4DoonerBZStats() {
super(null, false);
}
public Mummer4DoonerBZStats(Frame parentFrame) {
super(parentFrame, false);
}
public Mummer4DoonerBZStats(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
/**
* @param args
*/
public static void main(String[] args) {
//GeneratePluginCode.generate(Mummer4DoonerBZStats.class);
String baseDir = "/Users/lcj34/notes_files/phg_2018/assembly_testing/mummer/ed_deltaGvsdelta_metrics/";
String intervalsFile = null;
String prefix = "b73ph207_chr9_c250mum_delta_anchorDetail";
String coordsFile = baseDir + "ref_PH207_c250mum.coords";
// String intervalsFile = "/Users/lcj34/notes_files/phg_2018/assembly_testing/mummer/metrics_junit/intervals_bzRegion.txt";
// String prefix = "b73ph207_bzRegions_c250mum";
// String coordsFile = baseDir + "b73ph207_bzRegion_c250mum.coords";
String mummerParams = "c 250 mum ";
String config = "/Users/lcj34/notes_files/phg_2018/assembly_testing/mummer/configSQLite.txt";
String refChrom = "9";
String outputDir = baseDir;
// if (args.length != 6) {
// System.out.println("Expecting 6 parameters in this order: ");
// System.out.println(" cordsFile name");
// System.out.println(" String of Mummer parameters used");
// System.out.println(" Prefix to append to output file as name");
// System.out.println(" Path to config file for connecting to DB");
// System.out.println(" Name of reference chromosome to pull anchors from DB");
// System.out.println(" Path to output Directory for metrics file");
// System.out.println(" Please fix parameters and try again.");
// return;
// }
//
// String coordsFile = args[0];
// String mummerParams = args[1];
// String prefix = args[2];
// String config = args[3];
// String refChrom = args[4];
// String outputDir = args[5];
System.out.println("Begin - call Mummer4DoonerBZStats");
new Mummer4DoonerBZStats()
.coordsFile(coordsFile)
.mummerParams(mummerParams)
.prefix(prefix)
.configFile(config)
.intervalsFile(intervalsFile)
.refChrom(refChrom)
.outputDir(outputDir)
.performFunction(null);
System.out.println("FINished junit");
}
@Override
public DataSet processData(DataSet input) {
// Get db connection to grab anchor info
System.out.println("Get db connection ...");
Connection connection = DBLoadingUtils.connection(configFile(),false);
if (connection == null) {
System.out.println("\ntestConnectToPOstgres: COuld not get connection for config file db");
throw new IllegalStateException ("MummerAnalysisMetricsPlugin: could not get db connection from configFile " + configFile() );
}
// Get anchor regions for chrom. using Chromosome object as the DB will have stripped
// off any leading chr/chromosome. need this name to match what is stored in the db
Chromosome refChromosome = Chromosome.instance(refChrom());
System.out.println("Get refRangesForChrom");
// get anchorID, ReferenceRanges for this chromosome. This gets translated into a RangeSet
Map idToRefRangeMap;
if (intervalsFile() == null) {
idToRefRangeMap = refRangesForChrom( connection, refChromosome.getName());
} else {
idToRefRangeMap = getAnchorFromIntervalsFile( intervalsFile(), refChromosome.getName());
}
System.out.println("Number of all anchors: " + idToRefRangeMap.keySet().size());
// Split RefRangeMap above into a RangeSet of anchor coordinates
Map> refAnchorIDRangeSetMap = getRangesForChrom(idToRefRangeMap, refChromosome, DBLoadingUtils.AnchorType.ANCHOR);
int numAnchors = refAnchorIDRangeSetMap.keySet().size();
int numAnchorsRepresented = 0;
// Read the coordinates file. Need to store the rangeSet for ref, but also associate it with chrom start/end.
System.out.println("Begin coords file processing");
RangeMap coordsFileRefToDataMap = TreeRangeMap.create();
String fullChromSummary = outputDir() + "/" + prefix() + "_metrics.txt";
try (BufferedReader br = Utils.getBufferedReader(coordsFile());
BufferedWriter bw = Utils.getBufferedWriter(fullChromSummary)) {
int headerCount = 0;
int linesProcessed = 0;
String line = null;
//TODO: assume file is sorted and has no header, then remove the header related lines
while ((line = br.readLine()) != null) {
// headerCount++;
// if (headerCount < 5) {
// continue; // skipping 4 lines of header
// }
linesProcessed++;
addToCoordsMap(coordsFileRefToDataMap,line);
}
// Now I have a map of the coordinates. I have the anchors from above.
// create lines to write to outputfile
String outputHeaderLine = "DB_anchorID\tdb-anchor-coordinates\tMummer-Ref_overlapping-Coordinates\tMummer-asm-coordinates\tPercent_IDs\tanchor-len\tbps-aligned\tPercent_anchor-covered\tMummer-params\n";
bw.write(outputHeaderLine);
for (Map.Entry> entry : refAnchorIDRangeSetMap.entrySet()) {
// This list contains the Entries overlapping the anchor
RangeMap anchorOverlapEntries = getCoordsEntriesForAnchor(entry.getValue(), coordsFileRefToDataMap);
if (anchorOverlapEntries.asMapOfRanges().isEmpty()) continue; // no portion of anchor was aligned
numAnchorsRepresented++;
int anchorID = entry.getKey();
Range dbAnchorRange = entry.getValue();
// compute overlap. Write to range set. call getRegionCoverage?
RangeSet overlappingAnchorRanges = getAnchorRangeSet(anchorOverlapEntries);
Tuple anchorCoverage = AssemblyProcessingUtils.getRegionCoverage(overlappingAnchorRanges, entry.getValue());
// Write the entries!
StringBuilder fileSB = new StringBuilder();
fileSB.append(anchorID).append("\t"); // DB anchor_id
fileSB.append(entry.getValue().lowerEndpoint()).append("-").append(entry.getValue().upperEndpoint()).append("\t"); // db anchor coords
StringBuilder refSB = new StringBuilder();
StringBuilder asmSB = new StringBuilder();
StringBuilder idSB = new StringBuilder();
for (Map.Entry, String> overlapEntry : anchorOverlapEntries.asMapOfRanges().entrySet()) {
Range refCoords = overlapEntry.getKey();
String refData = overlapEntry.getValue();
refSB.append(refCoords.lowerEndpoint()).append("-").append(refCoords.upperEndpoint()).append(";");
// refData has querystart\tqueryend\t%id: created in addToCoordsMap()
String[] refDataTokens = tab.split(refData);
asmSB.append(refDataTokens[0]).append("-").append(refDataTokens[1]).append(";");
idSB.append(refDataTokens[2]).append(";");
}
fileSB.append(refSB.toString()).append("\t"); // ref coords from mummer
fileSB.append(asmSB.toString()).append("\t"); // assembly coords from mummer
fileSB.append(idSB.toString()).append("\t"); // id lines from represented regions in file (for this anchor)
int anchorLen = (dbAnchorRange.upperEndpoint() - dbAnchorRange.lowerEndpoint()) +1;
fileSB.append(anchorLen).append("\t");
fileSB.append(anchorCoverage.x).append("\t"); // number of bps represented
fileSB.append(anchorCoverage.y).append("\t"); // percent of bps covered in this anchor
fileSB.append(mummerParams()).append("\n"); // mummer params used (same for all anchors)
bw.write(fileSB.toString());
}
System.out.println("Finished: Total lines processed: " + linesProcessed
+ ", totoal number of anchors for chrom: " + refAnchorIDRangeSetMap.keySet().size()
+ ", number of anchors represented: " + numAnchorsRepresented);
} catch (Exception exc) {
exc.printStackTrace();
throw new IllegalStateException("Mummer4DoonerBZStats - error processing data");
}
return null;
}
public Map getAnchorFromIntervalsFile( String intervalsFile, String chrom){
ImmutableMap.Builder builder = ImmutableMap.builder();
try (BufferedReader br = Utils.getBufferedReader(intervalsFile)) {
String line = br.readLine(); // skip header
while ((line = br.readLine())!= null) {
String[] tokens = tab.split(line);
int id = Integer.parseInt(tokens[0]);
String chromosome = tokens[1];
int start = Integer.parseInt(tokens[2]);
int end = Integer.parseInt(tokens[3]);
builder.put(id, new ReferenceRange("B73Ref", Chromosome.instance(chromosome), start, end, id));
}
} catch (Exception ioe) {
ioe.printStackTrace(); // numberFormat or IO exception
throw new IllegalStateException("getAnchorFromIntervalsFIle - error processing file " + intervalsFile);
}
Map result = builder.build();
return result;
}
public static Map refRangesForChrom(Connection dbConn, String chrom) {
// Create method name for querying initial ref region and inter-region ref_range_group method ids
String refLine = CreateGraphUtils.getRefLineName( dbConn);
String refMethodName = DBLoadingUtils.REGION_REFERENCE_RANGE_GROUP;
String refInterMethodName = DBLoadingUtils.INTER_REGION_REFERENCE_RANGE_GROUP;
List refGrpMethodsList = new ArrayList();
int refGrpMethodID = CreateGraphUtils.methodId(dbConn, refMethodName);
int refInterGrpMethodID = CreateGraphUtils.methodId(dbConn, refInterMethodName);
// Add method Ids to list if valid. There are not always inter-group methods in the db
if (refGrpMethodID > 0) refGrpMethodsList.add(Integer.toString(refGrpMethodID));
if (refInterGrpMethodID > 0) refGrpMethodsList.add(Integer.toString(refInterGrpMethodID));
String refGrpMethodString = refGrpMethodsList.stream().collect(Collectors.joining(","));
// query anchors, filter on chrom/version. This gets focus AND non-focus as the latter will be
// needed later.
// Query based on methodIds created above
String query = "select reference_ranges.ref_range_id, chrom,range_start,range_end, methods.name " +
" from reference_ranges, ref_range_ref_range_group,ref_range_groups, methods " +
" where reference_ranges.ref_range_id=ref_range_ref_range_group.ref_range_id " +
" AND ref_range_groups.ref_range_group_id = ref_range_ref_range_group.ref_range_group_id " +
" AND ref_range_groups.group_method_id = methods.method_id " +
" AND methods.method_type = " + DBLoadingUtils.MethodType.REF_RANGE_GROUP.getValue();
myLogger.info("refRangesForChrom: query statement: " + query);
ImmutableMap.Builder builder = ImmutableMap.builder();
try (ResultSet rs = dbConn.createStatement().executeQuery(query)) {
while (rs.next()) {
int id = rs.getInt("ref_range_id");
String chromosome = rs.getString("chrom");
int start = rs.getInt("range_start");
int end = rs.getInt("range_end");
String methodName = rs.getString("name");
builder.put(id, new ReferenceRange("B73Ref", Chromosome.instance(chromosome), start, end, id, methodName));
}
} catch (Exception se) {
myLogger.debug(se.getMessage(), se);
throw new IllegalStateException("CreateGraphUtils: referenceRanges: Problem querying the database: " + se.getMessage());
}
Map result = builder.build();
return result;
}
// Returns a range set from the referenceRangeMap. If "anchor" is true, only
// return the anchor. If "false", return interanchors.
// Return value is a Map of
public static Map> getRangesForChrom(Map refRangeMap, Chromosome chrom, DBLoadingUtils.AnchorType anchorType) {
Map> anchorIdRangeSet = new HashMap>();
for (Map.Entry entry : refRangeMap.entrySet()) {
int gid = entry.getKey();
ReferenceRange refRange = entry.getValue();
if (!(refRange.chromosome().equals(chrom))) continue; // skip anchors for other chroms
if (anchorType == DBLoadingUtils.AnchorType.ANCHOR || anchorType == DBLoadingUtils.AnchorType.BOTH) {
//if (refRange.isAnchor()) {
// anchorIdRangeSet.put(gid,Range.closed(refRange.start(), refRange.end()));
//}
} else if (anchorType == DBLoadingUtils.AnchorType.INTER_ANCHOR || anchorType == DBLoadingUtils.AnchorType.BOTH) {
//if (!refRange.isAnchor()) {
// anchorIdRangeSet.put(gid,Range.closed(refRange.start(), refRange.end()));
//}
}
}
System.out.println("getRangesForChrom: anchorType : " + anchorType + ", anchorRangeSet size: " + anchorIdRangeSet.keySet().size());
return anchorIdRangeSet;
}
public static void addToCoordsMap(RangeMap coordsFileRefToDataMap,String line) {
String[] tokens = tab.split(line);
// mummer coords file has 4 header lines when tab-delimited, then these columns:
// S1 E1 S2 E2 [LEN 1] {LEN 2] [%id] chr1 chr2
// RangeSet coalesce the ranges, which is what I want
int refStart = Integer.parseInt(tokens[0]);
int refEnd = Integer.parseInt(tokens[1]);
// If the end is < start, it mapped on reverse strand.
// These are not reverse strand coordinates with the reverse starting at 0
// The coordinates are relative to the positive strand, so just flip them
if (refStart > refEnd) {
int tempStart = refEnd;
refEnd = refStart;
refStart = tempStart;
}
Range refCoords = Range.closed(refStart,refEnd);
int queryStart = Integer.parseInt(tokens[2]);
int queryEnd = Integer.parseInt(tokens[3]);
if (queryStart > queryEnd) {
int tempStart = queryEnd;
queryEnd = queryStart;
queryStart = tempStart;
}
// add %ID to use in creating average
double id = Double.parseDouble(tokens[6]);
String refData = queryStart + "\t" + queryEnd + "\t" + id;
coordsFileRefToDataMap.put(refCoords,refData);
}
//This returns a list of entries in the coordsFileRefToDataMap that overlap the anchorRange
public RangeMap getCoordsEntriesForAnchor(Range anchorRange, RangeMap coordsFileRefToDataMap) {
RangeMap matchingRanges = TreeRangeMap.create();;
List, String>> overlaps = new ArrayList<>(
coordsFileRefToDataMap.subRangeMap(anchorRange).asMapOfRanges().entrySet());
if (overlaps.size() > 0 ) {
// find the original entries and add them
for (int idx = 0; idx < overlaps.size(); idx++) {
Map.Entry, String> overlappingEntry = coordsFileRefToDataMap.getEntry(overlaps.get(idx).getKey().lowerEndpoint());
matchingRanges.put(overlappingEntry.getKey(), overlappingEntry.getValue());
}
}
return matchingRanges;
}
// Create a RangeSet from a map of ranges.
public RangeSet getAnchorRangeSet(RangeMap anchorOverlapEntries) {
RangeSet ranges = TreeRangeSet.create();
for (Range range : anchorOverlapEntries.asMapOfRanges().keySet()) {
ranges.add(range);
}
return ranges;
}
@Override
public ImageIcon getIcon() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getButtonName() {
// TODO Auto-generated method stub
return null;
}
@Override
public String getToolTipText() {
// TODO Auto-generated method stub
return null;
}
/**
* Output of Mummer coords file
*
* @return Mummer Coords File
*/
public String coordsFile() {
return coordsFile.value();
}
/**
* Set Mummer Coords File. Output of Mummer coords file
*
*
* @param value Mummer Coords File
*
* @return this plugin
*/
public Mummer4DoonerBZStats coordsFile(String value) {
coordsFile = new PluginParameter<>(coordsFile, value);
return this;
}
/**
* Anchor Intervals file to be used when intervals are
* different than DB, e.g. when using just a small region
*
*
* @return Anchor Intervals File
*/
public String intervalsFile() {
return intervalsFile.value();
}
/**
* Set Anchor Intervals File. Anchor Intervals file to
* be used when intervals are different than DB, e.g.
* when using just a small region
*
* @param value Anchor Intervals File
*
* @return this plugin
*/
public Mummer4DoonerBZStats intervalsFile(String value) {
intervalsFile = new PluginParameter<>(intervalsFile, value);
return this;
}
/**
* Mummer parameters used
*
* @return Mummer Parameters
*/
public String mummerParams() {
return mummerParams.value();
}
/**
* Set Mummer Parameters. Mummer parameters used
*
* @param value Mummer Parameters
*
* @return this plugin
*/
public Mummer4DoonerBZStats mummerParams(String value) {
mummerParams = new PluginParameter<>(mummerParams, value);
return this;
}
/**
* Name to prefix to output results file
*
* @return Output File refix
*/
public String prefix() {
return prefix.value();
}
/**
* Set Output File refix. Name to prefix to output results
* file
*
* @param value Output File refix
*
* @return this plugin
*/
public Mummer4DoonerBZStats prefix(String value) {
prefix = new PluginParameter<>(prefix, value);
return this;
}
/**
* Output directory including trailing / for writing files
*
* @return Output Directory
*/
public String outputDir() {
return outputDir.value();
}
/**
* Set Output Directory. Output directory including trailing
* / for writing files
*
* @param value Output Directory
*
* @return this plugin
*/
public Mummer4DoonerBZStats outputDir(String value) {
outputDir = new PluginParameter<>(outputDir, value);
return this;
}
/**
* File containing lines with data for host=, user=, password=
* and DB=, DBtype= used for db connection
*
* @return DB Config File
*/
public String configFile() {
return configFile.value();
}
/**
* Set DB Config File. File containing lines with data
* for host=, user=, password= and DB=, DBtype= used for
* db connection
*
* @param value DB Config File
*
* @return this plugin
*/
public Mummer4DoonerBZStats configFile(String value) {
configFile = new PluginParameter<>(configFile, value);
return this;
}
/**
* Name of reference chromsome as stored in the database.
* This is the chromosome whose anchors will be pulled.
*
* @return Reference Chromosome Name
*/
public String refChrom() {
return refChrom.value();
}
/**
* Set Reference Chromosome Name. Name of reference chromsome
* as stored in the database. This is the chromosome
* whose anchors will be pulled.
*
* @param value Reference Chromosome Name
*
* @return this plugin
*/
public Mummer4DoonerBZStats refChrom(String value) {
refChrom = new PluginParameter<>(refChrom, value);
return this;
}
}