net.maizegenetics.pangenome.processAssemblyGenomes.RampSeqContigToGenomeIntervalPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.awt.Frame;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
* NOTE: this method created to aid Dan Ilut. It is not official part of Assembly pipeline
* This method is old, obsolete and only works with sqlite3 dbs
*
* This method takes a sorted/indexed BAM File or a SAM file.
* It uses htsjdk to read the BAM file and find locations of the assembly contigs in the
* reference genome. These locations are then mapped to genome_intervals (anchors or inter-anchors) in the db..
*
* Genome Interval identification is based on the mid-point of the reference range where the
* contig maps. Use htsjdk to get the start/end of alignment from SamRecord, find the midpoint of this
* alignment, search the ReferenceRanges from the db to find the range (ie genome interval) where this read falls.
*
* Output:
* A tab delimited file with the headings:
* ContigNam GenomeIntervalRange GenomeIntervalId IsAnchor
* The file name is the BAM/SAM file name minus the extension, plus "_intervalMapping.txt", written
* to the specified user directory.
*
* For example: outputDir = /workdir/lcj34/danOutput/
* BAM file = /workdir/lcj34/bamFiles/w22B73Asm10.bam
* output File from this plugin: /workdir/lcj34/danOutput/w22B73Asm10_intervalMapping.txt
*
*
* @author lcj34
*
*/
@Deprecated
public class RampSeqContigToGenomeIntervalPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(RampSeqContigToGenomeIntervalPlugin.class);
private PluginParameter myHostname = new PluginParameter.Builder<>("hostname", null, String.class)
.description("Hostname where database resides")
.build();
private PluginParameter myUserid = new PluginParameter.Builder<>("userid", "", String.class)
.description("Userid for database")
.build();
private PluginParameter myPassword = new PluginParameter.Builder<>("password", "", String.class)
.password()
.description("Password for database")
.build();
private PluginParameter myDatabaseName = new PluginParameter.Builder<>("databaseName", null, String.class)
.required(true)
.description("Database name")
.build();
private PluginParameter myContigFile = new PluginParameter.Builder<>("contigBAM", null, String.class)
.required(true)
.description("Name of contig BAM file to process")
.build();
private PluginParameter myOutputDir = new PluginParameter.Builder<>("outputDir", null, String.class)
.required(true)
.description("Output Directory including trailing / ")
.build();
public RampSeqContigToGenomeIntervalPlugin() {
super(null, false);
}
public RampSeqContigToGenomeIntervalPlugin(Frame parentFrame) {
super(parentFrame, false);
}
public RampSeqContigToGenomeIntervalPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
// public static void main(String[] args) {
// GeneratePluginCode.generate(RampSeqContigToGenomeIntervalPlugin.class);
// }
/**
* @param args
*/
public static void main(String[] args) {
String contigBam = "/Users/lcj34/temp/minimap2Alignment/w22B73Asm10.sam";
String db = "/Users/lcj34/notes_files/repgen/wgs_pipeline/agpv4_phg_dbs/august2017_refAnchorInteranchor.db";
String outputDir = "/Users/lcj34/notes_files/repgen/wgs_pipeline/septOct_ramuStats/danOutputFiles/";
new RampSeqContigToGenomeIntervalPlugin()
.contigBAM(contigBam)
.databaseName(db)
.outputDir(outputDir)
.processData(null);
System.out.println("\nFinished!!");
}
@Override
public DataSet processData(DataSet input) {
System.out.println("Begin processing ...");
long totalTime = System.nanoTime();
try (Connection conn = CreateGraphUtils.connection(hostname(), userid(), password(), databaseName())) {
Map refRangeMap = CreateGraphUtils.referenceRangeMap( conn);
// Need RangeMap for finding positions
RangeMap intervalMap = createRefRangePositionMap(refRangeMap);
// Read the sam file
System.out.println("Got ranges, start reading BAM file");
SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(new File(contigBAM()));
SAMSequenceDictionary dict=bamReader.getFileHeader().getSequenceDictionary();
if(dict==null) throw new RuntimeException("Sequence dictionary missing");
SAMRecordIterator iter=bamReader.iterator();
Map,Integer>> contigToGenomeIntervalMap = new HashMap,Integer>>();
int count = 0;
int processed = 0;
Set contigNameSet = new HashSet();
while(iter.hasNext()) {
count++;
SAMRecord rec=iter.next();
String contigName = rec.getReadName();
contigNameSet.add(contigName);
// SKip unmapped and secondary reads
if(rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary()) {
// Avoid overwriting a valid entry with NULL based on a supplementary entry
if (!contigToGenomeIntervalMap.containsKey(contigName)) {
contigToGenomeIntervalMap.put(contigName, null);
}
continue;
}
// Process remaining reads. Find the range relating to the
// reference of this read. Pass that into findGenomeInterval()
// Store result on a map.
String refChrom = rec.getReferenceName();
int alignmentStart = rec.getAlignmentStart();
int alignmentEnd = rec.getAlignmentEnd();
int midAlignment = alignmentStart + ((alignmentEnd-alignmentStart)/2); // WILl this always be positive? based on forward strand?
// System.out.println("\nAlignmentStart: " + alignmentStart + " alignmentEnd " + alignmentEnd + " midpoint " + midAlignment);
// System.out.println("contigName-refName:startAlign:endAlign " + contigName + " " +
// refChrom + ":" + alignmentStart + ":" + alignmentEnd);
Range midIntervalRange= Range.closed(Position.of(refChrom, midAlignment), Position.of(refChrom, midAlignment));
Tuple,Integer> refInterval = findGenomeInterval(intervalMap, midIntervalRange);
if (refInterval == null && contigToGenomeIntervalMap.containsKey(contigName)) {
// do nothing - don't want to overwrite a potentially good value.
} else {
// as long as value is not null, is ok to overwrite. There should be only
// 1 good mapping.
contigToGenomeIntervalMap.put(contigName, refInterval);
}
processed++;
}
System.out.println("\nNum Sam records: " + count + ", num GOOD reads processed: "
+ processed + ", size of contigToGEnomeIntervalMap: " + contigToGenomeIntervalMap.keySet().size()
+ ", size of contigNameSet: " + contigNameSet.size());
printContigMapping(contigToGenomeIntervalMap, refRangeMap);
iter.close();
myLogger.info("Finished processing, total time " + (System.nanoTime()-totalTime)/1e9 + " seconds");
} catch (Exception exc) {
myLogger.error("error processing input file " + contigBAM() + " " + exc.getMessage());
}
return null;
}
// takes a map of Integer/position and transforms into a RangeMap to be used
// for finding a range where the alignement overlaps.
private RangeMap createRefRangePositionMap(Map refRangeMap){
RangeMap intervalMap = TreeRangeMap.create();
refRangeMap.entrySet().stream().forEach(entry -> {
ReferenceRange refRange = entry.getValue();
intervalMap.put(Range.closed(Position.of(refRange.chromosome().getName(), refRange.start()),
Position.of(refRange.chromosome().getName(), refRange.end())),entry.getKey());
});
return intervalMap;
}
// Method takes a range map of positions and a range. Find the map entry which encompasses
// the position, or return NULL if position is not included in the map.
private Tuple,Integer> findGenomeInterval(RangeMap intervalRangeMap, Range range) {
// look for an entry in the intervalRangeMap into which the specified Range falls
List, Integer>> overlaps = new ArrayList<>(
intervalRangeMap.subRangeMap(range).asMapOfRanges().entrySet());
// if an overlap is found, record the genome interval for this contig
if (overlaps.size() != 0) {
Map.Entry, Integer> overlappingEntry = intervalRangeMap.getEntry(overlaps.get(0).getKey().lowerEndpoint());
return new Tuple,Integer> (overlappingEntry.getKey(),overlappingEntry.getValue());
}
else {
return null; // range falls outside of overlap
}
}
private String getOutputFileName() {
String shortName = contigBAM().substring(contigBAM().lastIndexOf("/")+1);
shortName = shortName.substring(0, shortName.lastIndexOf("."));
String outputFileName = outputDir() + shortName + "_intervalMapping.txt";
return outputFileName;
}
// Prints to user specified output file
private void printContigMapping(Map,Integer>> contigToGenomeIntervalMap,
Map refRangeMap) {
String headerLine = "ContigName\tGenomeIntervalRange\tGenomeIntervalId\tIsAnchor\n";
String outputFile = getOutputFileName();
try (BufferedWriter bw = Utils.getBufferedWriter(outputFile)) {
bw.write(headerLine);
for (Map.Entry,Integer>> entry : contigToGenomeIntervalMap.entrySet()) {
StringBuilder fileLineSB = new StringBuilder();
if (entry.getValue() == null) {
fileLineSB.append(entry.getKey()).append("\tunmapped\t0\tunmapped\n");
} else {
Range intervalPositionRange = entry.getValue().getX();
Set methods = refRangeMap.get(entry.getValue().getY()).groupMethods();
StringBuilder rangeSB = new StringBuilder()
.append(intervalPositionRange.lowerEndpoint().getChromosome().getName()).append(":")
.append(intervalPositionRange.lowerEndpoint().getPosition()).append(":")
.append(intervalPositionRange.upperEndpoint().getPosition());
fileLineSB.append(entry.getKey()).append("\t") // contig name
.append(rangeSB.toString()).append("\t") // interval specifics
.append(entry.getValue().getY()).append("\t") // genome_interval_id (anchor/interanchorid)
.append(methods.toString()).append("\n");
}
bw.write(fileLineSB.toString());
}
} catch (IOException ioe) {
throw new IllegalArgumentException("Error writing contigMapping file to output dir " + outputDir());
}
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return ("RampSeq contig to Genome Interval");
}
@Override
public String getToolTipText() {
return ("RampSeq contig to Gdnome Interval");
}
/**
* Hostname where database resides
*
* @return Hostname
*/
public String hostname() {
return myHostname.value();
}
/**
* Set Hostname. Hostname where database resides
*
* @param value Hostname
*
* @return this plugin
*/
public RampSeqContigToGenomeIntervalPlugin hostname(String value) {
myHostname = new PluginParameter<>(myHostname, value);
return this;
}
/**
* Userid for database
*
* @return Userid
*/
public String userid() {
return myUserid.value();
}
/**
* Set Userid. Userid for database
*
* @param value Userid
*
* @return this plugin
*/
public RampSeqContigToGenomeIntervalPlugin userid(String value) {
myUserid = new PluginParameter<>(myUserid, value);
return this;
}
/**
* Password for database
*
* @return Password
*/
public String password() {
return myPassword.value();
}
/**
* Set Password. Password for database
*
* @param value Password
*
* @return this plugin
*/
public RampSeqContigToGenomeIntervalPlugin password(String value) {
myPassword = new PluginParameter<>(myPassword, value);
return this;
}
/**
* Database name
*
* @return Database Name
*/
public String databaseName() {
return myDatabaseName.value();
}
/**
* Set Database Name. Database name
*
* @param value Database Name
*
* @return this plugin
*/
public RampSeqContigToGenomeIntervalPlugin databaseName(String value) {
myDatabaseName = new PluginParameter<>(myDatabaseName, value);
return this;
}
/**
* Name of contig BAM file to process
*
* @return Contig File
*/
public String contigBAM() {
return myContigFile.value();
}
/**
* Set Contig File. Name of contig fasta file to process
*
* @param value Contig File
*
* @return this plugin
*/
public RampSeqContigToGenomeIntervalPlugin contigBAM(String value) {
myContigFile = new PluginParameter<>(myContigFile, value);
return this;
}
/**
* Output Directory
*
* @return Output Directory
*/
public String outputDir() {
return myOutputDir.value();
}
/**
* Set Output Directory. Output Directory
*
* @param value Output Directory
*
* @return this plugin
*/
public RampSeqContigToGenomeIntervalPlugin outputDir(String value) {
myOutputDir = new PluginParameter<>(myOutputDir, value);
return this;
}
}