All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.processAssemblyGenomes.RampSeqContigToGenomeIntervalPlugin Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
/**
 * 
 */
package net.maizegenetics.pangenome.processAssemblyGenomes;

import java.awt.Frame;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * NOTE: this method created to aid Dan Ilut.  It is not official part of Assembly pipeline
 *    This method is old, obsolete and only works with sqlite3 dbs
 * 
 * This method takes a sorted/indexed BAM File or a SAM file.
 * It uses htsjdk to read the BAM file and find locations of the assembly contigs in the 
 * reference genome.  These locations are then mapped to genome_intervals (anchors or inter-anchors) in the db..
 * 
 * Genome Interval identification is based on the mid-point of the reference range where the
 * contig maps.  Use htsjdk to get the start/end of alignment from SamRecord, find the midpoint of this 
 * alignment, search the ReferenceRanges from the db to find the range (ie genome interval) where this read falls.
 * 
 * Output:
 *   A tab delimited file with the headings: 
 *      ContigNam GenomeIntervalRange GenomeIntervalId IsAnchor
 *   The file name is the BAM/SAM file name minus the extension, plus "_intervalMapping.txt", written
 *   to the specified user directory.
 *   
 *   For example:  outputDir = /workdir/lcj34/danOutput/
 *                 BAM file = /workdir/lcj34/bamFiles/w22B73Asm10.bam
 *                 output File from this plugin:  /workdir/lcj34/danOutput/w22B73Asm10_intervalMapping.txt
 *   
 * 
 * @author lcj34
 *
 */
@Deprecated
public class RampSeqContigToGenomeIntervalPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(RampSeqContigToGenomeIntervalPlugin.class);

    private PluginParameter myHostname = new PluginParameter.Builder<>("hostname", null, String.class)
            .description("Hostname where database resides")
            .build();

    private PluginParameter myUserid = new PluginParameter.Builder<>("userid", "", String.class)
            .description("Userid for database")
            .build();

    private PluginParameter myPassword = new PluginParameter.Builder<>("password", "", String.class)
            .password()
            .description("Password for database")
            .build();

    private PluginParameter myDatabaseName = new PluginParameter.Builder<>("databaseName", null, String.class)
            .required(true)
            .description("Database name")
            .build();

    private PluginParameter myContigFile = new PluginParameter.Builder<>("contigBAM", null, String.class)
            .required(true)
            .description("Name of contig BAM file to process")
            .build();
    
    private PluginParameter myOutputDir = new PluginParameter.Builder<>("outputDir", null, String.class)
            .required(true)
            .description("Output Directory including trailing / ")
            .build();
    
    public RampSeqContigToGenomeIntervalPlugin() {
        super(null, false);
    }

    public RampSeqContigToGenomeIntervalPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public RampSeqContigToGenomeIntervalPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
//    public static void main(String[] args) {
//        GeneratePluginCode.generate(RampSeqContigToGenomeIntervalPlugin.class);
//    }
    
    /**
     * @param args
     */
    public static void main(String[] args) {
       String contigBam = "/Users/lcj34/temp/minimap2Alignment/w22B73Asm10.sam";
       String db = "/Users/lcj34/notes_files/repgen/wgs_pipeline/agpv4_phg_dbs/august2017_refAnchorInteranchor.db";
       String outputDir = "/Users/lcj34/notes_files/repgen/wgs_pipeline/septOct_ramuStats/danOutputFiles/";
        new RampSeqContigToGenomeIntervalPlugin()
            .contigBAM(contigBam)
            .databaseName(db)
            .outputDir(outputDir)
            .processData(null);
         
        System.out.println("\nFinished!!");
    }
    
    @Override
    public DataSet processData(DataSet input) {
        System.out.println("Begin processing ...");
        long totalTime = System.nanoTime();
        try (Connection conn = CreateGraphUtils.connection(hostname(), userid(), password(), databaseName())) {
            Map refRangeMap = CreateGraphUtils.referenceRangeMap( conn);
            // Need RangeMap for finding positions
            RangeMap intervalMap = createRefRangePositionMap(refRangeMap);
            
            // Read the sam file
            System.out.println("Got ranges, start reading BAM file");
            SamReader bamReader = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(new File(contigBAM()));
            SAMSequenceDictionary dict=bamReader.getFileHeader().getSequenceDictionary();
            if(dict==null) throw new RuntimeException("Sequence dictionary missing");
            SAMRecordIterator iter=bamReader.iterator();

            Map,Integer>> contigToGenomeIntervalMap = new HashMap,Integer>>();
            int count = 0;
            int processed = 0;

            Set contigNameSet = new HashSet();
            while(iter.hasNext()) {

                count++;
                SAMRecord rec=iter.next();
     
                String contigName = rec.getReadName();
                contigNameSet.add(contigName);
                // SKip unmapped and secondary reads
                if(rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary()) {
                    // Avoid overwriting a valid entry with NULL based on a supplementary entry
                    if (!contigToGenomeIntervalMap.containsKey(contigName)) {
                        contigToGenomeIntervalMap.put(contigName, null);
                    }               
                    continue;
                }
                
                // Process remaining reads.  Find the range relating to the
                // reference of this read.  Pass that into findGenomeInterval()
                // Store result on a map. 
                String refChrom = rec.getReferenceName();
                int alignmentStart = rec.getAlignmentStart();
                int alignmentEnd = rec.getAlignmentEnd();
                 
                int midAlignment = alignmentStart + ((alignmentEnd-alignmentStart)/2); // WILl this always be positive?  based on forward strand?
//                System.out.println("\nAlignmentStart: " + alignmentStart + " alignmentEnd " + alignmentEnd + " midpoint " + midAlignment);
//                System.out.println("contigName-refName:startAlign:endAlign " + contigName + " " + 
//                    refChrom + ":" + alignmentStart + ":" + alignmentEnd);  
                Range midIntervalRange= Range.closed(Position.of(refChrom, midAlignment), Position.of(refChrom, midAlignment));
                Tuple,Integer> refInterval = findGenomeInterval(intervalMap, midIntervalRange);
     
                if (refInterval == null && contigToGenomeIntervalMap.containsKey(contigName)) {
                    // do nothing - don't want to overwrite a potentially good value.
                } else {
                    // as long as value is not null, is ok to overwrite. There should be only
                    // 1 good mapping.
                    contigToGenomeIntervalMap.put(contigName, refInterval);
                }
                
                processed++;    
            }
            
            System.out.println("\nNum Sam records: " + count + ", num GOOD reads processed: " 
                    + processed + ", size of contigToGEnomeIntervalMap: " + contigToGenomeIntervalMap.keySet().size()
                    + ", size of contigNameSet: " + contigNameSet.size());
            printContigMapping(contigToGenomeIntervalMap, refRangeMap);
            
            iter.close();    
            myLogger.info("Finished processing, total time " + (System.nanoTime()-totalTime)/1e9 + " seconds");
            
        } catch (Exception exc) {
            myLogger.error("error processing input file " + contigBAM() + " " + exc.getMessage());
        }
        return null;
    }

    // takes a map of Integer/position and transforms into a RangeMap to be used
    // for finding a range where the alignement overlaps.
    private RangeMap  createRefRangePositionMap(Map refRangeMap){
        RangeMap intervalMap = TreeRangeMap.create();
        
        refRangeMap.entrySet().stream().forEach(entry -> {
            ReferenceRange refRange = entry.getValue();
            intervalMap.put(Range.closed(Position.of(refRange.chromosome().getName(), refRange.start()),
                                         Position.of(refRange.chromosome().getName(), refRange.end())),entry.getKey());
        });
        return intervalMap;
    }
    
    // Method takes a range map of positions and a range.  Find the map entry which encompasses
    // the position, or return NULL if position is not included in the map.
    private  Tuple,Integer> findGenomeInterval(RangeMap intervalRangeMap, Range range) {
        // look for an entry in the intervalRangeMap into which the specified Range falls
        List, Integer>> overlaps = new ArrayList<>(
                intervalRangeMap.subRangeMap(range).asMapOfRanges().entrySet());
        // if an overlap is found, record the genome interval for this contig
        if (overlaps.size() != 0) {            
            Map.Entry, Integer> overlappingEntry = intervalRangeMap.getEntry(overlaps.get(0).getKey().lowerEndpoint());
            return new Tuple,Integer> (overlappingEntry.getKey(),overlappingEntry.getValue()); 
        }
        else {
            return null; // range falls outside of overlap
        }
    }

    private String getOutputFileName() {
        String shortName = contigBAM().substring(contigBAM().lastIndexOf("/")+1);
        shortName = shortName.substring(0, shortName.lastIndexOf("."));
        String outputFileName = outputDir() + shortName + "_intervalMapping.txt";
        return outputFileName;
    }
    
    // Prints to user specified output file
    private void printContigMapping(Map,Integer>> contigToGenomeIntervalMap,
            Map refRangeMap) {
        String headerLine = "ContigName\tGenomeIntervalRange\tGenomeIntervalId\tIsAnchor\n";
        String outputFile = getOutputFileName();
        
        try (BufferedWriter bw = Utils.getBufferedWriter(outputFile)) {
            bw.write(headerLine);
            for (Map.Entry,Integer>> entry : contigToGenomeIntervalMap.entrySet()) {
                StringBuilder fileLineSB = new StringBuilder();
                if (entry.getValue() == null) {                    
                    fileLineSB.append(entry.getKey()).append("\tunmapped\t0\tunmapped\n");
                                                       
                } else {
                    Range intervalPositionRange = entry.getValue().getX();
                    Set methods = refRangeMap.get(entry.getValue().getY()).groupMethods();
                    StringBuilder rangeSB = new StringBuilder()
                            .append(intervalPositionRange.lowerEndpoint().getChromosome().getName()).append(":")
                            .append(intervalPositionRange.lowerEndpoint().getPosition()).append(":")
                            .append(intervalPositionRange.upperEndpoint().getPosition());                                        
                    fileLineSB.append(entry.getKey()).append("\t") // contig name
                            .append(rangeSB.toString()).append("\t") // interval specifics
                            .append(entry.getValue().getY()).append("\t") // genome_interval_id (anchor/interanchorid)
                            .append(methods.toString()).append("\n");
                }
 
                bw.write(fileLineSB.toString());
            }
        } catch (IOException ioe) {
            throw new IllegalArgumentException("Error writing contigMapping file to output dir " + outputDir());
        }
    }
    
    @Override
    public ImageIcon getIcon() {       
        return null;
    }

    @Override
    public String getButtonName() {
        
        return ("RampSeq contig to Genome Interval");
    }

    @Override
    public String getToolTipText() {
        return ("RampSeq contig to Gdnome Interval");
    }
    
    /**
     * Hostname where database resides
     *
     * @return Hostname
     */
    public String hostname() {
        return myHostname.value();
    }

    /**
     * Set Hostname. Hostname where database resides
     *
     * @param value Hostname
     *
     * @return this plugin
     */
    public RampSeqContigToGenomeIntervalPlugin hostname(String value) {
        myHostname = new PluginParameter<>(myHostname, value);
        return this;
    }

    /**
     * Userid for database
     *
     * @return Userid
     */
    public String userid() {
        return myUserid.value();
    }

    /**
     * Set Userid. Userid for database
     *
     * @param value Userid
     *
     * @return this plugin
     */
    public RampSeqContigToGenomeIntervalPlugin userid(String value) {
        myUserid = new PluginParameter<>(myUserid, value);
        return this;
    }

    /**
     * Password for database
     *
     * @return Password
     */
    public String password() {
        return myPassword.value();
    }

    /**
     * Set Password. Password for database
     *
     * @param value Password
     *
     * @return this plugin
     */
    public RampSeqContigToGenomeIntervalPlugin password(String value) {
        myPassword = new PluginParameter<>(myPassword, value);
        return this;
    }

    /**
     * Database name
     *
     * @return Database Name
     */
    public String databaseName() {
        return myDatabaseName.value();
    }

    /**
     * Set Database Name. Database name
     *
     * @param value Database Name
     *
     * @return this plugin
     */
    public RampSeqContigToGenomeIntervalPlugin databaseName(String value) {
        myDatabaseName = new PluginParameter<>(myDatabaseName, value);
        return this;
    }

    /**
     * Name of contig BAM file to process
     *
     * @return Contig File
     */
    public String contigBAM() {
        return myContigFile.value();
    }

    /**
     * Set Contig File. Name of contig fasta file to process
     *
     * @param value Contig File
     *
     * @return this plugin
     */
    public RampSeqContigToGenomeIntervalPlugin contigBAM(String value) {
        myContigFile = new PluginParameter<>(myContigFile, value);
        return this;
    }

    /**
     * Output Directory
     *
     * @return Output Directory
     */
    public String outputDir() {
        return myOutputDir.value();
    }

    /**
     * Set Output Directory. Output Directory
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public RampSeqContigToGenomeIntervalPlugin outputDir(String value) {
        myOutputDir = new PluginParameter<>(myOutputDir, value);
        return this;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy