net.maizegenetics.pangenome.pipelineTests.CountNsInRawHapSequencesPlugin Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
/**
 * 
 */
package net.maizegenetics.pangenome.pipelineTests;

import java.awt.Frame;
import java.io.BufferedWriter;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;

/**
 * 
 * NOTE:  THis test is obsolete - uses old schema and old connection method.
 * 
 * Calculates the percentage of N's for each "raw" haplotype (ie, not consensus)
 * for each genome interval (anchor, not inter-anchors).  Results shoudl be printed
 * to a tab-delimited file with genome_interval_id as rows, and taxa/percentage as the
 * columns.
 * 
 * Testing:  Testing was done by random manual checking.  Once the output file is created, it is
 * uploaded to excel.  use sqlite3  on machine holding db.  Look at sequences and
 * verify when it has "1" as percent, there are all N's.  WHen 0, there are 0.  When 
 * a percentage, e.g. .53, the sequence looks to be half N's.
 * 
 * Find gamete_grp_id for the line you want to check:
 * sqlite> SELECT gamete_haplotypes.gamete_grp_id FROM gamete_haplotypes 
 * sqlite> INNER JOIN gametes ON gametes.gameteid=gamete_haplotypes.gameteid 
 * sqlite> INNER JOIN genotypes on gametes.genoid=genotypes.genoid
 * sqlite> WHERE genotypes.line_name='CML247';
 * sqlite> 5   (this is returned value)
 * 
 * Then look at the sequence:
 * sqlite> select gamete_grp_id, genome_interval_id,sequence from haplotypes where gamete_grp_id=5 and genome_interval_id=111;
 * 
 * @author lcj34
 *
 */
@Deprecated
public class CountNsInRawHapSequencesPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(CountNsInRawHapSequencesPlugin.class);

    private PluginParameter myHostname = new PluginParameter.Builder<>("hostname", null, String.class)
            .description("Hostname where database resides")
            .build();

    private PluginParameter myUserid = new PluginParameter.Builder<>("userid", "", String.class)
            .description("Userid for database")
            .build();

    private PluginParameter myPassword = new PluginParameter.Builder<>("password", "", String.class)
            .password()
            .description("Password for database")
            .build();

    private PluginParameter myDatabaseName = new PluginParameter.Builder<>("databaseName", null, String.class)
            .required(true)
            .description("Database name")
            .build();
    
    private PluginParameter myOutputFile = new PluginParameter.Builder<>("outputFile", null, String.class)
            .outFile()
            .required(true)
            .description("Output File")
            .build();
    
    public CountNsInRawHapSequencesPlugin() {
        super(null, false);
    }

    public CountNsInRawHapSequencesPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public CountNsInRawHapSequencesPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
    @Override
    public DataSet processData(DataSet input) {
        
        long totalTime = System.nanoTime();
        myLogger.info("Creating connection to db " + databaseName());
        
        try (Connection conn = CreateGraphUtils.connection(hostname(), userid(), password(), databaseName())) {

            myLogger.info("create taxaListMap and reference ranges");
            Map taxaListMap = CreateGraphUtils.taxaListMap(conn);
            
           
            // Map of genome_interval_id, Tuple
            Multimap> intervalIdTaxonPercentNMap =  HashMultimap.create();
            
            // For each gameteid with a single taxon associated, grab the sequence
            // for each anchor interval (not inter-anchors, genome_interval_id < 37805)
            // Calculate the percentage of N's in each sequence, store to map for this haplotype
            int hapCount = 0;
            for (Map.Entry entry : taxaListMap.entrySet()) {
                if (entry.getValue().size() > 1) continue; // skip consensus - only want raw haplotypes
                hapCount++;
                int gamete_grp_id=entry.getKey();
                Taxon taxon = entry.getValue().get(0);
                StringBuilder sb = new StringBuilder();
                sb.append("SELECT haplotypes.genome_interval_id, haplotypes.sequence from haplotypes ");
                sb.append("WHERE gamete_grp_id=");
                sb.append(gamete_grp_id);
                sb.append(" AND haplotypes.genome_interval_id < 37805"); // last anchor is 37804, ignore interanchors
                sb.append(" ORDER BY haplotypes.genome_interval_id;");
                
                String query = sb.toString();
                myLogger.info("querying data for gamete_grp_id " + gamete_grp_id);
                
                ResultSet rs = conn.createStatement().executeQuery(query);
                
                Map anchorToSequenceMap = new HashMap();
                while (rs.next()) {
                    anchorToSequenceMap.put(rs.getInt("genome_interval_id"), rs.getString("sequence"));
                }
                // >
                Map  hapPercentNCountMap = countSequenceNs(anchorToSequenceMap);
                
                // Add results for this taxon/haplotype to genomeIntervalMap.  This map gathers
                // the information for all haplotypes in a manner easily printed to a file.
                addToIntervalMap(intervalIdTaxonPercentNMap,hapPercentNCountMap, taxon.getName());
                
            }
            
            myLogger.info("Processed " + hapCount + " haplotypes for each interval.");
            
            printHaplotypeResults(intervalIdTaxonPercentNMap, conn);
            System.out.println("Number haplotypes processed " + hapCount);
            
        } catch (Exception exc) {
            throw new IllegalStateException("Error processing CountNsInRawHapSequencesPlugin: " + exc.getMessage());
        }
        
        myLogger.info("Finished processing in " + (System.nanoTime()-totalTime)/1e9 + " seconds");
        return null;
    }
    
    // Method takes map of anchorid to sequence, returns map of 
    // genome_interval_id/percentN
    private Map countSequenceNs(Map anchorToSequenceMap) {
        Map anchorToNMap = new HashMap();
        for (Map.Entry entry : anchorToSequenceMap.entrySet()) {
            int nCount = (int)getNCount(entry.getValue());
            double nPercent = (double)nCount/entry.getValue().length();
            anchorToNMap.put(entry.getKey(), nPercent);
        }
        
        return anchorToNMap;
    }
    
    // Find number of N's in the given sequence string.
    private long getNCount(String seq) {
        long count = seq.chars().filter(allele -> allele=='N').count();
        return count;
    }
    
    private void addToIntervalMap(Multimap> intervalIdTaxonPercentNMap,
            Map  idToNCountMap, String haplotype) {
        
        for (Map.Entry entry: idToNCountMap.entrySet()) {
            // for each entry on the haplotype's percentN map, add data to the global
            // genome interval percent N map.
            intervalIdTaxonPercentNMap.put(entry.getKey(), new Tuple(haplotype,entry.getValue()));
        }
    }
    
    // This multimap needs the haplotypes ordered - must always print
    // in same order on line
    private void printHaplotypeResults(Multimap> intervalIdTaxonPercentNMap,
            Connection conn){
        
        myLogger.info("Begin printHaplotypeResults");
        
        // refRange map to be used for printing intervals
        Map refRangeMap = CreateGraphUtils.referenceRangeMap( conn);
        
        // Get ordered list of haplotypes for header line
        Collection> haplotypes = intervalIdTaxonPercentNMap.asMap().get(1); // any element
        List> haplotypeList = new ArrayList>(haplotypes);
        Collections.sort(haplotypeList, (Tuple first, Tuple second) -> first.x.compareTo(second.x));
        
        
        // Create file header line
        StringBuilder headerLine = new StringBuilder().append("GenomeIntervalID")
                .append("\t").append("ReferenceRangeInterval");
        for (Tuple haplotype : haplotypeList) {
            headerLine.append("\t").append(haplotype.getX());
        }
        headerLine.append("\n");
        
        try (BufferedWriter writer = Utils.getBufferedWriter(outputFile())) {
            writer.write(headerLine.toString());
            
            // Add data for each row
            StringBuilder sb = new StringBuilder();
            intervalIdTaxonPercentNMap.asMap().entrySet().stream().forEach(entry -> {              
                
                int genomeIntervalId = entry.getKey();
                Collection> taxaPercents = entry.getValue();
                
                String fileLine = createFileEntry(genomeIntervalId, refRangeMap,taxaPercents);
                sb.append(fileLine); // values were sorted in createFileEntry()
            });
            writer.write(sb.toString());

            
        } catch (Exception exc) {
            myLogger.error("Error writing file " + outputFile() + " " + exc.getMessage());
            throw new IllegalStateException("Error writing output file " + exc.getMessage());
        }
    }
    
    private String createFileEntry(int genomeInterval, Map refRangeMap, Collection> taxaPercents) {
        // Sort the list so values are consistently printed to correct columns
        List> percentList = new ArrayList>(taxaPercents);
        Collections.sort(percentList, (Tuple first, Tuple second) -> first.x.compareTo(second.x));
       
        String refRangeInterval = refRangeMap.get(genomeInterval).intervalString();
        StringBuilder sb = new StringBuilder().append(genomeInterval)
                .append("\t").append(refRangeInterval);
        for (Tuple taxaEntry : percentList) {
            // Add the percentN value for each taxa/haplotype
            sb.append("\t").append(taxaEntry.getY());
        }
        sb.append("\n");
        
        return sb.toString();
    }
    
    /**
     * @param args
     */
//    public static void main(String[] args) {
//        // TODO Auto-generated method stub
//
//    }

    public static void main(String[] args) {
        GeneratePluginCode.generate(CountNsInRawHapSequencesPlugin.class);
    }
    @Override
    public ImageIcon getIcon() {        
        return null;
    }

    @Override
    public String getButtonName() {        
        return ("Count Haplotype Sequence Ns");
    }

    @Override
    public String getToolTipText() {       
        return ("Count Haplotype Sequence Ns");
    }

    /**
     * Hostname where database resides
     *
     * @return Hostname
     */
    public String hostname() {
        return myHostname.value();
    }

    /**
     * Set Hostname. Hostname where database resides
     *
     * @param value Hostname
     *
     * @return this plugin
     */
    public CountNsInRawHapSequencesPlugin hostname(String value) {
        myHostname = new PluginParameter<>(myHostname, value);
        return this;
    }

    /**
     * Userid for database
     *
     * @return Userid
     */
    public String userid() {
        return myUserid.value();
    }

    /**
     * Set Userid. Userid for database
     *
     * @param value Userid
     *
     * @return this plugin
     */
    public CountNsInRawHapSequencesPlugin userid(String value) {
        myUserid = new PluginParameter<>(myUserid, value);
        return this;
    }

    /**
     * Password for database
     *
     * @return Password
     */
    public String password() {
        return myPassword.value();
    }

    /**
     * Set Password. Password for database
     *
     * @param value Password
     *
     * @return this plugin
     */
    public CountNsInRawHapSequencesPlugin password(String value) {
        myPassword = new PluginParameter<>(myPassword, value);
        return this;
    }

    /**
     * Database name
     *
     * @return Database Name
     */
    public String databaseName() {
        return myDatabaseName.value();
    }

    /**
     * Set Database Name. Database name
     *
     * @param value Database Name
     *
     * @return this plugin
     */
    public CountNsInRawHapSequencesPlugin databaseName(String value) {
        myDatabaseName = new PluginParameter<>(myDatabaseName, value);
        return this;
    }

    /**
     * Output File
     *
     * @return Output File
     */
    public String outputFile() {
        return myOutputFile.value();
    }

    /**
     * Set Output File. Output File
     *
     * @param value Output File
     *
     * @return this plugin
     */
    public CountNsInRawHapSequencesPlugin outputFile(String value) {
        myOutputFile = new PluginParameter<>(myOutputFile, value);
        return this;
    }
}