All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.hapCalling.FastqToKmerCountPlugin Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.hapCalling;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import com.google.common.primitives.SignedBytes;
import gnu.trove.map.TLongObjectMap;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.db_loading.GetDBConnectionPlugin;
import net.maizegenetics.plugindef.*;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.List;
import java.util.stream.Collector;
import java.util.stream.Collectors;

/**
 * Created by zrm22 on 10/23/17.
 */
@Deprecated
public class FastqToKmerCountPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(FastqToKmerCountPlugin.class);

    private PluginParameter myKmerFile = new PluginParameter.Builder<>("kmerFile", null, String.class)
            .inFile()
            .required(false)
            .description("Binary kmer map file. The plugin expects either this file or a data set created by IndexHaplotypeKmersPlugin.")
            .guiName("Kmer File")
            .build();

    private PluginParameter myReadFile = new PluginParameter.Builder<>("rawReadFile", null, String.class)
            .inFile()
            .required(true)
            .description("Raw Read file aligned to the reference")
            .build();

    private PluginParameter myExportHaplotypeFile = new PluginParameter.Builder<>("exportHaploFile", null, String.class)
            .outFile()
            .required(false)
            .description("Text file to store haplotype scoring")
            .build();

    private PluginParameter myDebugTaxon = new PluginParameter.Builder<>("debugTaxon", null, String.class)
            .required(false)
            .description("Debug taxon")
            .build();



    public FastqToKmerCountPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    @Override
    public DataSet processData(DataSet input) {
        //Input a TLongObjectMap or supply a kmer file name
        TLongObjectMap kmerMap = null;
        
        if (myKmerFile.value() != null) {
            try {
            	myLogger.info("Reading kmerMap.");
            	long start = System.currentTimeMillis();
                kmerMap = (TLongObjectMap) deserializeMapFromFile();
                myLogger.info("kmerMap read into memory in " + (System.currentTimeMillis() - start) + " ms.");
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException(myKmerFile.value() + " is not a valid kmer map file.", e);
            } catch (IOException e) {
                throw new IllegalArgumentException("Unable to read " + myKmerFile.value(), e);
            }
        } else {
            List temp = input.getDataOfType(TLongObjectMap.class);
            if(temp.size() != 1) {
                throw new IllegalArgumentException("FastqToKmerCountPlugin: processData: must input one TLongObjectMap: " + temp.size());
            }
            kmerMap = (TLongObjectMap) temp.get(0).getData();
        }

        System.out.println("kmerMapSize: " + kmerMap.keys().length);
        //Load in the read file
        FastqReader reader = new FastqReader(new File(readFile()));

        long start = System.currentTimeMillis();
        Multiset hapidCounts = processFastqFile(kmerMap,reader);
        myLogger.info("fastq file processes in " + (System.currentTimeMillis() - start) + " ms.");
        if (myExportHaplotypeFile.value() != null) exportInclusionAndExclusion(hapidCounts, myExportHaplotypeFile.value());
        return DataSet.getDataSet(hapidCounts);
    }

    private Multiset processFastqFile(TLongObjectMap kmerMap, FastqReader reader) {
        Multiset inclusionAndExclusionCounts = TreeMultiset.create();
        
        PrintWriter pw;
        try {
//			pw = new PrintWriter("/workdir/pjb39/output/B97_read_info.txt");
            pw = new PrintWriter("/Users/peterbradbury/temp/W22_read_info.txt");
		} catch (FileNotFoundException e) {
			throw new RuntimeException(e);
		}
        
        //get a db connection for diagnostic output
//      Connection dbconn = (Connection) new GetDBConnectionPlugin()
//      .configFile("/workdir/pjb39/dbconfig_fastq_kmer_count.txt")
//      .createNew(false)
//      .performFunction(null).getData(0).getData();
        //for small seq test
        Connection dbconn = (Connection) new GetDBConnectionPlugin()
                .configFile("/Users/peterbradbury/Documents/projects/phg/training/dbconfig_fastq_kmer_count.txt")
                .createNew(false)
                .performFunction(null).getData(0).getData();
        
        //loop through the read file for each sequence
        long start = System.currentTimeMillis();
        int recordCount = 0;
        for(FastqRecord currentRecord : reader) {
            //loop through each read in batches of 32bps
            //retrieve the list of haplotypeNodeIds for each 32bp kmer
            //Loop through each kmer and add up how many times a node Id is seen
            //If #kmers containing a nodeId == number of non-filtered kmers it should be counted as an inclusion
            //else do some filtering
                //Idea if #kmers containing a nodeId > 50% but not == to number of non-filtered kmers it should be an exclusion
//            List hapIdCountsForRead = extractInclusionAndExclusionFromKmers(kmerMap,currentRecord.getReadBases(), 32, 50);
            if(SignedBytes.min(currentRecord.getBaseQualities()) > 10 && currentRecord.getReadLength()>60) {
                List hapIdCountsForRead = extractInclusionExclusionFromKmers(kmerMap, currentRecord.getReadString(), 32, 1);
                for (Integer hapId : hapIdCountsForRead) {
                    inclusionAndExclusionCounts.add(hapId);
                }
                
            }
            
            saveReadKmers(readMappings(kmerMap, currentRecord.getReadString(), 32), pw, recordCount, dbconn);
            recordCount++;
        }
        
        try {
            dbconn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        
        pw.close();
        return inclusionAndExclusionCounts;
    }

    private List extractInclusionExclusionFromKmers(TLongObjectMap kmerMap, String sequence, int kmerLength, double minimumKmerCountRatio) {
//        Multiset hapIdCounts = TreeMultiset.create();
    	List hapIdList = new ArrayList<>();
        int nonDuplicateKmers = 0;
        int totalKmersExamined = 0;
        for(int i = 0; i < sequence.length() - kmerLength; i++) {
            totalKmersExamined++;
            byte[] subSequence = NucleotideAlignmentConstants.convertHaplotypeStringToAlleleByteArray(sequence.substring(i, i + kmerLength));
            long kmerEncoded = BaseEncoder.getLongSeqFromByteArray(subSequence);

            int[] kmerCounts = (int[])kmerMap.get(kmerEncoded);
            
            if (kmerCounts != null) {
                for (int hapId : kmerCounts) hapIdList.add(hapId);
                nonDuplicateKmers++;
            }
        }
        
        return hapIdList;
    }

    private void exportInclusionAndExclusion(Multiset inclusionAndExclusionSet, String outputFileName) {
        try(BufferedWriter writer = Utils.getBufferedWriter(outputFileName)) {
            //Get the list of keys to make it easier to couple exclusions and inclusions
            Set allHapIds = inclusionAndExclusionSet.elementSet().stream().map(hapId -> (hapId<0)?-1*hapId:hapId).collect(Collector.of(TreeSet::new,(set, hapId)->set.add(hapId), (set1,set2)->{set1.addAll(set2); return set1;}));

            for(Integer hapId : allHapIds ) {
                writer.write(hapId+"\t");
                writer.write(inclusionAndExclusionSet.count(hapId)+"\t");
                writer.write(inclusionAndExclusionSet.count(-1*hapId)+"\n");
            }
        }
        catch(Exception e) {
            myLogger.error("FastqToKmerCountPlugin: exportInclusionAndExclusion:" + e.toString());
            throw new IllegalStateException("FastqToKmerCountPlugin: exportInclusionAndExclusion:", e);
        }
    }

    private ReadKmers readMappings(TLongObjectMap kmerMap, String sequence, int kmerLength) {
        ReadKmers kmerstats = new ReadKmers();
        for(int i = 0; i < sequence.length() - kmerLength; i++) {
            kmerstats.kmerCount++;
            String kmerString = sequence.substring(i, i + kmerLength);
            long kmerEncoded = BaseEncoder.getLongFromSeq(kmerString);
            int[] kmerCounts = (int[])kmerMap.get(kmerEncoded);
            if (kmerCounts != null) {
                kmerstats.matchCount++;
                for (int hapid : kmerCounts) kmerstats.hapMultiset.add(hapid);
            }
        }

        return kmerstats;
    }
    
    private void saveReadKmers(ReadKmers info, PrintWriter pw, int readId, Connection db) {
        if (info.matchCount > 10) {
            
            try {
                Statement st = db.createStatement();
                for (Integer hapid : info.hapMultiset.elementSet()) {
                    //get some info from db for this hapid
                    //reference_range_id, isTarget
                    String sql = String.format("SELECT ref_range_id FROM haplotypes WHERE haplotypes_id = %d", hapid);
                    ResultSet rs = st.executeQuery(sql);
                    rs.next();
                    int refRangeId = rs.getInt(1);
                    rs.close();
                    
                    sql = String.format("SELECT line_name, haplotypes_id FROM haplotypes a, gamete_haplotypes b, gametes c, genotypes d "
                            + "WHERE a.gamete_grp_id=b.gamete_grp_id AND b.gameteid=c.gameteid AND "
                            + "c.genoid=d.genoid AND line_name = '%s' AND haplotypes_id = %d", debugTaxon(), hapid);
                   
                    rs = st.executeQuery(sql);
                    boolean isTarget = rs.next();
                    rs.close();
                    
                    pw.print(readId + "\t");
                    pw.printf("%d\t%d\t", info.kmerCount, info.matchCount);
                    pw.printf("%d\t%d\t%d\t%b%n", hapid, info.hapMultiset.count(hapid), refRangeId, isTarget);
                }
                st.close();
            } catch (SQLException e) {
                throw new RuntimeException(e);
            }
            
        }
    }

    class ReadKmers {
        int kmerCount = 0;
        int matchCount = 0;
        int haplotypeCount = 0;
        Multiset hapMultiset = HashMultiset.create();
    }
    
    private Object deserializeMapFromFile() throws IOException, ClassNotFoundException {
        FileInputStream fileOutputStream = new FileInputStream(new File(myKmerFile.value()));
        ObjectInputStream objectStream = new ObjectInputStream(fileOutputStream);
        Object obj = objectStream.readObject();
        objectStream.close();
        fileOutputStream.close();
        return obj;
    }

    @Override
    public String pluginUserManualURL() {
        //TODO
        return "https://bitbucket.org/tasseladmin/tassel-5-source/wiki/UserManual/Kinship/Missing";
    }

    @Override
    public ImageIcon getIcon() {
        URL imageURL = FastqToKmerCountPlugin.class.getResource("/net/maizegenetics/analysis/images/missing.gif");
        if (imageURL == null) {
            return null;
        } else {
            return new ImageIcon(imageURL);
        }
    }

    @Override
    public String getButtonName() {
        return "PHG Index Kmers for haplotypes";
    }

    @Override
    public String getToolTipText() {
        return "PHG Index Kmers for the haplotype graph";
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
//    public static void main(String[] args) {
//        GeneratePluginCode.generate(FastqToKmerCountPlugin.class);
//    }


    /**
     * Convenience method to run plugin with one return object.
     */
    // TODO: Replace  with specific type.
//    public  runPlugin(DataSet input) {
//        return () performFunction(input).getData(0).getData();
//    }

    /**
     * Fasta file and associated BWA indices for haplotypes
     *
     * @return Haplotype File
     */
//    public String haplotypesGenomeFile() {
//        return myHaplotypesGenomeFile.value();
//    }

    /**
     * Set Haplotype File. Fasta file and associated BWA indices
     * for haplotypes
     *
     * @param value Haplotype File
     *
     * @return this plugin
     */
//    public FastqToKmerCountPlugin haplotypesGenomeFile(String value) {
//        myHaplotypesGenomeFile = new PluginParameter<>(myHaplotypesGenomeFile, value);
//        return this;
//    }

    /**
     * Reference genome file - temporary need until we can
     * back convert coordinates
     *
     * @return Ref File
     */
//    public String refGenomeFile() {
//        return myRefGenomeFile.value();
//    }

    /**
     * Set Ref File. Reference genome file - temporary need
     * until we can back convert coordinates
     *
     * @param value Ref File
     *
     * @return this plugin
     */
//    public FastqToKmerCountPlugin refGenomeFile(String value) {
//        myRefGenomeFile = new PluginParameter<>(myRefGenomeFile, value);
//        return this;
//    }

    /**
     * Raw Read file aligned to the reference
     *
     * @return Raw Read File
     */
    public String readFile() {
        return myReadFile.value();
    }

    /**
     * Set Raw Read File. Raw Read file aligned to the reference
     *
     * @param value Raw Read File
     *
     * @return this plugin
     */
    public FastqToKmerCountPlugin readFile(String value) {
        myReadFile = new PluginParameter<>(myReadFile, value);
        return this;
    }

    /**
     * Maximum allowable error in order to count the read
     * mapping
     *
     * @return Allowed Error
     */
//    public Integer numErrorAllowed() {
//        return myNumErrorAllowed.value();
//    }

    /**
     * Set Allowed Error. Maximum allowable error in order
     * to count the read mapping
     *
     * @param value Allowed Error
     *
     * @return this plugin
     */
//    public FastqToKmerCountPlugin numErrorAllowed(Integer value) {
//        myNumErrorAllowed = new PluginParameter<>(myNumErrorAllowed, value);
//        return this;
//    }

    /**
     * Text file to store haplotype scoring
     *
     * @return Export Haplo File
     */
    public String exportHaplotypeFile() {
        return myExportHaplotypeFile.value();
    }

    /**
     * Set Export Haplo File. Text file to store haplotype
     * scoring
     *
     * @param value Export Haplo File
     *
     * @return this plugin
     */
    public FastqToKmerCountPlugin exportHaplotypeFile(String value) {
        myExportHaplotypeFile = new PluginParameter<>(myExportHaplotypeFile, value);
        return this;
    }

    /**
     * Debug taxon
     *
     * @return Debug Taxon
     */
    public String debugTaxon() {
        return myDebugTaxon.value();
    }

    /**
     * Set Debug Taxon. Debug taxon
     *
     * @param value Debug Taxon
     *
     * @return this plugin
     */
    public FastqToKmerCountPlugin debugTaxon(String value) {
        myDebugTaxon = new PluginParameter<>(myDebugTaxon, value);
        return this;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy