net.maizegenetics.pangenome.hapCalling.FastqToKmerCountPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapCalling;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.TreeMultiset;
import com.google.common.primitives.SignedBytes;
import gnu.trove.map.TLongObjectMap;
import htsjdk.samtools.fastq.FastqReader;
import htsjdk.samtools.fastq.FastqRecord;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeGraph;
import net.maizegenetics.pangenome.db_loading.GetDBConnectionPlugin;
import net.maizegenetics.plugindef.*;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.List;
import java.util.stream.Collector;
import java.util.stream.Collectors;
/**
* Created by zrm22 on 10/23/17.
*/
@Deprecated
public class FastqToKmerCountPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(FastqToKmerCountPlugin.class);
private PluginParameter myKmerFile = new PluginParameter.Builder<>("kmerFile", null, String.class)
.inFile()
.required(false)
.description("Binary kmer map file. The plugin expects either this file or a data set created by IndexHaplotypeKmersPlugin.")
.guiName("Kmer File")
.build();
private PluginParameter myReadFile = new PluginParameter.Builder<>("rawReadFile", null, String.class)
.inFile()
.required(true)
.description("Raw Read file aligned to the reference")
.build();
private PluginParameter myExportHaplotypeFile = new PluginParameter.Builder<>("exportHaploFile", null, String.class)
.outFile()
.required(false)
.description("Text file to store haplotype scoring")
.build();
private PluginParameter myDebugTaxon = new PluginParameter.Builder<>("debugTaxon", null, String.class)
.required(false)
.description("Debug taxon")
.build();
public FastqToKmerCountPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
//Input a TLongObjectMap or supply a kmer file name
TLongObjectMap kmerMap = null;
if (myKmerFile.value() != null) {
try {
myLogger.info("Reading kmerMap.");
long start = System.currentTimeMillis();
kmerMap = (TLongObjectMap) deserializeMapFromFile();
myLogger.info("kmerMap read into memory in " + (System.currentTimeMillis() - start) + " ms.");
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException(myKmerFile.value() + " is not a valid kmer map file.", e);
} catch (IOException e) {
throw new IllegalArgumentException("Unable to read " + myKmerFile.value(), e);
}
} else {
List temp = input.getDataOfType(TLongObjectMap.class);
if(temp.size() != 1) {
throw new IllegalArgumentException("FastqToKmerCountPlugin: processData: must input one TLongObjectMap: " + temp.size());
}
kmerMap = (TLongObjectMap) temp.get(0).getData();
}
System.out.println("kmerMapSize: " + kmerMap.keys().length);
//Load in the read file
FastqReader reader = new FastqReader(new File(readFile()));
long start = System.currentTimeMillis();
Multiset hapidCounts = processFastqFile(kmerMap,reader);
myLogger.info("fastq file processes in " + (System.currentTimeMillis() - start) + " ms.");
if (myExportHaplotypeFile.value() != null) exportInclusionAndExclusion(hapidCounts, myExportHaplotypeFile.value());
return DataSet.getDataSet(hapidCounts);
}
private Multiset processFastqFile(TLongObjectMap kmerMap, FastqReader reader) {
Multiset inclusionAndExclusionCounts = TreeMultiset.create();
PrintWriter pw;
try {
// pw = new PrintWriter("/workdir/pjb39/output/B97_read_info.txt");
pw = new PrintWriter("/Users/peterbradbury/temp/W22_read_info.txt");
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
}
//get a db connection for diagnostic output
// Connection dbconn = (Connection) new GetDBConnectionPlugin()
// .configFile("/workdir/pjb39/dbconfig_fastq_kmer_count.txt")
// .createNew(false)
// .performFunction(null).getData(0).getData();
//for small seq test
Connection dbconn = (Connection) new GetDBConnectionPlugin()
.configFile("/Users/peterbradbury/Documents/projects/phg/training/dbconfig_fastq_kmer_count.txt")
.createNew(false)
.performFunction(null).getData(0).getData();
//loop through the read file for each sequence
long start = System.currentTimeMillis();
int recordCount = 0;
for(FastqRecord currentRecord : reader) {
//loop through each read in batches of 32bps
//retrieve the list of haplotypeNodeIds for each 32bp kmer
//Loop through each kmer and add up how many times a node Id is seen
//If #kmers containing a nodeId == number of non-filtered kmers it should be counted as an inclusion
//else do some filtering
//Idea if #kmers containing a nodeId > 50% but not == to number of non-filtered kmers it should be an exclusion
// List hapIdCountsForRead = extractInclusionAndExclusionFromKmers(kmerMap,currentRecord.getReadBases(), 32, 50);
if(SignedBytes.min(currentRecord.getBaseQualities()) > 10 && currentRecord.getReadLength()>60) {
List hapIdCountsForRead = extractInclusionExclusionFromKmers(kmerMap, currentRecord.getReadString(), 32, 1);
for (Integer hapId : hapIdCountsForRead) {
inclusionAndExclusionCounts.add(hapId);
}
}
saveReadKmers(readMappings(kmerMap, currentRecord.getReadString(), 32), pw, recordCount, dbconn);
recordCount++;
}
try {
dbconn.close();
} catch (SQLException e) {
e.printStackTrace();
}
pw.close();
return inclusionAndExclusionCounts;
}
private List extractInclusionExclusionFromKmers(TLongObjectMap kmerMap, String sequence, int kmerLength, double minimumKmerCountRatio) {
// Multiset hapIdCounts = TreeMultiset.create();
List hapIdList = new ArrayList<>();
int nonDuplicateKmers = 0;
int totalKmersExamined = 0;
for(int i = 0; i < sequence.length() - kmerLength; i++) {
totalKmersExamined++;
byte[] subSequence = NucleotideAlignmentConstants.convertHaplotypeStringToAlleleByteArray(sequence.substring(i, i + kmerLength));
long kmerEncoded = BaseEncoder.getLongSeqFromByteArray(subSequence);
int[] kmerCounts = (int[])kmerMap.get(kmerEncoded);
if (kmerCounts != null) {
for (int hapId : kmerCounts) hapIdList.add(hapId);
nonDuplicateKmers++;
}
}
return hapIdList;
}
private void exportInclusionAndExclusion(Multiset inclusionAndExclusionSet, String outputFileName) {
try(BufferedWriter writer = Utils.getBufferedWriter(outputFileName)) {
//Get the list of keys to make it easier to couple exclusions and inclusions
Set allHapIds = inclusionAndExclusionSet.elementSet().stream().map(hapId -> (hapId<0)?-1*hapId:hapId).collect(Collector.of(TreeSet::new,(set, hapId)->set.add(hapId), (set1,set2)->{set1.addAll(set2); return set1;}));
for(Integer hapId : allHapIds ) {
writer.write(hapId+"\t");
writer.write(inclusionAndExclusionSet.count(hapId)+"\t");
writer.write(inclusionAndExclusionSet.count(-1*hapId)+"\n");
}
}
catch(Exception e) {
myLogger.error("FastqToKmerCountPlugin: exportInclusionAndExclusion:" + e.toString());
throw new IllegalStateException("FastqToKmerCountPlugin: exportInclusionAndExclusion:", e);
}
}
private ReadKmers readMappings(TLongObjectMap kmerMap, String sequence, int kmerLength) {
ReadKmers kmerstats = new ReadKmers();
for(int i = 0; i < sequence.length() - kmerLength; i++) {
kmerstats.kmerCount++;
String kmerString = sequence.substring(i, i + kmerLength);
long kmerEncoded = BaseEncoder.getLongFromSeq(kmerString);
int[] kmerCounts = (int[])kmerMap.get(kmerEncoded);
if (kmerCounts != null) {
kmerstats.matchCount++;
for (int hapid : kmerCounts) kmerstats.hapMultiset.add(hapid);
}
}
return kmerstats;
}
private void saveReadKmers(ReadKmers info, PrintWriter pw, int readId, Connection db) {
if (info.matchCount > 10) {
try {
Statement st = db.createStatement();
for (Integer hapid : info.hapMultiset.elementSet()) {
//get some info from db for this hapid
//reference_range_id, isTarget
String sql = String.format("SELECT ref_range_id FROM haplotypes WHERE haplotypes_id = %d", hapid);
ResultSet rs = st.executeQuery(sql);
rs.next();
int refRangeId = rs.getInt(1);
rs.close();
sql = String.format("SELECT line_name, haplotypes_id FROM haplotypes a, gamete_haplotypes b, gametes c, genotypes d "
+ "WHERE a.gamete_grp_id=b.gamete_grp_id AND b.gameteid=c.gameteid AND "
+ "c.genoid=d.genoid AND line_name = '%s' AND haplotypes_id = %d", debugTaxon(), hapid);
rs = st.executeQuery(sql);
boolean isTarget = rs.next();
rs.close();
pw.print(readId + "\t");
pw.printf("%d\t%d\t", info.kmerCount, info.matchCount);
pw.printf("%d\t%d\t%d\t%b%n", hapid, info.hapMultiset.count(hapid), refRangeId, isTarget);
}
st.close();
} catch (SQLException e) {
throw new RuntimeException(e);
}
}
}
class ReadKmers {
int kmerCount = 0;
int matchCount = 0;
int haplotypeCount = 0;
Multiset hapMultiset = HashMultiset.create();
}
private Object deserializeMapFromFile() throws IOException, ClassNotFoundException {
FileInputStream fileOutputStream = new FileInputStream(new File(myKmerFile.value()));
ObjectInputStream objectStream = new ObjectInputStream(fileOutputStream);
Object obj = objectStream.readObject();
objectStream.close();
fileOutputStream.close();
return obj;
}
@Override
public String pluginUserManualURL() {
//TODO
return "https://bitbucket.org/tasseladmin/tassel-5-source/wiki/UserManual/Kinship/Missing";
}
@Override
public ImageIcon getIcon() {
URL imageURL = FastqToKmerCountPlugin.class.getResource("/net/maizegenetics/analysis/images/missing.gif");
if (imageURL == null) {
return null;
} else {
return new ImageIcon(imageURL);
}
}
@Override
public String getButtonName() {
return "PHG Index Kmers for haplotypes";
}
@Override
public String getToolTipText() {
return "PHG Index Kmers for the haplotype graph";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(FastqToKmerCountPlugin.class);
// }
/**
* Convenience method to run plugin with one return object.
*/
// TODO: Replace with specific type.
// public runPlugin(DataSet input) {
// return () performFunction(input).getData(0).getData();
// }
/**
* Fasta file and associated BWA indices for haplotypes
*
* @return Haplotype File
*/
// public String haplotypesGenomeFile() {
// return myHaplotypesGenomeFile.value();
// }
/**
* Set Haplotype File. Fasta file and associated BWA indices
* for haplotypes
*
* @param value Haplotype File
*
* @return this plugin
*/
// public FastqToKmerCountPlugin haplotypesGenomeFile(String value) {
// myHaplotypesGenomeFile = new PluginParameter<>(myHaplotypesGenomeFile, value);
// return this;
// }
/**
* Reference genome file - temporary need until we can
* back convert coordinates
*
* @return Ref File
*/
// public String refGenomeFile() {
// return myRefGenomeFile.value();
// }
/**
* Set Ref File. Reference genome file - temporary need
* until we can back convert coordinates
*
* @param value Ref File
*
* @return this plugin
*/
// public FastqToKmerCountPlugin refGenomeFile(String value) {
// myRefGenomeFile = new PluginParameter<>(myRefGenomeFile, value);
// return this;
// }
/**
* Raw Read file aligned to the reference
*
* @return Raw Read File
*/
public String readFile() {
return myReadFile.value();
}
/**
* Set Raw Read File. Raw Read file aligned to the reference
*
* @param value Raw Read File
*
* @return this plugin
*/
public FastqToKmerCountPlugin readFile(String value) {
myReadFile = new PluginParameter<>(myReadFile, value);
return this;
}
/**
* Maximum allowable error in order to count the read
* mapping
*
* @return Allowed Error
*/
// public Integer numErrorAllowed() {
// return myNumErrorAllowed.value();
// }
/**
* Set Allowed Error. Maximum allowable error in order
* to count the read mapping
*
* @param value Allowed Error
*
* @return this plugin
*/
// public FastqToKmerCountPlugin numErrorAllowed(Integer value) {
// myNumErrorAllowed = new PluginParameter<>(myNumErrorAllowed, value);
// return this;
// }
/**
* Text file to store haplotype scoring
*
* @return Export Haplo File
*/
public String exportHaplotypeFile() {
return myExportHaplotypeFile.value();
}
/**
* Set Export Haplo File. Text file to store haplotype
* scoring
*
* @param value Export Haplo File
*
* @return this plugin
*/
public FastqToKmerCountPlugin exportHaplotypeFile(String value) {
myExportHaplotypeFile = new PluginParameter<>(myExportHaplotypeFile, value);
return this;
}
/**
* Debug taxon
*
* @return Debug Taxon
*/
public String debugTaxon() {
return myDebugTaxon.value();
}
/**
* Set Debug Taxon. Debug taxon
*
* @param value Debug Taxon
*
* @return this plugin
*/
public FastqToKmerCountPlugin debugTaxon(String value) {
myDebugTaxon = new PluginParameter<>(myDebugTaxon, value);
return this;
}
}