net.maizegenetics.pangenome.pipelineTests.CountNsInRawHapSequencesPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.pipelineTests;
import java.awt.Frame;
import java.io.BufferedWriter;
import java.sql.Connection;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.Tuple;
import net.maizegenetics.util.Utils;
/**
*
* NOTE: THis test is obsolete - uses old schema and old connection method.
*
* Calculates the percentage of N's for each "raw" haplotype (ie, not consensus)
* for each genome interval (anchor, not inter-anchors). Results shoudl be printed
* to a tab-delimited file with genome_interval_id as rows, and taxa/percentage as the
* columns.
*
* Testing: Testing was done by random manual checking. Once the output file is created, it is
* uploaded to excel. use sqlite3 on machine holding db. Look at sequences and
* verify when it has "1" as percent, there are all N's. WHen 0, there are 0. When
* a percentage, e.g. .53, the sequence looks to be half N's.
*
* Find gamete_grp_id for the line you want to check:
* sqlite> SELECT gamete_haplotypes.gamete_grp_id FROM gamete_haplotypes
* sqlite> INNER JOIN gametes ON gametes.gameteid=gamete_haplotypes.gameteid
* sqlite> INNER JOIN genotypes on gametes.genoid=genotypes.genoid
* sqlite> WHERE genotypes.line_name='CML247';
* sqlite> 5 (this is returned value)
*
* Then look at the sequence:
* sqlite> select gamete_grp_id, genome_interval_id,sequence from haplotypes where gamete_grp_id=5 and genome_interval_id=111;
*
* @author lcj34
*
*/
@Deprecated
public class CountNsInRawHapSequencesPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(CountNsInRawHapSequencesPlugin.class);
private PluginParameter myHostname = new PluginParameter.Builder<>("hostname", null, String.class)
.description("Hostname where database resides")
.build();
private PluginParameter myUserid = new PluginParameter.Builder<>("userid", "", String.class)
.description("Userid for database")
.build();
private PluginParameter myPassword = new PluginParameter.Builder<>("password", "", String.class)
.password()
.description("Password for database")
.build();
private PluginParameter myDatabaseName = new PluginParameter.Builder<>("databaseName", null, String.class)
.required(true)
.description("Database name")
.build();
private PluginParameter myOutputFile = new PluginParameter.Builder<>("outputFile", null, String.class)
.outFile()
.required(true)
.description("Output File")
.build();
public CountNsInRawHapSequencesPlugin() {
super(null, false);
}
public CountNsInRawHapSequencesPlugin(Frame parentFrame) {
super(parentFrame, false);
}
public CountNsInRawHapSequencesPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
long totalTime = System.nanoTime();
myLogger.info("Creating connection to db " + databaseName());
try (Connection conn = CreateGraphUtils.connection(hostname(), userid(), password(), databaseName())) {
myLogger.info("create taxaListMap and reference ranges");
Map taxaListMap = CreateGraphUtils.taxaListMap(conn);
// Map of genome_interval_id, Tuple
Multimap> intervalIdTaxonPercentNMap = HashMultimap.create();
// For each gameteid with a single taxon associated, grab the sequence
// for each anchor interval (not inter-anchors, genome_interval_id < 37805)
// Calculate the percentage of N's in each sequence, store to map for this haplotype
int hapCount = 0;
for (Map.Entry entry : taxaListMap.entrySet()) {
if (entry.getValue().size() > 1) continue; // skip consensus - only want raw haplotypes
hapCount++;
int gamete_grp_id=entry.getKey();
Taxon taxon = entry.getValue().get(0);
StringBuilder sb = new StringBuilder();
sb.append("SELECT haplotypes.genome_interval_id, haplotypes.sequence from haplotypes ");
sb.append("WHERE gamete_grp_id=");
sb.append(gamete_grp_id);
sb.append(" AND haplotypes.genome_interval_id < 37805"); // last anchor is 37804, ignore interanchors
sb.append(" ORDER BY haplotypes.genome_interval_id;");
String query = sb.toString();
myLogger.info("querying data for gamete_grp_id " + gamete_grp_id);
ResultSet rs = conn.createStatement().executeQuery(query);
Map anchorToSequenceMap = new HashMap();
while (rs.next()) {
anchorToSequenceMap.put(rs.getInt("genome_interval_id"), rs.getString("sequence"));
}
// >
Map hapPercentNCountMap = countSequenceNs(anchorToSequenceMap);
// Add results for this taxon/haplotype to genomeIntervalMap. This map gathers
// the information for all haplotypes in a manner easily printed to a file.
addToIntervalMap(intervalIdTaxonPercentNMap,hapPercentNCountMap, taxon.getName());
}
myLogger.info("Processed " + hapCount + " haplotypes for each interval.");
printHaplotypeResults(intervalIdTaxonPercentNMap, conn);
System.out.println("Number haplotypes processed " + hapCount);
} catch (Exception exc) {
throw new IllegalStateException("Error processing CountNsInRawHapSequencesPlugin: " + exc.getMessage());
}
myLogger.info("Finished processing in " + (System.nanoTime()-totalTime)/1e9 + " seconds");
return null;
}
// Method takes map of anchorid to sequence, returns map of
// genome_interval_id/percentN
private Map countSequenceNs(Map anchorToSequenceMap) {
Map anchorToNMap = new HashMap();
for (Map.Entry entry : anchorToSequenceMap.entrySet()) {
int nCount = (int)getNCount(entry.getValue());
double nPercent = (double)nCount/entry.getValue().length();
anchorToNMap.put(entry.getKey(), nPercent);
}
return anchorToNMap;
}
// Find number of N's in the given sequence string.
private long getNCount(String seq) {
long count = seq.chars().filter(allele -> allele=='N').count();
return count;
}
private void addToIntervalMap(Multimap> intervalIdTaxonPercentNMap,
Map idToNCountMap, String haplotype) {
for (Map.Entry entry: idToNCountMap.entrySet()) {
// for each entry on the haplotype's percentN map, add data to the global
// genome interval percent N map.
intervalIdTaxonPercentNMap.put(entry.getKey(), new Tuple(haplotype,entry.getValue()));
}
}
// This multimap needs the haplotypes ordered - must always print
// in same order on line
private void printHaplotypeResults(Multimap> intervalIdTaxonPercentNMap,
Connection conn){
myLogger.info("Begin printHaplotypeResults");
// refRange map to be used for printing intervals
Map refRangeMap = CreateGraphUtils.referenceRangeMap( conn);
// Get ordered list of haplotypes for header line
Collection> haplotypes = intervalIdTaxonPercentNMap.asMap().get(1); // any element
List> haplotypeList = new ArrayList>(haplotypes);
Collections.sort(haplotypeList, (Tuple first, Tuple second) -> first.x.compareTo(second.x));
// Create file header line
StringBuilder headerLine = new StringBuilder().append("GenomeIntervalID")
.append("\t").append("ReferenceRangeInterval");
for (Tuple haplotype : haplotypeList) {
headerLine.append("\t").append(haplotype.getX());
}
headerLine.append("\n");
try (BufferedWriter writer = Utils.getBufferedWriter(outputFile())) {
writer.write(headerLine.toString());
// Add data for each row
StringBuilder sb = new StringBuilder();
intervalIdTaxonPercentNMap.asMap().entrySet().stream().forEach(entry -> {
int genomeIntervalId = entry.getKey();
Collection> taxaPercents = entry.getValue();
String fileLine = createFileEntry(genomeIntervalId, refRangeMap,taxaPercents);
sb.append(fileLine); // values were sorted in createFileEntry()
});
writer.write(sb.toString());
} catch (Exception exc) {
myLogger.error("Error writing file " + outputFile() + " " + exc.getMessage());
throw new IllegalStateException("Error writing output file " + exc.getMessage());
}
}
private String createFileEntry(int genomeInterval, Map refRangeMap, Collection> taxaPercents) {
// Sort the list so values are consistently printed to correct columns
List> percentList = new ArrayList>(taxaPercents);
Collections.sort(percentList, (Tuple first, Tuple second) -> first.x.compareTo(second.x));
String refRangeInterval = refRangeMap.get(genomeInterval).intervalString();
StringBuilder sb = new StringBuilder().append(genomeInterval)
.append("\t").append(refRangeInterval);
for (Tuple taxaEntry : percentList) {
// Add the percentN value for each taxa/haplotype
sb.append("\t").append(taxaEntry.getY());
}
sb.append("\n");
return sb.toString();
}
/**
* @param args
*/
// public static void main(String[] args) {
// // TODO Auto-generated method stub
//
// }
public static void main(String[] args) {
GeneratePluginCode.generate(CountNsInRawHapSequencesPlugin.class);
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return ("Count Haplotype Sequence Ns");
}
@Override
public String getToolTipText() {
return ("Count Haplotype Sequence Ns");
}
/**
* Hostname where database resides
*
* @return Hostname
*/
public String hostname() {
return myHostname.value();
}
/**
* Set Hostname. Hostname where database resides
*
* @param value Hostname
*
* @return this plugin
*/
public CountNsInRawHapSequencesPlugin hostname(String value) {
myHostname = new PluginParameter<>(myHostname, value);
return this;
}
/**
* Userid for database
*
* @return Userid
*/
public String userid() {
return myUserid.value();
}
/**
* Set Userid. Userid for database
*
* @param value Userid
*
* @return this plugin
*/
public CountNsInRawHapSequencesPlugin userid(String value) {
myUserid = new PluginParameter<>(myUserid, value);
return this;
}
/**
* Password for database
*
* @return Password
*/
public String password() {
return myPassword.value();
}
/**
* Set Password. Password for database
*
* @param value Password
*
* @return this plugin
*/
public CountNsInRawHapSequencesPlugin password(String value) {
myPassword = new PluginParameter<>(myPassword, value);
return this;
}
/**
* Database name
*
* @return Database Name
*/
public String databaseName() {
return myDatabaseName.value();
}
/**
* Set Database Name. Database name
*
* @param value Database Name
*
* @return this plugin
*/
public CountNsInRawHapSequencesPlugin databaseName(String value) {
myDatabaseName = new PluginParameter<>(myDatabaseName, value);
return this;
}
/**
* Output File
*
* @return Output File
*/
public String outputFile() {
return myOutputFile.value();
}
/**
* Set Output File. Output File
*
* @param value Output File
*
* @return this plugin
*/
public CountNsInRawHapSequencesPlugin outputFile(String value) {
myOutputFile = new PluginParameter<>(myOutputFile, value);
return this;
}
}