
// net.maizegenetics.analysis.data.HDF5SummaryPlugin (Maven / Gradle / Ivy)
package net.maizegenetics.analysis.data;
import ch.systemsx.cisd.hdf5.HDF5Factory;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.map.TOPMInterface;
import net.maizegenetics.dna.map.TOPMUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.dna.tag.TagsByTaxa;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.HDF5Utils;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import javax.swing.*;
import java.awt.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.List;
/**
* Created by jgw87 on 5/28/14.
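* This plugin takes an HDF5 file and outputs a report of various summary statistics or information.
* The report can include taxa and site counts and names, tag counts and sequences, and whether
* read depth information is present.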
*/
//TODO: Add support for chromosomes, positions, etc.
public class HDF5SummaryPlugin extends AbstractPlugin {
/** Logger used to report problems encountered while writing the report. */
private Logger logger = Logger.getLogger(HDF5SummaryPlugin.class);

/** The kinds of TASSEL HDF5 file this plugin distinguishes between. */
private enum H5FileType {
    GENOTYPE, TOPM, TBT, UNKNOWN
}

/** Type of the current input file; UNKNOWN until determineFileType() assigns something else. */
private H5FileType myFileType = H5FileType.UNKNOWN;

// Handles shared by the class functions. All start out unset (null).
// genos and topm are filled in by setUpDataStructures(); tbt is never assigned in this
// class because the TBT loading call there is commented out.
GenotypeTable genos;
TagsByTaxa tbt;
TOPMInterface topm;
// Reader on the input file and writer on the report file, both opened in processData().
IHDF5Reader h5reader;
BufferedWriter outputWriter;
// Plugin parameters. Each declaration carries its value type explicitly so that the
// typed getters further down (which return value() directly as String / Boolean)
// compile without casts or raw-type warnings.

/** Path of the HDF5 file to summarise (flag "input", required). */
private PluginParameter<String> inputFile
        = new PluginParameter.Builder<>("input", null, String.class)
        .description("TASSEL HDF5 file to get summary data from")
        .guiName("HDF5 file")
        .required(true)
        .inFile()
        .build();

/** Path of the report file to write (flag "output", required). */
private PluginParameter<String> outputFile
        = new PluginParameter.Builder<>("output", null, String.class)
        .description("File to write summary data to")
        .guiName("Output file")
        .required(true)
        .outFile()
        .build();

/** Whether to report the number of taxa (flag "taxaCount", default false). */
private PluginParameter<Boolean> taxaCount
        = new PluginParameter.Builder<>("taxaCount", false, Boolean.class)
        .description("Output number of taxa in file")
        .guiName("Output count of taxa?")
        .build();

/** Whether to list all taxa names (flag "taxaNames", default false). */
private PluginParameter<Boolean> taxaNames
        = new PluginParameter.Builder<>("taxaNames", false, Boolean.class)
        .description("Output names of all taxa in file")
        .guiName("Output taxa names")
        .build();

/** Whether to report the number of sites (flag "siteCount", default false). */
private PluginParameter<Boolean> siteCount
        = new PluginParameter.Builder<>("siteCount", false, Boolean.class)
        .description("Output number of sites in file")
        .guiName("Output site count")
        .build();

/** Whether to list all site names (flag "siteNames", default false). */
private PluginParameter<Boolean> siteNames
        = new PluginParameter.Builder<>("siteNames", false, Boolean.class)
        .description("Output names of all sites in file")
        .guiName("Output site names")
        .build();

/** Whether to report the number of sequence tags (flag "tagCount", default false). */
private PluginParameter<Boolean> tagCount
        = new PluginParameter.Builder<>("tagCount", false, Boolean.class)
        .description("Output number of sequence tags in file")
        .guiName("Output tag count")
        .build();

/**
 * Whether to list all tag sequences (default false).
 * The flag is registered as "tagNames" although the field is called tagSeqs; the
 * flag name is left untouched so existing command lines keep working.
 */
private PluginParameter<Boolean> tagSeqs
        = new PluginParameter.Builder<>("tagNames", false, Boolean.class)
        .description("Output sequence of all tags in file")
        .guiName("Output tag sequences")
        .build();

/** Whether to report if the file holds read depth (flag "hasDepth", default false). */
private PluginParameter<Boolean> hasDepth
        = new PluginParameter.Builder<>("hasDepth", false, Boolean.class)
        .description("Output whether file contains read depth information")
        .guiName("Output if has read depth")
        .build();

/** Whether to output every available item regardless of the other flags (flag "all"). */
private PluginParameter<Boolean> printAll
        = new PluginParameter.Builder<>("all", false, Boolean.class)
        .description("Output all available information (overrides other options)")
        .guiName("Output all available information")
        .build();

/** Whether to emit bare values without headers or labels (flag "rawData"). */
private PluginParameter<Boolean> rawData
        = new PluginParameter.Builder<>("rawData", false, Boolean.class)
        .description("Include only the raw output (no metadata or descriptions)")
        .guiName("Output only raw data")
        .build();
/**
* Creates the plugin; both arguments are handed unchanged to the AbstractPlugin constructor.
*
* @param parentFrame frame passed on to the superclass
* @param isInteractive interactivity flag passed on to the superclass
*/
public HDF5SummaryPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
/**
 * Opens the input HDF5 file, works out what kind of file it is, and writes the
 * requested summary items to the output file.
 *
 * The report writer is closed even when a write fails, and the HDF5 reader opened
 * here is closed before returning. An IOException while writing is logged (with its
 * stack trace) rather than propagated.
 *
 * @param input not read by this method
 * @return always null
 */
@Override
public DataSet processData(DataSet input) {
    h5reader = HDF5Factory.openForReading(inputFile());
    try {
        determineFileType();
        setUpDataStructures();

        //Check for each option in succession and output if needed
        //The check for printAll() is here rather than in postProcessParameters to make it easier to remember for when adding new parameters
        try (BufferedWriter writer = Utils.getBufferedWriter(outputFile())) {
            // The write* helpers all go through the outputWriter field.
            outputWriter = writer;

            if (!rawData()) {
                writeFileData();
            }

            //First batch is the one-line summary stats
            if (siteCount() || printAll()) {
                writeSiteCount();
            }
            if (taxaCount() || printAll()) {
                writeTaxaCount();
            }
            if (tagCount() || printAll()) {
                writeTagCount();
            }
            if (hasDepth() || printAll()) {
                writeHasDepth();
            }

            //Next batch is the larger datasets
            if (siteNames() || printAll()) {
                writeSiteNames();
            }
            if (taxaNames() || printAll()) {
                writeTaxaNames();
            }
            if (tagSeqs() || printAll()) {
                writeTagSequences();
            }
        } catch (IOException e) {
            // Pass the exception itself so the logger prints the real stack trace;
            // concatenating getStackTrace() only prints the array's identity string.
            logger.error("Error writing report to " + outputFile(), e);
        }
    } finally {
        // Release the HDF5 file handle opened at the top of this method.
        h5reader.close();
    }
    return null;
}
/**
 * Classifies the open HDF5 file by probing it with the HDF5Utils existence checks and
 * stores the outcome in myFileType. The probes run in the order genotype module,
 * tags-by-taxa, then tags; the first match wins.
 */
private void determineFileType() {
    final H5FileType detected;
    if (HDF5Utils.doesGenotypeModuleExist(h5reader)) {
        detected = H5FileType.GENOTYPE;
    } else if (HDF5Utils.doTagsByTaxaExist(h5reader)) {
        detected = H5FileType.TBT;
    } else {
        // The repeated tags-by-taxa test is deliberate: it keeps this branch correct
        // should the order of the checks ever be rearranged.
        boolean tagsOnly = HDF5Utils.doTagsExist(h5reader) && !HDF5Utils.doTagsByTaxaExist(h5reader);
        detected = tagsOnly ? H5FileType.TOPM : H5FileType.UNKNOWN;
    }
    myFileType = detected;
}
/**
 * Loads the in-memory structure that matches the detected file type: genos for
 * GENOTYPE files and topm for TOPM files. Nothing is loaded for TBT or UNKNOWN.
 */
private void setUpDataStructures() {
    if (myFileType == H5FileType.GENOTYPE) {
        genos = ImportUtils.readGuessFormat(inputFile());
    } else if (myFileType == H5FileType.TOPM) {
        topm = TOPMUtils.readTOPM(inputFile());
    }
    // TBT: loading is currently disabled, so tbt is left as it was.
    //tbt = TagsByTaxaHDF5Builder.openForReading(inputFile());
    // Any other type: do nothing
}
//Methods to write the different selected data
/**
 * Writes the report header: a banner line naming the input file (with its canonical
 * path) followed by a "File type" line for the detected type.
 *
 * @throws IOException if resolving the canonical path or writing fails
 */
private void writeFileData() throws IOException {
    File source = new File(inputFile());
    String canonicalPath = source.getCanonicalFile().getAbsolutePath();

    StringBuilder header = new StringBuilder();
    header.append("### Summary data for TASSEL HDF5 file ")
            .append(source.getName())
            .append(" (")
            .append(canonicalPath)
            .append(")###\n");

    if (myFileType == H5FileType.GENOTYPE) {
        header.append("File type:\tGenotypeTable");
        // Genotype files additionally say when they use the TASSEL 4 layout
        if (HDF5Utils.isTASSEL4HDF5Format(h5reader)) {
            header.append(" (TASSEL 4 formatted)");
        }
        header.append("\n");
    } else if (myFileType == H5FileType.TOPM) {
        header.append("File type:\tTagsOnPhysicalMap\n");
    } else if (myFileType == H5FileType.TBT) {
        header.append("File type:\tTagsByTaxa\n");
    } else {
        header.append("File type:\tUnknown\n");
    }

    outputWriter.append(header.toString());
}
/**
 * Writes the number of sites as one line. Only GENOTYPE files have a count (taken
 * from genos); every other type is reported as "n/a". In raw mode the label is omitted.
 *
 * @throws IOException if writing fails
 */
private void writeSiteCount() throws IOException {
    String count = (myFileType == H5FileType.GENOTYPE)
            ? String.valueOf(genos.numberOfSites())
            : "n/a";
    String line = rawData()
            ? count + "\n"
            : "numberOfSites:\t" + count + "\n";
    outputWriter.append(line);
}
/**
 * Writes the number of taxa as one line. GENOTYPE and TBT files are counted through
 * HDF5Utils; TOPM and unknown files are reported as "n/a". In raw mode the label is
 * omitted.
 *
 * @throws IOException if writing fails
 */
private void writeTaxaCount() throws IOException {
    // For GENOTYPE this may need to invoke getHDF5GenotypeTaxaCount instead
    boolean hasTaxa = myFileType == H5FileType.GENOTYPE || myFileType == H5FileType.TBT;
    String count = hasTaxa
            ? String.valueOf(HDF5Utils.getHDF5TaxaNumTaxa(h5reader))
            : "n/a";
    String line = rawData()
            ? count + "\n"
            : "numberOfTaxa:\t" + count + "\n";
    outputWriter.append(line);
}
/**
 * Writes the number of sequence tags as one line. TBT and TOPM files are counted
 * through HDF5Utils; every other type is reported as "n/a". In raw mode the label is
 * omitted.
 *
 * @throws IOException if writing fails
 */
private void writeTagCount() throws IOException {
    boolean hasTags = myFileType == H5FileType.TBT || myFileType == H5FileType.TOPM;
    String count = hasTags
            ? String.valueOf(HDF5Utils.getHDF5TagCount(h5reader))
            : "n/a";
    String line = rawData()
            ? count + "\n"
            : "numberOfTags:\t" + count + "\n";
    outputWriter.append(line);
}
/**
 * Writes one line saying whether the file holds read depth: the HDF5Utils answer for
 * GENOTYPE, "n/a" for TOPM, "true" for TBT, and "unknown" otherwise. In raw mode the
 * label is omitted.
 *
 * @throws IOException if writing fails
 */
private void writeHasDepth() throws IOException {
    final String depthState;
    if (myFileType == H5FileType.GENOTYPE) {
        depthState = String.valueOf(HDF5Utils.doesGenotypeDepthExist(h5reader));
    } else if (myFileType == H5FileType.TOPM) {
        depthState = "n/a";
    } else if (myFileType == H5FileType.TBT) {
        depthState = "true";
    } else {
        depthState = "unknown";
    }

    String line = rawData()
            ? depthState + "\n"
            : "hasDepth:\t" + depthState + "\n";
    outputWriter.append(line);
}
private void writeSiteNames() throws IOException {
StringBuilder sites = new StringBuilder();
if(!rawData()) {
sites.append("###Site Names###\n");
}
switch(myFileType){
case GENOTYPE:
for(int i=0; i myTaxaList = HDF5Utils.getAllTaxaNames(h5reader);
for(String t: myTaxaList){
taxa.append(t + "\n");
}
break;
default:
taxa.append("(not applicable for this file type)\n");
}
outputWriter.append(taxa.toString());
}
/**
 * Writes the sequence of every tag, one per line, preceded by a section header unless
 * raw mode is on. Tags are read through HDF5Utils for TBT and TOPM files and decoded
 * with BaseEncoder; other types get a "not applicable" line.
 *
 * @throws IOException if writing fails
 */
private void writeTagSequences() throws IOException {
    StringBuilder report = new StringBuilder();
    if (!rawData()) {
        report.append("###Tag Sequences###\n");
    }

    boolean hasTags = myFileType == H5FileType.TBT || myFileType == H5FileType.TOPM;
    if (hasTags) {
        long[][] encodedTags = HDF5Utils.getTags(h5reader);
        for (long[] encoded : encodedTags) {
            report.append(BaseEncoder.getSequenceFromLong(encoded) + "\n");
        }
    } else {
        report.append("(not applicable for this file type)\n");
    }

    outputWriter.append(report.toString());
}
//Overridden methods from AbstractPlugin
/*@Override
protected void postProcessParameters() {
if (!(taxaCount() || taxaNames() || siteCount() || siteNames() || hasDepth())) {
throw new IllegalArgumentException("\n\nMust select at least one option to output.\n\n");
}
}*/
/**
 * Returns the human-readable description of what this plugin does.
 *
 * @return the description text, ending in a newline
 */
@Override
public String pluginDescription() {
    // Each fragment ends with a space so words do not run together across the
    // concatenation boundaries (previously "and" + "TBT" produced "andTBT").
    return "This plugin takes a TASSEL-generated HDF5 file and prints out a set of summary data depending on which " +
            "command-line flags are passed to it. It is meant to allow quick retrieval of certain basic data (taxa names, " +
            "site count, etc.) that are not easily available without operations that could take considerable time " +
            "(e.g., a genotype summary report). This plugin currently supports Genotype, TOPM (tags on physical map), and " +
            "TBT (tags by taxa) files. (Note that the implementation of TOPM and TBT formats in HDF5 is still ongoing, " +
            "so this plugin may not work on them.)\n";
}
//Parameter set functions
/**
 * Sets the HDF5 file to summarise.
 *
 * @param filename path of the input file
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin inputFile(String filename) {
    inputFile = new PluginParameter<>(inputFile, filename);
    return this;
}

/**
 * Sets the file the report is written to.
 *
 * @param filename path of the output file
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin outputFile(String filename) {
    outputFile = new PluginParameter<>(outputFile, filename);
    return this;
}

/**
 * Sets whether every available item is output regardless of the other flags.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin printAll(Boolean value) {
    printAll = new PluginParameter<>(printAll, value);
    return this;
}

/**
 * Sets whether the number of taxa is output.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin taxaCount(Boolean value) {
    taxaCount = new PluginParameter<>(taxaCount, value);
    return this;
}

/**
 * Sets whether the taxa names are output.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin taxaNames(Boolean value) {
    taxaNames = new PluginParameter<>(taxaNames, value);
    return this;
}

/**
 * Sets whether the number of sites is output.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin siteCount(Boolean value) {
    siteCount = new PluginParameter<>(siteCount, value);
    return this;
}

/**
 * Sets whether the site names are output.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin siteNames(Boolean value) {
    siteNames = new PluginParameter<>(siteNames, value);
    return this;
}

/**
 * Sets whether the number of sequence tags is output.
 * Added so that this parameter, like the others, can be set programmatically.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin tagCount(Boolean value) {
    tagCount = new PluginParameter<>(tagCount, value);
    return this;
}

/**
 * Sets whether the tag sequences are output.
 * Added so that this parameter, like the others, can be set programmatically.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin tagSeqs(Boolean value) {
    tagSeqs = new PluginParameter<>(tagSeqs, value);
    return this;
}

/**
 * Sets whether the presence of read depth is output.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin hasDepth(Boolean value) {
    hasDepth = new PluginParameter<>(hasDepth, value);
    return this;
}

/**
 * Sets whether only raw values (no headers or labels) are output.
 *
 * @param value new flag value
 * @return this plugin, for chaining
 */
public HDF5SummaryPlugin rawData(Boolean value) {
    rawData = new PluginParameter<>(rawData, value);
    return this;
}
//Parameter get functions
/** @return the current value of the "input" parameter (the HDF5 file to summarise) */
public String inputFile() {
return inputFile.value();
}
/** @return the current value of the "output" parameter (the report file) */
public String outputFile() {
return outputFile.value();
}
/** @return the current value of the "all" parameter */
public Boolean printAll() {
return printAll.value();
}
/** @return the current value of the "taxaCount" parameter */
public Boolean taxaCount() {
return taxaCount.value();
}
/** @return the current value of the "taxaNames" parameter */
public Boolean taxaNames() {
return taxaNames.value();
}
/** @return the current value of the "siteCount" parameter */
public Boolean siteCount() {
return siteCount.value();
}
/** @return the current value of the "siteNames" parameter */
public Boolean siteNames() {
return siteNames.value();
}
/** @return the current value of the "tagCount" parameter */
public Boolean tagCount() {
return tagCount.value();
}
/** @return the current value of the tag-sequences parameter (registered under the flag "tagNames") */
public Boolean tagSeqs() {
return tagSeqs.value();
}
/** @return the current value of the "hasDepth" parameter */
public Boolean hasDepth() {
return hasDepth.value();
}
/** @return the current value of the "rawData" parameter */
public Boolean rawData() {
return rawData.value();
}
//GUI-required methods
/** @return always null; this plugin supplies no icon */
@Override
public ImageIcon getIcon() {
return null;
}
/** @return the fixed button label "HDF5 Summary" */
@Override
public String getButtonName() {
return "HDF5 Summary";
}
/** @return the fixed tooltip text "HDF5 Summary" */
@Override
public String getToolTipText() {
return "HDF5 Summary";
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy