net.maizegenetics.analysis.data.HDF5SummaryPlugin Maven / Gradle / Ivy

Go to download
package net.maizegenetics.analysis.data;

import ch.systemsx.cisd.hdf5.HDF5Factory;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import net.maizegenetics.dna.BaseEncoder;
import net.maizegenetics.dna.map.TOPMInterface;
import net.maizegenetics.dna.map.TOPMUtils;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.ImportUtils;
import net.maizegenetics.dna.tag.TagsByTaxa;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.HDF5Utils;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Created by jgw87 on 5/28/14.
 * This plugin is meant to take an HDF5 file and output a report of various summary statistics or information.
 *  INclude
 */
//TODO: Add support for chromosomes, positions, etc.
public class HDF5SummaryPlugin extends AbstractPlugin {

    private Logger logger = Logger.getLogger(HDF5SummaryPlugin.class);
    private enum H5FileType {GENOTYPE, TOPM, TBT, UNKNOWN}
    private H5FileType myFileType = H5FileType.UNKNOWN;

    //Various handles to be used by the class functions
    GenotypeTable genos = null;
    TagsByTaxa tbt = null;
    TOPMInterface topm = null;
    IHDF5Reader h5reader = null;
    BufferedWriter outputWriter = null;


    private PluginParameter inputFile
            = new PluginParameter.Builder<>("input", null, String.class)
            .description("TASSEL HDF5 file to get summary data from")
            .guiName("HDF5 file")
            .required(true)
            .inFile()
            .build();
    private PluginParameter outputFile
            = new PluginParameter.Builder<>("output", null, String.class)
            .description("File to write summary data to")
            .guiName("Output file")
            .required(true)
            .outFile()
            .build();
    private PluginParameter taxaCount
            = new PluginParameter.Builder<>("taxaCount", false, Boolean.class)
            .description("Output number of taxa in file")
            .guiName("Output count of taxa?")
            .build();
    private PluginParameter taxaNames
            = new PluginParameter.Builder<>("taxaNames", false, Boolean.class)
            .description("Output names of all taxa in file")
            .guiName("Output taxa names")
            .build();
    private PluginParameter siteCount
            = new PluginParameter.Builder<>("siteCount", false, Boolean.class)
            .description("Output number of sites in file")
            .guiName("Output site count")
            .build();
    private PluginParameter siteNames
            = new PluginParameter.Builder<>("siteNames", false, Boolean.class)
            .description("Output names of all sites in file")
            .guiName("Output site names")
            .build();
    private PluginParameter tagCount
            = new PluginParameter.Builder<>("tagCount", false, Boolean.class)
            .description("Output number of sequence tags in file")
            .guiName("Output tag count")
            .build();
    private PluginParameter tagSeqs
            = new PluginParameter.Builder<>("tagNames", false, Boolean.class)
            .description("Output sequence of all tags in file")
            .guiName("Output tag sequences")
            .build();
    private PluginParameter hasDepth
            = new PluginParameter.Builder<>("hasDepth", false, Boolean.class)
            .description("Output whether file contains read depth information")
            .guiName("Output if has read depth")
            .build();
    private PluginParameter printAll
            = new PluginParameter.Builder<>("all", false, Boolean.class)
            .description("Output all available information (overrides other options)")
            .guiName("Output all available information")
            .build();
    private PluginParameter rawData
            = new PluginParameter.Builder<>("rawData", false, Boolean.class)
            .description("Include only the raw output (no metadata or descriptions)")
            .guiName("Output only raw data")
            .build();

    public HDF5SummaryPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }


    @Override
    public DataSet processData(DataSet input) {

        h5reader = HDF5Factory.openForReading(inputFile());
        determineFileType();
        setUpDataStructures();

        //Check for each option in succession and output if needed
        //The check for printAll() is here rather than in postProcessParameters to make it easier to remember for when adding new parameters
        try {
            outputWriter = Utils.getBufferedWriter(outputFile());

            if(! rawData()){
                writeFileData();
            }

            //First batch is the one-line summary stats
            if (siteCount() || printAll()) {
                writeSiteCount();
            }
            if (taxaCount() || printAll()) {
                writeTaxaCount();
            }
            if ( tagCount() || printAll()){
                writeTagCount();
            }
            if (hasDepth() || printAll()) {
                writeHasDepth();
            }

            //Next batch is the larger datasets
            if (siteNames() || printAll()) {
                writeSiteNames();
            }
            if (taxaNames() || printAll()) {
                writeTaxaNames();
            }
            if (tagSeqs() || printAll()){
                writeTagSequences();
            }

            outputWriter.close();
        } catch (IOException e) {
            logger.error("Error writing report to " + outputFile() + ":\n" + e.getStackTrace());
        }

        return null;
    }

    /**
     * Determine the file type by checking for various fields
     */
    private void determineFileType(){
        if(HDF5Utils.doesGenotypeModuleExist(h5reader)){
            myFileType =  H5FileType.GENOTYPE;
        }
        else if(HDF5Utils.doTagsByTaxaExist(h5reader)){
            myFileType = H5FileType.TBT;
        }
        else if(HDF5Utils.doTagsExist(h5reader) && !HDF5Utils.doTagsByTaxaExist(h5reader)){ // Extra check for lack of tags-by-taxa is in case this statement ever gets reordered
            myFileType = H5FileType.TOPM;
        } else{
            myFileType = H5FileType.UNKNOWN;
        }
    }

    private void setUpDataStructures(){
        switch (myFileType){
            case GENOTYPE:
                genos = ImportUtils.readGuessFormat(inputFile());
                break;
            case TBT:
                //tbt = TagsByTaxaHDF5Builder.openForReading(inputFile());
                break;
            case TOPM:
                topm = TOPMUtils.readTOPM(inputFile());
                break;
            default:
                //do nothing
            }
    }

    //Methods to write the different selected data
    private void writeFileData() throws IOException {
        File infile = new File(inputFile());
        String path = infile.getCanonicalFile().getAbsolutePath();
        String output ="### Summary data for TASSEL HDF5 file " + infile.getName() + " (" + path + ")###\n";

        switch(myFileType){
            case GENOTYPE:
                output += "File type:\tGenotypeTable";
                if(HDF5Utils.isTASSEL4HDF5Format(h5reader)){
                    output += " (TASSEL 4 formatted)";
                }
                output += "\n";
                break;
            case TOPM:
                output += "File type:\tTagsOnPhysicalMap\n";
                break;
            case TBT:
                output += "File type:\tTagsByTaxa\n";
                break;
            default:
                output += "File type:\tUnknown\n";
        }

        outputWriter.append(output);
    }

    private void writeSiteCount() throws IOException {
        //Get the number of sites only if a genotype file
        String nsites;
        switch(myFileType){
            case GENOTYPE:
                nsites = "" + genos.numberOfSites();
                break;
            default:
                nsites = "n/a";
        }

        String output;
        if(rawData()) {
            output = nsites + "\n";
        }else{
            output = "numberOfSites:\t" + nsites + "\n";
        }
        outputWriter.append(output);
    }

    private void writeTaxaCount()  throws IOException {
        String ntaxa;
        switch(myFileType) {
            case GENOTYPE: // may need to invoke getHDF5GenotypeTaxaCount
            case TBT:
                ntaxa = "" + HDF5Utils.getHDF5TaxaNumTaxa(h5reader);
                break;
            case TOPM:
            default:
                ntaxa = "n/a";
        }

        String output;
        if(rawData()) {
            output =  ntaxa + "\n";
        }else{
            output = "numberOfTaxa:\t" + ntaxa + "\n";
        }
        outputWriter.append(output);
    }

    private void writeTagCount()  throws IOException {
        String ntags;
        switch(myFileType) {
            case TBT:
            case TOPM:
                ntags = "" + HDF5Utils.getHDF5TagCount(h5reader);
                break;
            default:
                ntags = "n/a";
        }

        String output;
        if(rawData()) {
            output =  ntags + "\n";
        }else{
            output = "numberOfTags:\t" + ntags + "\n";
        }
        outputWriter.append(output);
    }

    private void writeHasDepth()  throws IOException {
        String isDepth;
        switch(myFileType) {
            case GENOTYPE:
                isDepth = "" + HDF5Utils.doesGenotypeDepthExist(h5reader);
                break;
            case TOPM:
                isDepth = "n/a";
                break;
            case TBT:
                isDepth = "true";
                break;
            default:
                isDepth = "unknown";
        }
        String output;
        if(rawData()) {
            output = isDepth + "\n";
        }else{
            output = "hasDepth:\t" +  isDepth + "\n";
        }
        outputWriter.append(output);
    }

    private void writeSiteNames()  throws IOException {
        StringBuilder sites = new StringBuilder();
        if(!rawData()) {
            sites.append("###Site Names###\n");
        }

        switch(myFileType){
            case GENOTYPE:
                for(int i=0; i myTaxaList = HDF5Utils.getAllTaxaNames(h5reader);
                for(String t: myTaxaList){
                    taxa.append(t + "\n");
                }
                break;
            default:
                taxa.append("(not applicable for this file type)\n");
        }

        outputWriter.append(taxa.toString());
    }

    private void writeTagSequences()  throws IOException {
        StringBuilder tags = new StringBuilder();
        if(!rawData()) {
            tags.append("###Tag Sequences###\n");
        }

        switch(myFileType){
            case TBT:
            case TOPM:
                long[][] myTags = HDF5Utils.getTags(h5reader);
                for (int i = 0; i < myTags.length; i++) {
                    tags.append(BaseEncoder.getSequenceFromLong(myTags[i]) + "\n");
                }
                break;
            default:
                tags.append("(not applicable for this file type)\n");
        }

        outputWriter.append(tags.toString());
    }

    //Overridden methods from AbstractPlugin
    /*@Override
    protected void postProcessParameters() {
        if (!(taxaCount() || taxaNames() || siteCount() || siteNames() || hasDepth())) {
            throw new IllegalArgumentException("\n\nMust select at least one option to output.\n\n");
        }
    }*/

    @Override
    public String pluginDescription(){
        return "This plugin takes a TASSEL-generated HDF5 file and prints out a set of summary data depending on which " +
                "command-line flags are passed to it. It is meant to allow quick retrieval of certain basic data (taxa names, " +
                "site count, etc.) that are not easily available without operations that could take considerable time " +
                "(e.g., a genotype summary report). This plugin currently supports Genotype, TOPM (tags on physical map), and" +
                "TBT (tags by taxa) files. (Note that the implementation of TOPM and TBT formats in HDF5 is still ongoing, " +
                "so this plugin may not work on them.)\n";
    }

    //Parameter set functions
    public HDF5SummaryPlugin inputFile(String filename) {
        inputFile = new PluginParameter<>(inputFile, filename);
        return this;
    }

    public HDF5SummaryPlugin outputFile(String filename) {
        outputFile = new PluginParameter<>(outputFile, filename);
        return this;
    }

    public HDF5SummaryPlugin printAll(Boolean value) {
        printAll = new PluginParameter<>(printAll, value);
        return this;
    }

    public HDF5SummaryPlugin taxaCount(Boolean value) {
        taxaCount = new PluginParameter<>(taxaCount, value);
        return this;
    }

    public HDF5SummaryPlugin taxaNames(Boolean value) {
        taxaNames = new PluginParameter<>(taxaNames, value);
        return this;
    }

    public HDF5SummaryPlugin siteCount(Boolean value) {
        siteCount = new PluginParameter<>(siteCount, value);
        return this;
    }

    public HDF5SummaryPlugin siteNames(Boolean value) {
        siteNames = new PluginParameter<>(siteNames, value);
        return this;
    }

    public HDF5SummaryPlugin hasDepth(Boolean value) {
        hasDepth = new PluginParameter<>(hasDepth, value);
        return this;
    }

    public HDF5SummaryPlugin rawData(Boolean value) {
        rawData = new PluginParameter<>(rawData, value);
        return this;
    }

    //Parameter get functions
    public String inputFile() {
        return inputFile.value();
    }

    public String outputFile() {
        return outputFile.value();
    }

    public Boolean printAll() {
        return printAll.value();
    }

    public Boolean taxaCount() {
        return taxaCount.value();
    }

    public Boolean taxaNames() {
        return taxaNames.value();
    }

    public Boolean siteCount() {
        return siteCount.value();
    }

    public Boolean siteNames() {
        return siteNames.value();
    }

    public Boolean tagCount() {
        return tagCount.value();
    }

    public Boolean tagSeqs() {
        return tagSeqs.value();
    }

    public Boolean hasDepth() {
        return hasDepth.value();
    }

    public Boolean rawData() {
        return rawData.value();
    }

    //GUI-required methods
    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "HDF5 Summary";
    }

    @Override
    public String getToolTipText() {
        return "HDF5 Summary";
    }

}