All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.imputation.ReImputeUpdatedTaxaByFILLINPlugin Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

The newest version!
/*
 * ReImputeUpdatedTaxaByFILLINPlugin
 */
package net.maizegenetics.analysis.imputation;

import ch.systemsx.cisd.hdf5.HDF5Factory;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import ch.systemsx.cisd.hdf5.IHDF5Writer;
import java.awt.*;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.*;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.HDF5Utils;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * Compares an unfinished HDF5 file containing raw genotypes to a corresponding
 * unfinished HDF5 file containing FILLIN-imputed genotypes to find new taxa (or
 * taxa with additional depth) in the raw geno file, then imputes (or reimputes)
 * these with FILLIN and adds them to (or replaces them in) the imputed geno file.
 *
 * This is part of the Automated Production Pipeline.
 *
 * <p>Work flow of a single run:
 * <ol>
 *   <li>open the raw genotype file for reading and the imputed genotype file for writing;</li>
 *   <li>collect every raw taxon that either has no calls in the imputed file or carries a
 *       "Flowcell_Lane" annotation value that the imputed taxon lacks;</li>
 *   <li>write those taxa to a temporary HDF5 file and run {@link FILLINImputationPlugin} on it;</li>
 *   <li>add/replace the imputed taxa (annotations and calls) in the imputed genotype file;</li>
 *   <li>delete the temporary files unless {@code -kt} was given.</li>
 * </ol>
 * Every HDF5 handle opened by this plugin is closed before the run returns.
 *
 * @author jcg233
 */
public class ReImputeUpdatedTaxaByFILLINPlugin extends AbstractPlugin {

    private static final Logger myLogger = LogManager.getLogger(ReImputeUpdatedTaxaByFILLINPlugin.class);

    private PluginParameter<String> rawHDF5GenotypeFile
        = new PluginParameter.Builder<>("raw", null, String.class)
        .guiName("Raw HDF5 Genotype File")
        .required(true)
        .inFile()
        .description("Input, unfinished HDF5 (*.h5) file containing raw (unimputed) genotypes")
        .build();
    private PluginParameter<String> imputedHDF5GenotypeFile
        = new PluginParameter.Builder<>("imp", null, String.class)
        .guiName("Imputed HDF5 Genotype File")
        .required(true)
        .inFile()
        .description("Target, unfinished HDF5 (*.h5) file containing imputed genotypes to be updated")
        .build();
    private PluginParameter<String> donorDir
        = new PluginParameter.Builder<>("d", null, String.class)
        .guiName("Donor Dir")
        .inDir()
        .required(true)
        .description("Directory containing donor haplotype files from output of the FILLINFindHaplotypesPlugin. "
                    +"All files with '.gc' in the filename will be read in, only those with matching sites are used")
        .build();
    private PluginParameter<String> positionSourceHDF5GenoFile
        = new PluginParameter.Builder<>("pos", null, String.class)
        .guiName("Position Source HDF5 Geno File")
        .required(false)
        .inFile()
        .description("Finished (built) HDF5 (*.h5) file to be used as a PositionList source (containing a small number of [ignored] taxa)")
        .build();
    private PluginParameter<Integer> preferredHaplotypeSize
        = new PluginParameter.Builder<>("hapSize", 8000, Integer.class)
        .guiName("Preferred haplotype size")
        .required(false)
        .description("Preferred haplotype block size in sites (use same as in FILLINFindHaplotypesPlugin)")
        .build();
    private PluginParameter<Boolean> keepTempGenotypes
        = new PluginParameter.Builder<>("kt", false, Boolean.class)
        .guiName("Keep Temp Genotypes")
        .description("Keep the temporary hdf5 genotype files generated by this plugin (raw and imputed) rather than deleting them")
        .build();

    // TODO: add all possible FILLINImputationPlugin parameters?  It seems that the default parameters were used for maize.

    // Per-run state shared between the private helper methods below.
    // The two HDF5 handles are opened at the start of a run and closed (and reset to null) at its end.
    IHDF5Reader rawGenosReader;
    IHDF5Writer impGenosWriter;
    // Directory (with trailing separator) of the imputed genotype file; temporary files are created here.
    String tempPath;


    /**
     * Creates a non-interactive plugin without a parent frame.
     */
    public ReImputeUpdatedTaxaByFILLINPlugin() {
        super(null, false);
    }

    /**
     * Creates the plugin for the given parent frame.
     *
     * @param parentFrame parent frame handed to the super class
     * @param isInteractive not forwarded: the super class is always constructed as non-interactive
     *                      (NOTE(review): looks deliberate - confirm)
     */
    public ReImputeUpdatedTaxaByFILLINPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, false);
    }

    @Override
    public String pluginDescription() {
        return
            "This plugin " +
            "compares an unfinished HDF5 file containing raw genotypes to a corresponding " +
            "unfinished HDF5 file containing FILLIN-imputed genotypes to find new taxa (or " +
            "taxa with additional depth) in the raw geno file, then imputes (or reimputes) " +
            "these with FILLIN and adds them to (or replaces them in) the imputed geno file."
        ;
    }

    /**
     * Derives the directory used for temporary files from the location of the imputed genotype file.
     */
    @Override
    protected void postProcessParameters() {
        tempPath = Utils.getDirectory(imputedHDF5GenotypeFile()) + File.separator;
    }

    /**
     * Runs the update and reports completion.
     *
     * @param input not used by this plugin
     * @return always {@code null}; the result of the run is the updated imputed genotype file
     */
    @Override
    public DataSet processData(DataSet input) {
        reImputeUpdatedTaxaByFILLIN();
        fireProgress(100);
        return null;
    }

    /**
     * Orchestrates one complete run (see the class comment for the individual steps).
     * The raw and imputed HDF5 handles are closed on every exit path, including the early
     * return when nothing changed and any exception thrown by a step.
     */
    private void reImputeUpdatedTaxaByFILLIN() {
        try {
            // open raw and target imputed genos (both are unfinished HDF5 genos)
            openInputHDF5GenoFiles();

            // compare taxa (exit if no change)
            TaxaList modifiedTaxa = compareRawAndImputedTaxa();
            if (modifiedTaxa.isEmpty()) {
                myLogger.info("  No additional or updated taxa were found in the raw genotype input file.");
                return;
            }

            // create temporary input HDF5 file (no depth needed) with taxa subset to feed to FILLIN
            String tempInFile = createTempInputFileForFILLIN(modifiedTaxa);

            // run FILLINImputationPlugin, producing temporary output HDF5 imputed genotypes
            String tempOutFile = runFILLIN(tempInFile);

            // replace taxa & genotypes in target HDF5 imputed genotypes file
            replaceTaxaInImputedFile(tempOutFile);

            // delete temporary files (unless -kt option invoked)
            if (!keepTempGenotypes()) {
                deleteTemporaryFiles(tempInFile, tempOutFile);
            }
        } finally {
            closeInputHDF5GenoFiles();
        }
    }

    /**
     * Opens the raw genotype file read-only and the imputed genotype file read/write,
     * storing the handles in {@link #rawGenosReader} and {@link #impGenosWriter}.
     */
    private void openInputHDF5GenoFiles() {
        myLogger.info("\nOpening input raw genotypes file:\n  "+rawHDF5GenotypeFile()+"\n");
        rawGenosReader=HDF5Factory.openForReading(rawHDF5GenotypeFile());
        myLogger.info("\nOpening target imputed genotypes file:\n  "+imputedHDF5GenotypeFile()+"\n");
        impGenosWriter=HDF5Factory.open(imputedHDF5GenotypeFile());
    }

    /**
     * Closes whichever of the two input handles are currently open and resets the fields to
     * {@code null}. The writer is closed even if closing the reader fails.
     */
    private void closeInputHDF5GenoFiles() {
        try {
            if (rawGenosReader != null) {
                rawGenosReader.close();
            }
        } finally {
            rawGenosReader = null;
            IHDF5Writer writer = impGenosWriter;
            impGenosWriter = null;
            if (writer != null) {
                writer.close();
            }
        }
    }

    /**
     * Finds the taxa of the raw genotype file that must be (re)imputed: taxa without calls in the
     * imputed file ("new taxon") and taxa for which {@link #flowcellLaneAdded(String)} is true
     * ("additional depth"). A report of the modified taxa is logged when at least one was found.
     *
     * @return the modified taxa, sorted alphabetically (empty if nothing changed)
     */
    private TaxaList compareRawAndImputedTaxa() {
        myLogger.info("Comparing taxa in raw and imputed genotype files for additions or additional depth in the raw genotypes:\n");
        StringBuilder modifiedTaxaReport = new StringBuilder("Modified taxa:\n");
        ArrayList<Taxon> modifiedTaxa = new ArrayList<>();
        // compare taxa & add to modified taxa if new or changed
        List<String> rawTaxaNames = HDF5Utils.getAllTaxaNames(rawGenosReader);
        for (String taxonName : rawTaxaNames) {
            if (!HDF5Utils.doTaxonCallsExist(impGenosWriter, taxonName)) {
                Taxon modTax = HDF5Utils.getTaxon(rawGenosReader, taxonName);
                modifiedTaxa.add(modTax);
                modifiedTaxaReport.append("  "+taxonName+" (new taxon) "+modTax.toStringWithVCFAnnotation()+"\n");
            } else if (flowcellLaneAdded(taxonName)) {
                Taxon modTax = HDF5Utils.getTaxon(rawGenosReader, taxonName);
                modifiedTaxa.add(modTax);
                modifiedTaxaReport.append("  "+taxonName+" (additional depth) "+modTax.toStringWithVCFAnnotation()+"\n");
            }
        }
        if (!modifiedTaxa.isEmpty()) {
            myLogger.info(modifiedTaxaReport.toString());
        }
        return new TaxaListBuilder().addAll(modifiedTaxa).sortTaxaAlphabetically().build();
    }

    /**
     * Tells whether the raw taxon carries a "Flowcell_Lane" annotation value that the taxon of
     * the same name in the imputed file does not have.
     *
     * @param taxonName name of a taxon present in the raw genotype file
     * @return {@code true} if the imputed file has no such taxon or lacks at least one of the raw
     *         taxon's flowcell lanes; {@code false} if every raw lane is already present
     * @throws IllegalStateException if the raw genotype file has no taxon with that name
     */
    private boolean flowcellLaneAdded(String taxonName) {
        Taxon rawTaxon = HDF5Utils.getTaxon(rawGenosReader, taxonName);
        if (rawTaxon == null) {
            throw new IllegalStateException("No corresponding Taxon found in the raw genotype file for the existing taxon name: "+taxonName);
        }
        Taxon impTaxon = HDF5Utils.getTaxon(impGenosWriter, taxonName);
        if (impTaxon == null) {
            return true;
        }
        String[] rawFlowCellLanes = rawTaxon.getAnnotation().getTextAnnotation("Flowcell_Lane");
        String[] impFlowCellLanes = impTaxon.getAnnotation().getTextAnnotation("Flowcell_Lane");
        for (String rawFlowCellLane : rawFlowCellLanes) {
            boolean found = false;
            for (String impFlowCellLane : impFlowCellLanes) {
                if (impFlowCellLane.equals(rawFlowCellLane)) {
                    found = true;
                    break;  // one match is enough, no need to scan the remaining lanes
                }
            }
            if (!found) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the raw genotype calls of the modified taxa to a new, time-stamped temporary HDF5
     * file in {@link #tempPath}. Positions come from the position source file when one was
     * given, otherwise from the raw genotype file.
     *
     * @param modifiedTaxa taxa to copy from the raw genotype file
     * @return the name (without directory) of the temporary file
     */
    private String createTempInputFileForFILLIN(TaxaList modifiedTaxa) {
        myLogger.info("Creating temporary HDF5 file to hold raw genos for modified taxa (input for FILLIN)");
        String tempRawGenosFileName = "tempRawGenos" + new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_Z").format(new Date()) + ".h5";
        IHDF5Reader posListReader = null;
        try {
            PositionList positionList;
            if (positionSourceHDF5GenoFile() == null) {
                positionList = PositionListBuilder.getInstance(rawGenosReader);
            } else {
                posListReader = HDF5Factory.openForReading(positionSourceHDF5GenoFile());
                positionList = PositionListBuilder.getInstance(posListReader);
            }
            GenotypeTableBuilder gtb =  GenotypeTableBuilder.getTaxaIncremental(positionList, tempPath+tempRawGenosFileName);
            for (Taxon modTaxon : modifiedTaxa) {
                gtb.addTaxon(modTaxon, HDF5Utils.getHDF5GenotypesCalls(rawGenosReader, modTaxon.getName()));
            }
            gtb.build();
        } finally {
            // Closed only after build(): the position list may still be backed by this reader
            // while the temporary file is being written.
            if (posListReader != null) {
                posListReader.close();
            }
        }
        return tempRawGenosFileName;
    }

    /**
     * Runs {@link FILLINImputationPlugin} on the temporary raw genotype file, using the donor
     * directory and preferred haplotype size of this plugin.
     *
     * @param tempInFile name (without directory) of the temporary raw genotype file
     * @return name (without directory) of the temporary imputed genotype file, obtained by
     *         replacing the first "Raw" in {@code tempInFile} with "Imp"
     */
    private String runFILLIN(String tempInFile) {
        myLogger.info("Running FILLIN on the modified taxa using default paramenters (preferredHaplotypeSize:"+preferredHaplotypeSize()+")");
        String tempImpGenosFileName = tempInFile.replaceFirst("Raw", "Imp");
        FILLINImputationPlugin fip = new FILLINImputationPlugin()
            .targetFile(tempPath+tempInFile)
            .outputFilename(tempPath+tempImpGenosFileName)
            .donorDir(donorDir())
            .preferredHaplotypeSize(preferredHaplotypeSize())
        ;
        fip.performFunction(null);
        return tempImpGenosFileName;
    }

    /**
     * Copies every taxon of the temporary imputed file into the target imputed file: taxa that
     * are not yet present are added, existing taxa get their annotations merged and their calls
     * replaced. The temporary file is closed afterwards so that it can be deleted.
     *
     * @param tempImpFile name (without directory) of the temporary imputed genotype file
     */
    private void replaceTaxaInImputedFile(String tempImpFile) {
        myLogger.info("Replacing modified taxa in the target file containing cumulative, imputed genotypes");
        IHDF5Reader impGenosReader = HDF5Factory.openForReading(tempPath+tempImpFile);
        try {
            List<String> impTaxaNames = HDF5Utils.getAllTaxaNames(impGenosReader);
            for (String taxonName : impTaxaNames) {
                Taxon impTaxon = HDF5Utils.getTaxon(impGenosReader, taxonName);
                byte[] genoCalls = HDF5Utils.getHDF5GenotypesCalls(impGenosReader, taxonName);
                Taxon origTaxon = HDF5Utils.getTaxon(impGenosWriter, taxonName);
                if (origTaxon == null) {
                    HDF5Utils.addTaxon(impGenosWriter, impTaxon);
                    HDF5Utils.writeHDF5GenotypesCalls(impGenosWriter, taxonName, genoCalls);
                } else {
                    Taxon modTaxon = updateTaxonAnnotations(origTaxon, impTaxon);
                    HDF5Utils.replaceTaxonAnnotations(impGenosWriter, modTaxon);
                    HDF5Utils.replaceHDF5GenotypesCalls(impGenosWriter, taxonName, genoCalls);
                }
            }
        } finally {
            impGenosReader.close();
        }
    }

    /**
     * Builds a copy of {@code origTaxon} that additionally carries every annotation (key/value
     * pair) of {@code newTaxon} which {@code origTaxon} does not already have.
     *
     * <p>Values are collected per key, so a key with several new values (for example several
     * added flowcell lanes) contributes all of them; an identical key/value pair is added once.
     *
     * @param origTaxon taxon as currently stored in the imputed genotype file
     * @param newTaxon freshly imputed taxon of the same name
     * @return the original taxon extended by the missing annotations
     */
    private Taxon updateTaxonAnnotations(Taxon origTaxon, Taxon newTaxon) {
        Map.Entry<String, String>[] allNewAnnos = newTaxon.getAnnotation().getAllAnnotationEntries();
        Map<String, List<String>> annosToAdd = new HashMap<>();
        for (Map.Entry<String, String> newAnno : allNewAnnos) {
            String key = newAnno.getKey();
            String value = newAnno.getValue();
            if (origTaxon.getAnnotation().isAnnotatedWithValue(key, value)) {
                continue;
            }
            List<String> values = annosToAdd.get(key);
            if (values == null) {
                values = new ArrayList<>();
                annosToAdd.put(key, values);
            }
            if (!values.contains(value)) {
                values.add(value);
            }
        }
        Taxon.Builder modTaxonBuilder = new Taxon.Builder(origTaxon);
        for (Map.Entry<String, List<String>> annoToAdd : annosToAdd.entrySet()) {
            for (String value : annoToAdd.getValue()) {
                modTaxonBuilder.addAnno(annoToAdd.getKey(), value);
            }
        }
        return modTaxonBuilder.build();
    }

    /**
     * Deletes the two temporary HDF5 files from {@link #tempPath}.
     *
     * @param tempInFile name (without directory) of the temporary raw genotype file
     * @param tempOutFile name (without directory) of the temporary imputed genotype file
     * @throws IllegalStateException if a file cannot be deleted (the original failure is the cause);
     *         the imputed file is not attempted when deleting the raw file fails
     */
    private void deleteTemporaryFiles(String tempInFile, String tempOutFile) {
        myLogger.info("Deleting the temporary HDF5 files");
        try {
            Files.delete(Paths.get(tempPath+tempInFile));
        } catch (Exception e) {
            throw new IllegalStateException("Can't delete temporary HDF5 raw geno file: "+e, e);
        }
        try {
            Files.delete(Paths.get(tempPath+tempOutFile));
        } catch (Exception e) {
            throw new IllegalStateException("Can't delete temporary HDF5 imputed geno file: "+e, e);
        }
    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return "Update imputed genotypes";
    }

    @Override
    public String getToolTipText() {
        return "Update imputed genotypes file based on modified/new taxa in raw genotypes file";
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(ReImputeUpdatedTaxaByFILLINPlugin.class);
    // }

    /**
     * Convenience method to run plugin with one return object.
     *
     * NOTE(review): processData in this class returns null; confirm that
     * performFunction cannot return null before relying on this method.
     */
    public DataSet runPlugin(DataSet input) {
        return (DataSet) performFunction(input).getData(0).getData();
    }

    /**
     * Input, unfinished HDF5 (*.h5) file containing raw (unimputed)
     * genotypes
     *
     * @return Raw HDF5 Genotype File
     */
    public String rawHDF5GenotypeFile() {
        return rawHDF5GenotypeFile.value();
    }

    /**
     * Set Raw HDF5 Genotype File. Input, unfinished HDF5
     * (*.h5) file containing raw (unimputed) genotypes
     *
     * @param value Raw HDF5 Genotype File
     *
     * @return this plugin
     */
    public ReImputeUpdatedTaxaByFILLINPlugin rawHDF5GenotypeFile(String value) {
        rawHDF5GenotypeFile = new PluginParameter<>(rawHDF5GenotypeFile, value);
        return this;
    }

    /**
     * Target, unfinished HDF5 (*.h5) file containing imputed
     * genotypes to be updated
     *
     * @return Imputed HDF5 Genotype File
     */
    public String imputedHDF5GenotypeFile() {
        return imputedHDF5GenotypeFile.value();
    }

    /**
     * Set Imputed HDF5 Genotype File. Target, unfinished
     * HDF5 (*.h5) file containing imputed genotypes to be
     * updated
     *
     * @param value Imputed HDF5 Genotype File
     *
     * @return this plugin
     */
    public ReImputeUpdatedTaxaByFILLINPlugin imputedHDF5GenotypeFile(String value) {
        imputedHDF5GenotypeFile = new PluginParameter<>(imputedHDF5GenotypeFile, value);
        return this;
    }

    /**
     * Directory containing donor haplotype files from output
     * of the FILLINFindHaplotypesPlugin. All files with '.gc'
     * in the filename will be read in, only those with matching
     * sites are used
     *
     * @return Donor Dir
     */
    public String donorDir() {
        return donorDir.value();
    }

    /**
     * Set Donor Dir. Directory containing donor haplotype
     * files from output of the FILLINFindHaplotypesPlugin.
     * All files with '.gc' in the filename will be read in,
     * only those with matching sites are used
     *
     * @param value Donor Dir
     *
     * @return this plugin
     */
    public ReImputeUpdatedTaxaByFILLINPlugin donorDir(String value) {
        donorDir = new PluginParameter<>(donorDir, value);
        return this;
    }

    /**
     * Finished (built) HDF5 (*.h5) file to be used as a PositionList
     * source (containing a small number of [ignored] taxa)
     *
     * @return Position Source HDF5 Geno File
     */
    public String positionSourceHDF5GenoFile() {
        return positionSourceHDF5GenoFile.value();
    }

    /**
     * Set Position Source HDF5 Geno File. Finished (built)
     * HDF5 (*.h5) file to be used as a PositionList source
     * (containing a small number of [ignored] taxa)
     *
     * @param value Position Source HDF5 Geno File
     *
     * @return this plugin
     */
    public ReImputeUpdatedTaxaByFILLINPlugin positionSourceHDF5GenoFile(String value) {
        positionSourceHDF5GenoFile = new PluginParameter<>(positionSourceHDF5GenoFile, value);
        return this;
    }

    /**
     * Preferred haplotype block size in sites (use same as
     * in FILLINFindHaplotypesPlugin)
     *
     * @return Preferred haplotype size
     */
    public Integer preferredHaplotypeSize() {
        return preferredHaplotypeSize.value();
    }

    /**
     * Set Preferred haplotype size. Preferred haplotype block
     * size in sites (use same as in FILLINFindHaplotypesPlugin)
     *
     * @param value Preferred haplotype size
     *
     * @return this plugin
     */
    public ReImputeUpdatedTaxaByFILLINPlugin preferredHaplotypeSize(Integer value) {
        preferredHaplotypeSize = new PluginParameter<>(preferredHaplotypeSize, value);
        return this;
    }

    /**
     * Keep the temporary hdf5 genotype files generated by
     * this plugin (raw and imputed) rather than deleting
     * them
     *
     * @return Keep Temp Genotypes
     */
    public Boolean keepTempGenotypes() {
        return keepTempGenotypes.value();
    }

    /**
     * Set Keep Temp Genotypes. Keep the temporary hdf5 genotype
     * files generated by this plugin (raw and imputed) rather
     * than deleting them
     *
     * @param value Keep Temp Genotypes
     *
     * @return this plugin
     */
    public ReImputeUpdatedTaxaByFILLINPlugin keepTempGenotypes(Boolean value) {
        keepTempGenotypes = new PluginParameter<>(keepTempGenotypes, value);
        return this;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy