
// net.maizegenetics.analysis.imputation.ReImputeUpdatedTaxaByFILLINPlugin Maven / Gradle / Ivy
/*
* ReImputeUpdatedTaxaByFILLINPlugin
*/
package net.maizegenetics.analysis.imputation;
import ch.systemsx.cisd.hdf5.HDF5Factory;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import ch.systemsx.cisd.hdf5.IHDF5Writer;
import java.awt.*;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.*;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.HDF5Utils;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
/**
* Compares an unfinished HDF5 file containing raw genotypes to a corresponding
* unfinished HDF5 file containing FILLIN-imputed genotypes to find new taxa (or
* taxa with additional depth) in the raw geno file, then imputes (or reimputes)
* these with FILLIN and adds them to (or replaces them in) the imputed geno file.
*
* This is part of the Automated Production Pipeline.
*
* @author jcg233
*/
public class ReImputeUpdatedTaxaByFILLINPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(ReImputeUpdatedTaxaByFILLINPlugin.class);
// Plugin parameters. Each field is typed with its value class so that value()
// returns String/Integer/Boolean rather than Object; the typed accessors of this
// class return value() directly and cannot compile against a raw PluginParameter.

// "raw" (required): unfinished HDF5 file with the raw (unimputed) genotypes.
private PluginParameter<String> rawHDF5GenotypeFile
        = new PluginParameter.Builder<>("raw", null, String.class)
        .guiName("Raw HDF5 Genotype File")
        .required(true)
        .inFile()
        .description("Input, unfinished HDF5 (*.h5) file containing raw (unimputed) genotypes")
        .build();
// "imp" (required): unfinished HDF5 file with imputed genotypes; this is the file that gets updated.
private PluginParameter<String> imputedHDF5GenotypeFile
        = new PluginParameter.Builder<>("imp", null, String.class)
        .guiName("Imputed HDF5 Genotype File")
        .required(true)
        .inFile()
        .description("Target, unfinished HDF5 (*.h5) file containing imputed genotypes to be updated")
        .build();
// "d" (required): directory holding the donor haplotype files handed to FILLIN.
private PluginParameter<String> donorDir
        = new PluginParameter.Builder<>("d", null, String.class)
        .guiName("Donor Dir")
        .inDir()
        .required(true)
        .description("Directory containing donor haplotype files from output of the FILLINFindHaplotypesPlugin. "
        +"All files with '.gc' in the filename will be read in, only those with matching sites are used")
        .build();
// "pos" (optional, default null): finished HDF5 file used only as a PositionList source.
private PluginParameter<String> positionSourceHDF5GenoFile
        = new PluginParameter.Builder<>("pos", null, String.class)
        .guiName("Position Source HDF5 Geno File")
        .required(false)
        .inFile()
        .description("Finished (built) HDF5 (*.h5) file to be used as a PositionList source (containing a small number of [ignored] taxa)")
        .build();
// "hapSize" (optional, default 8000): preferred haplotype block size passed on to FILLIN.
private PluginParameter<Integer> preferredHaplotypeSize
        = new PluginParameter.Builder<>("hapSize", 8000, Integer.class)
        .guiName("Preferred haplotype size")
        .required(false)
        .description("Preferred haplotype block size in sites (use same as in FILLINFindHaplotypesPlugin)")
        .build();
// "kt" (default false): keep the temporary raw/imputed HDF5 files instead of deleting them.
private PluginParameter<Boolean> keepTempGenotypes
        = new PluginParameter.Builder<>("kt", false, Boolean.class)
        .guiName("Keep Temp Genotypes")
        .description("Keep the temporary hdf5 genotype files generated by this plugin (raw and imputed) rather than deleting them")
        .build();
// TODO: add all possible FILLINImputationPlugin parameters? It seems that the default parameters were used for maize.
// global variables
// Reader on the raw genotypes HDF5 file; assigned in openInputHDF5GenoFiles().
IHDF5Reader rawGenosReader;
// Handle on the target imputed genotypes HDF5 file; assigned in openInputHDF5GenoFiles()
// and used both to query existing taxa and to write the updated ones.
IHDF5Writer impGenosWriter;
// Directory of the imputed genotypes file followed by File.separator; assigned in
// postProcessParameters() and prepended to the temporary file names.
String tempPath;
/**
* Creates the plugin with no parent frame, passing {@code false} as the second
* superclass-constructor argument (presumably the interactive flag).
*/
public ReImputeUpdatedTaxaByFILLINPlugin() {
super(null, false);
}
/**
* Creates the plugin attached to the given parent frame.
*
* NOTE(review): {@code isInteractive} is not forwarded; {@code false} is always
* passed to the superclass. Confirm whether this is deliberate (pipeline-only
* plugin) before changing it.
*
* @param parentFrame frame handed to the superclass constructor
* @param isInteractive currently ignored
*/
public ReImputeUpdatedTaxaByFILLINPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, false);
}
/**
 * Supplies the human-readable description of this plugin.
 *
 * @return a single sentence summarising how the raw and imputed HDF5 files are
 *         compared and how modified taxa are (re)imputed and written back
 */
@Override
public String pluginDescription() {
    StringBuilder description = new StringBuilder();
    description.append("This plugin ");
    description.append("compares an unfinished HDF5 file containing raw genotypes to a corresponding ");
    description.append("unfinished HDF5 file containing FILLIN-imputed genotypes to find new taxa (or ");
    description.append("taxa with additional depth) in the raw geno file, then imputes (or reimputes) ");
    description.append("these with FILLIN and adds them to (or replaces them in) the imputed geno file.");
    return description.toString();
}
/**
* Sets {@code tempPath} to the directory of the imputed genotype file (as
* returned by {@code Utils.getDirectory}) followed by {@code File.separator}.
* The temporary HDF5 files are created under this path.
*/
@Override
protected void postProcessParameters() {
tempPath = Utils.getDirectory(imputedHDF5GenotypeFile()) + File.separator;
}
/**
* Runs the re-imputation workflow and then reports 100% progress.
*
* @param input not used by this method
* @return always {@code null}; the result of this plugin is the updated
* imputed HDF5 file on disk, not a DataSet
*/
@Override
public DataSet processData(DataSet input) {
ReImputeUpdatedTaxaByFILLIN();
fireProgress(100);
return null;
}
/**
 * Runs the complete update workflow: opens the raw and imputed HDF5 files,
 * finds taxa that are new or have additional depth, imputes them with FILLIN
 * via temporary HDF5 files, writes the results into the imputed file and
 * finally removes the temporary files unless they are to be kept.
 *
 * The HDF5 handles opened at the start are always closed again, including when
 * nothing needs updating or when a step throws.
 */
private void ReImputeUpdatedTaxaByFILLIN() {
    try {
        // open raw and target imputed genos (both are unfinished HDF5 genos)
        openInputHDF5GenoFiles();

        // compare taxa (exit if no change)
        TaxaList modifiedTaxa = compareRawAndImputedTaxa();
        if (modifiedTaxa.isEmpty()) {
            myLogger.info(" No additional or updated taxa were found in the raw genotype input file.");
            return;
        }

        // create temporary input HDF5 file (no depth needed) with taxa subset to feed to the FILLINImputationPlugin
        String tempInFile = createTempInputFileForFILLIN(modifiedTaxa);

        // run FILLINImputationPlugin, producing temporary output HDF5 imputed genotypes
        String tempOutFile = runFILLIN(tempInFile);

        // replace taxa & genotypes in target HDF5 imputed genotypes file
        replaceTaxaInImputedFile(tempOutFile);

        // delete temporary files (unless -kt option invoked)
        if (!keepTempGenotypes()) {
            deleteTemporaryFiles(tempInFile, tempOutFile);
        }
    } finally {
        closeInputHDF5GenoFiles();
    }
}

/**
 * Closes the raw reader and the imputed writer if they are open and clears the
 * fields so that a later run starts from a clean state.
 */
private void closeInputHDF5GenoFiles() {
    try {
        if (rawGenosReader != null) {
            rawGenosReader.close();
        }
    } finally {
        rawGenosReader = null;
        // the writer is closed even if closing the reader failed
        if (impGenosWriter != null) {
            try {
                impGenosWriter.close();
            } finally {
                impGenosWriter = null;
            }
        }
    }
}
/**
 * Opens the raw genotype file for reading and the imputed genotype file for
 * read/write access, logging each file name first. The resulting handles are
 * stored in {@code rawGenosReader} and {@code impGenosWriter}.
 */
private void openInputHDF5GenoFiles() {
    final String rawFile = rawHDF5GenotypeFile();
    myLogger.info("\nOpening input raw genotypes file:\n "+rawFile+"\n");
    rawGenosReader = HDF5Factory.openForReading(rawFile);

    final String imputedFile = imputedHDF5GenotypeFile();
    myLogger.info("\nOpening target imputed genotypes file:\n "+imputedFile+"\n");
    impGenosWriter = HDF5Factory.open(imputedFile);
}
/**
 * Collects the taxa of the raw genotype file that must be (re)imputed: those
 * with no genotype calls in the imputed file, and those for which
 * {@code flowcellLaneAdded} reports an additional flowcell lane.
 *
 * A report of the modified taxa is logged when at least one was found.
 *
 * @return the modified taxa, sorted alphabetically; empty when nothing changed
 */
private TaxaList compareRawAndImputedTaxa() {
    myLogger.info("Comparing taxa in raw and imputed genotype files for additions or additional depth in the raw genotypes:\n");
    StringBuilder modifiedTaxaReport = new StringBuilder("Modified taxa:\n");
    List<Taxon> modifiedTaxa = new ArrayList<>();

    // compare taxa & add to modified taxa if new or changed
    List<String> rawTaxaNames = HDF5Utils.getAllTaxaNames(rawGenosReader);
    for (String taxonName : rawTaxaNames) {
        String changeLabel;
        if (!HDF5Utils.doTaxonCallsExist(impGenosWriter, taxonName)) {
            changeLabel = " (new taxon) ";
        } else if (flowcellLaneAdded(taxonName)) {
            changeLabel = " (additional depth) ";
        } else {
            // unchanged taxon: nothing to re-impute
            continue;
        }
        Taxon modTax = HDF5Utils.getTaxon(rawGenosReader, taxonName);
        modifiedTaxa.add(modTax);
        modifiedTaxaReport.append(" "+taxonName+changeLabel+modTax.toStringWithVCFAnnotation()+"\n");
    }
    if (!modifiedTaxa.isEmpty()) {
        myLogger.info(modifiedTaxaReport.toString());
    }
    return new TaxaListBuilder().addAll(modifiedTaxa).sortTaxaAlphabetically().build();
}
/**
 * Tells whether the raw file lists a "Flowcell_Lane" annotation value for the
 * taxon that the imputed file does not have yet.
 *
 * @param taxonName name of a taxon present in the raw genotype file
 * @return {@code true} if the taxon is missing from the imputed file or if at
 *         least one raw flowcell lane is absent from the imputed taxon's
 *         lanes; {@code false} if every raw lane is already present
 * @throws IllegalStateException if the raw file has no taxon with that name
 */
private boolean flowcellLaneAdded(String taxonName) {
    Taxon rawTaxon = HDF5Utils.getTaxon(rawGenosReader, taxonName);
    if (rawTaxon == null) {
        throw new IllegalStateException("No corresponding Taxon found in the raw genotype file for the existing taxon name: "+taxonName);
    }
    Taxon impTaxon = HDF5Utils.getTaxon(impGenosWriter, taxonName);
    if (impTaxon == null) {
        return true;
    }
    String[] rawFlowCellLanes = rawTaxon.getAnnotation().getTextAnnotation("Flowcell_Lane");
    String[] impFlowCellLanes = impTaxon.getAnnotation().getTextAnnotation("Flowcell_Lane");
    for (String rawFlowCellLane : rawFlowCellLanes) {
        boolean found = false;
        for (String impFlowCellLane : impFlowCellLanes) {
            if (impFlowCellLane.equals(rawFlowCellLane)) {
                found = true;
                // stop scanning once matched (the original 'continue' here was a no-op)
                break;
            }
        }
        if (!found) {
            return true;
        }
    }
    return false;
}
/**
 * Builds a temporary HDF5 genotype file under {@code tempPath} holding the raw
 * genotype calls of the given taxa; this file is the input for FILLIN.
 *
 * Positions come from the position-source file when that parameter is set,
 * otherwise from the raw genotype file itself.
 *
 * @param modifiedTaxa taxa whose raw calls are copied into the temporary file
 * @return the name (without directory) of the temporary file, of the form
 *         {@code tempRawGenos<timestamp>.h5}
 */
private String createTempInputFileForFILLIN(TaxaList modifiedTaxa) {
    myLogger.info("Creating temporary HDF5 file to hold raw genos for modified taxa (input for FILLIN)");
    String tempRawGenosFileName = "tempRawGenos" + new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_Z").format(new Date()) + ".h5";
    IHDF5Reader posListReader = null;
    try {
        PositionList positionList;
        if (positionSourceHDF5GenoFile() == null) {
            positionList = PositionListBuilder.getInstance(rawGenosReader);
        } else {
            posListReader = HDF5Factory.openForReading(positionSourceHDF5GenoFile());
            positionList = PositionListBuilder.getInstance(posListReader);
        }
        GenotypeTableBuilder gtb = GenotypeTableBuilder.getTaxaIncremental(positionList, tempPath+tempRawGenosFileName);
        for (Taxon modTaxon : modifiedTaxa) {
            gtb.addTaxon(modTaxon, HDF5Utils.getHDF5GenotypesCalls(rawGenosReader, modTaxon.getName()));
        }
        gtb.build();
    } finally {
        // The position list may read from its reader on demand (not verifiable from
        // this file), so the reader stays open until the temporary file is built.
        if (posListReader != null) {
            posListReader.close();
        }
    }
    return tempRawGenosFileName;
}
/**
 * Runs the FILLINImputationPlugin on the temporary raw genotype file, using the
 * configured donor directory and preferred haplotype size.
 *
 * @param tempInFile name (without directory) of the temporary raw genotype
 *        file located under {@code tempPath}
 * @return name (without directory) of the temporary imputed genotype file,
 *         derived from {@code tempInFile} by replacing the first "Raw" with
 *         "Imp"
 */
private String runFILLIN(String tempInFile) {
    myLogger.info("Running FILLIN on the modified taxa using default parameters (preferredHaplotypeSize:"+preferredHaplotypeSize()+")");
    String tempImpGenosFileName = tempInFile.replaceFirst("Raw", "Imp");
    FILLINImputationPlugin fip = new FILLINImputationPlugin()
            .targetFile(tempPath+tempInFile)
            .outputFilename(tempPath+tempImpGenosFileName)
            .donorDir(donorDir())
            .preferredHaplotypeSize(preferredHaplotypeSize());
    // the plugin writes its result to the output file; its return value is not used
    fip.performFunction(null);
    return tempImpGenosFileName;
}
/**
 * Copies every taxon of the temporary imputed file into the target imputed
 * file. A taxon not yet in the target is added with its genotype calls; an
 * existing one gets its annotations merged and its genotype calls replaced.
 *
 * The temporary file is closed again before returning so that it can be
 * deleted afterwards.
 *
 * @param tempImpFile name (without directory) of the temporary imputed
 *        genotype file located under {@code tempPath}
 */
private void replaceTaxaInImputedFile(String tempImpFile) {
    myLogger.info("Replacing modified taxa in the target file containing cumulative, imputed genotypes");
    IHDF5Reader impGenosReader = HDF5Factory.openForReading(tempPath+tempImpFile);
    try {
        List<String> impTaxaNames = HDF5Utils.getAllTaxaNames(impGenosReader);
        for (String taxonName : impTaxaNames) {
            Taxon impTaxon = HDF5Utils.getTaxon(impGenosReader, taxonName);
            byte[] genoCalls = HDF5Utils.getHDF5GenotypesCalls(impGenosReader, taxonName);
            Taxon origTaxon = HDF5Utils.getTaxon(impGenosWriter, taxonName);
            if (origTaxon == null) {
                HDF5Utils.addTaxon(impGenosWriter, impTaxon);
                HDF5Utils.writeHDF5GenotypesCalls(impGenosWriter, taxonName, genoCalls);
            } else {
                Taxon modTaxon = updateTaxonAnnotations(origTaxon, impTaxon);
                HDF5Utils.replaceTaxonAnnotations(impGenosWriter, modTaxon);
                HDF5Utils.replaceHDF5GenotypesCalls(impGenosWriter, taxonName, genoCalls);
            }
        }
    } finally {
        impGenosReader.close();
    }
}
/**
 * Builds a taxon that carries all annotations of {@code origTaxon} plus every
 * key/value annotation of {@code newTaxon} that {@code origTaxon} does not
 * already have.
 *
 * Annotations are added straight to the builder instead of being staged in a
 * map keyed by annotation name. A key can carry several values (this class
 * reads "Flowcell_Lane" as a {@code String[]}), and a key-indexed map would
 * keep only one new value per key.
 *
 * @param origTaxon taxon as currently stored in the imputed file
 * @param newTaxon freshly imputed taxon whose annotations are merged in
 * @return a new taxon based on {@code origTaxon} with the missing annotations added
 */
private Taxon updateTaxonAnnotations(Taxon origTaxon, Taxon newTaxon) {
    Map.Entry<String, String>[] allNewAnnos = newTaxon.getAnnotation().getAllAnnotationEntries();
    Taxon.Builder modTaxonBuilder = new Taxon.Builder(origTaxon);
    for (Map.Entry<String, String> newAnno : allNewAnnos) {
        if (!origTaxon.getAnnotation().isAnnotatedWithValue(newAnno.getKey(), newAnno.getValue())) {
            modTaxonBuilder.addAnno(newAnno.getKey(), newAnno.getValue());
        }
    }
    return modTaxonBuilder.build();
}
/**
 * Deletes the two temporary HDF5 files located under {@code tempPath}.
 *
 * @param tempInFile name (without directory) of the temporary raw genotype file
 * @param tempOutFile name (without directory) of the temporary imputed genotype file
 * @throws IllegalStateException if either file cannot be deleted; the
 *         underlying exception is attached as the cause
 */
private void deleteTemporaryFiles(String tempInFile, String tempOutFile) {
    myLogger.info("Deleting the temporary HDF5 files");
    try {
        Files.delete(Paths.get(tempPath+tempInFile));
    } catch (Exception e) {
        // keep the original exception as cause so the stack trace is not lost
        throw new IllegalStateException("Can't delete temporary HDF5 raw geno file: "+e, e);
    }
    try {
        Files.delete(Paths.get(tempPath+tempOutFile));
    } catch (Exception e) {
        throw new IllegalStateException("Can't delete temporary HDF5 imputed geno file: "+e, e);
    }
}
/**
* This plugin has no icon.
*
* @return always {@code null}
*/
@Override
public ImageIcon getIcon() {
return null;
}
/**
* @return the fixed button label "Update imputed genotypes"
*/
@Override
public String getButtonName() {
return "Update imputed genotypes";
}
/**
* @return the fixed tool tip text describing what this plugin updates
*/
@Override
public String getToolTipText() {
return "Update imputed genotypes file based on modified/new taxa in raw genotypes file";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(ReImputeUpdatedTaxaByFILLINPlugin.class);
// }
/**
 * Convenience method to run plugin with one return object.
 *
 * {@code processData} in this class returns {@code null}, so the result of
 * {@code performFunction} is expected to be {@code null} as well (assumption:
 * the superclass passes that value through; confirm against AbstractPlugin).
 * In that case {@code null} is returned instead of failing with a
 * NullPointerException.
 *
 * @param input data set handed to {@code performFunction}
 * @return the payload of the first datum of the result, or {@code null} when
 *         the plugin produced no result
 */
public DataSet runPlugin(DataSet input) {
    DataSet result = performFunction(input);
    if (result == null) {
        return null;
    }
    return (DataSet) result.getData(0).getData();
}
/**
* Input, unfinished HDF5 (*.h5) file containing raw (unimputed)
* genotypes. This is the current value of the "raw" parameter.
*
* @return Raw HDF5 Genotype File
*/
public String rawHDF5GenotypeFile() {
return rawHDF5GenotypeFile.value();
}
/**
* Set Raw HDF5 Genotype File. Input, unfinished HDF5
* (*.h5) file containing raw (unimputed) genotypes.
* The stored parameter is replaced by a new PluginParameter
* built from the current one and the given value.
*
* @param value Raw HDF5 Genotype File
*
* @return this plugin
*/
public ReImputeUpdatedTaxaByFILLINPlugin rawHDF5GenotypeFile(String value) {
rawHDF5GenotypeFile = new PluginParameter<>(rawHDF5GenotypeFile, value);
return this;
}
/**
* Target, unfinished HDF5 (*.h5) file containing imputed
* genotypes to be updated. This is the current value of the
* "imp" parameter; its directory is also where the temporary
* files are placed.
*
* @return Imputed HDF5 Genotype File
*/
public String imputedHDF5GenotypeFile() {
return imputedHDF5GenotypeFile.value();
}
/**
* Set Imputed HDF5 Genotype File. Target, unfinished
* HDF5 (*.h5) file containing imputed genotypes to be
* updated. The stored parameter is replaced by a new
* PluginParameter built from the current one and the given value.
*
* @param value Imputed HDF5 Genotype File
*
* @return this plugin
*/
public ReImputeUpdatedTaxaByFILLINPlugin imputedHDF5GenotypeFile(String value) {
imputedHDF5GenotypeFile = new PluginParameter<>(imputedHDF5GenotypeFile, value);
return this;
}
/**
* Directory containing donor haplotype files from output
* of the FILLINFindHaplotypesPlugin. All files with '.gc'
* in the filename will be read in, only those with matching
* sites are used. This is the current value of the "d" parameter.
*
* @return Donor Dir
*/
public String donorDir() {
return donorDir.value();
}
/**
* Set Donor Dir. Directory containing donor haplotype
* files from output of the FILLINFindHaplotypesPlugin.
* All files with '.gc' in the filename will be read in,
* only those with matching sites are used. The stored
* parameter is replaced by a new PluginParameter built from
* the current one and the given value.
*
* @param value Donor Dir
*
* @return this plugin
*/
public ReImputeUpdatedTaxaByFILLINPlugin donorDir(String value) {
donorDir = new PluginParameter<>(donorDir, value);
return this;
}
/**
* Finished (built) HDF5 (*.h5) file to be used as a PositionList
* source (containing a small number of [ignored] taxa). This is
* the current value of the optional "pos" parameter, whose
* default is null.
*
* @return Position Source HDF5 Geno File, or null if not set
*/
public String positionSourceHDF5GenoFile() {
return positionSourceHDF5GenoFile.value();
}
/**
* Set Position Source HDF5 Geno File. Finished (built)
* HDF5 (*.h5) file to be used as a PositionList source
* (containing a small number of [ignored] taxa). The stored
* parameter is replaced by a new PluginParameter built from
* the current one and the given value.
*
* @param value Position Source HDF5 Geno File
*
* @return this plugin
*/
public ReImputeUpdatedTaxaByFILLINPlugin positionSourceHDF5GenoFile(String value) {
positionSourceHDF5GenoFile = new PluginParameter<>(positionSourceHDF5GenoFile, value);
return this;
}
/**
* Preferred haplotype block size in sites (use same as
* in FILLINFindHaplotypesPlugin). This is the current value of
* the "hapSize" parameter, whose default is 8000.
*
* @return Preferred haplotype size
*/
public Integer preferredHaplotypeSize() {
return preferredHaplotypeSize.value();
}
/**
* Set Preferred haplotype size. Preferred haplotype block
* size in sites (use same as in FILLINFindHaplotypesPlugin).
* The stored parameter is replaced by a new PluginParameter
* built from the current one and the given value.
*
* @param value Preferred haplotype size
*
* @return this plugin
*/
public ReImputeUpdatedTaxaByFILLINPlugin preferredHaplotypeSize(Integer value) {
preferredHaplotypeSize = new PluginParameter<>(preferredHaplotypeSize, value);
return this;
}
/**
* Keep the temporary hdf5 genotype files generated by
* this plugin (raw and imputed) rather than deleting
* them. This is the current value of the "kt" parameter,
* whose default is false.
*
* @return Keep Temp Genotypes
*/
public Boolean keepTempGenotypes() {
return keepTempGenotypes.value();
}
/**
* Set Keep Temp Genotypes. Keep the temporary hdf5 genotype
* files generated by this plugin (raw and imputed) rather
* than deleting them. The stored parameter is replaced by a
* new PluginParameter built from the current one and the given value.
*
* @param value Keep Temp Genotypes
*
* @return this plugin
*/
public ReImputeUpdatedTaxaByFILLINPlugin keepTempGenotypes(Boolean value) {
keepTempGenotypes = new PluginParameter<>(keepTempGenotypes, value);
return this;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy