/*
 * net.maizegenetics.analysis.imputation.ReImputeUpdatedTaxaByFILLINPlugin
 *
 * Part of TASSEL, a software package to evaluate trait associations, evolutionary patterns,
 * and linkage disequilibrium.
 */
/*
* ReImputeUpdatedTaxaByFILLINPlugin
*/
package net.maizegenetics.analysis.imputation;
import ch.systemsx.cisd.hdf5.HDF5Factory;
import ch.systemsx.cisd.hdf5.IHDF5Reader;
import ch.systemsx.cisd.hdf5.IHDF5Writer;
import java.awt.*;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.*;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.HDF5Utils;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* Compares an unfinished HDF5 file containing raw genotypes to a corresponding
* unfinished HDF5 file containing FILLIN-imputed genotypes to find new taxa (or
* taxa with additional depth) in the raw geno file, then imputes (or reimputes)
* these with FILLIN and adds them to (or replaces them in) the imputed geno file.
*
* This is part of the Automated Production Pipeline.
*
* @author jcg233
*/
public class ReImputeUpdatedTaxaByFILLINPlugin extends AbstractPlugin {
private static final Logger myLogger = LogManager.getLogger(ReImputeUpdatedTaxaByFILLINPlugin.class);

// ---- Plugin parameters ----
// Each field is typed with the value class handed to its Builder so that the
// typed accessor methods further down (String / Integer / Boolean) compile
// without casts.

// "raw": required input file holding the raw (unimputed) genotypes.
private PluginParameter<String> rawHDF5GenotypeFile
        = new PluginParameter.Builder<>("raw", null, String.class)
        .guiName("Raw HDF5 Genotype File")
        .required(true)
        .inFile()
        .description("Input, unfinished HDF5 (*.h5) file containing raw (unimputed) genotypes")
        .build();

// "imp": required file holding the imputed genotypes; this is the file that gets updated.
private PluginParameter<String> imputedHDF5GenotypeFile
        = new PluginParameter.Builder<>("imp", null, String.class)
        .guiName("Imputed HDF5 Genotype File")
        .required(true)
        .inFile()
        .description("Target, unfinished HDF5 (*.h5) file containing imputed genotypes to be updated")
        .build();

// "d": required directory of donor haplotype files passed on to FILLIN.
private PluginParameter<String> donorDir
        = new PluginParameter.Builder<>("d", null, String.class)
        .guiName("Donor Dir")
        .inDir()
        .required(true)
        .description("Directory containing donor haplotype files from output of the FILLINFindHaplotypesPlugin. "
        +"All files with '.gc' in the filename will be read in, only those with matching sites are used")
        .build();

// "pos": optional alternative source for the PositionList; when unset the raw genotype file is used.
private PluginParameter<String> positionSourceHDF5GenoFile
        = new PluginParameter.Builder<>("pos", null, String.class)
        .guiName("Position Source HDF5 Geno File")
        .required(false)
        .inFile()
        .description("Finished (built) HDF5 (*.h5) file to be used as a PositionList source (containing a small number of [ignored] taxa)")
        .build();

// "hapSize": optional, built with 8000; forwarded to FILLINImputationPlugin.
private PluginParameter<Integer> preferredHaplotypeSize
        = new PluginParameter.Builder<>("hapSize", 8000, Integer.class)
        .guiName("Preferred haplotype size")
        .required(false)
        .description("Preferred haplotype block size in sites (use same as in FILLINFindHaplotypesPlugin)")
        .build();

// "kt": built with false; when true the temporary HDF5 files are not deleted.
private PluginParameter<Boolean> keepTempGenotypes
        = new PluginParameter.Builder<>("kt", false, Boolean.class)
        .guiName("Keep Temp Genotypes")
        .description("Keep the temporary hdf5 genotype files generated by this plugin (raw and imputed) rather than deleting them")
        .build();

// TODO: add all possible FILLINImputationPlugin parameters? It seems that the default parameters were used for maize.

// ---- Shared state (global variables) ----
// Handle on the raw genotype file; assigned in openInputHDF5GenoFiles().
IHDF5Reader rawGenosReader;
// Read/write handle on the target imputed genotype file; assigned in openInputHDF5GenoFiles().
IHDF5Writer impGenosWriter;
// Directory prefix (ending in File.separator) for temporary files; assigned in postProcessParameters().
String tempPath;
/**
 * Creates the plugin without a parent frame, passing {@code false} as the
 * second superclass constructor argument (presumably the interactive flag,
 * going by the two-argument constructor of this class).
 */
public ReImputeUpdatedTaxaByFILLINPlugin() {
super(null, false);
}
/**
 * Creates the plugin attached to the given parent frame.
 *
 * NOTE(review): {@code isInteractive} is accepted but not forwarded; the
 * superclass always receives {@code false}. Confirm whether that is deliberate
 * (e.g. the plugin is only meant to run non-interactively) or an oversight.
 *
 * @param parentFrame frame handed to the superclass constructor
 * @param isInteractive currently ignored
 */
public ReImputeUpdatedTaxaByFILLINPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, false);
}
/**
 * Supplies the one-paragraph, human-readable summary of what this plugin does.
 *
 * @return the description text
 */
@Override
public String pluginDescription() {
    final String description =
            "This plugin compares an unfinished HDF5 file containing raw genotypes "
            + "to a corresponding unfinished HDF5 file containing FILLIN-imputed genotypes "
            + "to find new taxa (or taxa with additional depth) in the raw geno file, "
            + "then imputes (or reimputes) these with FILLIN "
            + "and adds them to (or replaces them in) the imputed geno file.";
    return description;
}
/**
 * Derives the temporary-file directory prefix from the location of the target
 * imputed genotype file, so temporary HDF5 files are created next to it.
 */
@Override
protected void postProcessParameters() {
    final String imputedFileDir = Utils.getDirectory(imputedHDF5GenotypeFile());
    // keep the trailing separator so file names can be appended directly
    tempPath = imputedFileDir + File.separator;
}
/**
 * Plugin entry point: runs the re-imputation workflow and then reports 100% progress.
 *
 * @param input not read by this method
 * @return always {@code null}; the outcome of the run is the updated imputed
 *         HDF5 file rather than a returned data set
 */
@Override
public DataSet processData(DataSet input) {
ReImputeUpdatedTaxaByFILLIN();
fireProgress(100);
return null;
}
/**
 * Runs the whole workflow: open both HDF5 files, find new/updated taxa, impute
 * them with FILLIN via temporary HDF5 files, merge the result into the target
 * imputed file, and optionally delete the temporary files.
 *
 * The raw reader and the imputed writer are always closed before returning,
 * including on the early "nothing to do" exit and when a step throws, so that
 * the HDF5 handles are released and pending writes to the target file are
 * completed by the library's close().
 */
private void ReImputeUpdatedTaxaByFILLIN() {
    try {
        // open raw and target imputed genos (both are unfinished HDF5 genos)
        openInputHDF5GenoFiles();

        // compare taxa (exit if no change)
        TaxaList modifiedTaxa = compareRawAndImputedTaxa();
        if (modifiedTaxa.isEmpty()) {
            myLogger.info(" No additional or updated taxa were found in the raw genotype input file.");
            return;
        }

        // create temporary input HDF5 file (no depth needed) with taxa subset to feed to the FILLINImputationPlugin
        String tempInFile = createTempInputFileForFILLIN(modifiedTaxa);

        // run FILLINImputationPlugin, producing temporary output HDF5 imputed genotypes
        String tempOutFile = runFILLIN(tempInFile);

        // replace taxa & genotypes in target HDF5 imputed genotypes file
        replaceTaxaInImputedFile(tempOutFile);

        // delete temporary files (unless -kt option invoked)
        if (!keepTempGenotypes()) {
            deleteTemporaryFiles(tempInFile, tempOutFile);
        }
    } finally {
        closeInputHDF5GenoFiles();
    }
}

/**
 * Closes whichever of the two input handles are open and clears the fields, so
 * a later run starts from a clean state. The reader is closed even if closing
 * the writer throws.
 */
private void closeInputHDF5GenoFiles() {
    try {
        if (impGenosWriter != null) {
            impGenosWriter.close();
        }
    } finally {
        impGenosWriter = null;
        try {
            if (rawGenosReader != null) {
                rawGenosReader.close();
            }
        } finally {
            rawGenosReader = null;
        }
    }
}
/**
 * Opens the raw genotype file for reading and the target imputed genotype file
 * for read/write access, logging each path first. The handles are stored in
 * {@code rawGenosReader} and {@code impGenosWriter}.
 */
private void openInputHDF5GenoFiles() {
    final String rawPath = rawHDF5GenotypeFile();
    myLogger.info("\nOpening input raw genotypes file:\n " + rawPath + "\n");
    rawGenosReader = HDF5Factory.openForReading(rawPath);

    final String imputedPath = imputedHDF5GenotypeFile();
    myLogger.info("\nOpening target imputed genotypes file:\n " + imputedPath + "\n");
    impGenosWriter = HDF5Factory.open(imputedPath);
}
/**
 * Walks every taxon name in the raw genotype file and collects those that need
 * (re)imputation: taxa with no genotype calls in the imputed file ("new taxon")
 * and taxa for which {@link #flowcellLaneAdded(String)} reports an extra
 * flowcell lane ("additional depth"). A report of the collected taxa is logged
 * when at least one was found.
 *
 * @return the collected taxa, sorted alphabetically; empty when nothing changed
 */
private TaxaList compareRawAndImputedTaxa() {
    myLogger.info("Comparing taxa in raw and imputed genotype files for additions or additional depth in the raw genotypes:\n");
    StringBuilder modifiedTaxaReport = new StringBuilder("Modified taxa:\n");
    ArrayList<Taxon> modifiedTaxa = new ArrayList<>();

    // compare taxa & add to modified taxa if new or changed
    List<String> rawTaxaNames = HDF5Utils.getAllTaxaNames(rawGenosReader);
    for (String taxonName : rawTaxaNames) {
        String reason;
        if (!HDF5Utils.doTaxonCallsExist(impGenosWriter, taxonName)) {
            reason = " (new taxon) ";
        } else if (flowcellLaneAdded(taxonName)) {
            reason = " (additional depth) ";
        } else {
            continue;  // unchanged taxon
        }
        Taxon modTax = HDF5Utils.getTaxon(rawGenosReader, taxonName);
        modifiedTaxa.add(modTax);
        modifiedTaxaReport.append(" ").append(taxonName).append(reason)
                .append(modTax.toStringWithVCFAnnotation()).append("\n");
    }

    if (!modifiedTaxa.isEmpty()) {
        myLogger.info(modifiedTaxaReport.toString());
    }
    return new TaxaListBuilder().addAll(modifiedTaxa).sortTaxaAlphabetically().build();
}
/**
 * Tells whether the raw genotype file lists a "Flowcell_Lane" annotation value
 * for the taxon that the imputed genotype file does not have.
 *
 * @param taxonName name of a taxon taken from the raw genotype file
 * @return {@code true} if the taxon is missing from the imputed file or at least
 *         one raw flowcell lane is absent from the imputed taxon's lanes;
 *         {@code false} otherwise
 * @throws IllegalStateException if the raw genotype file yields no Taxon for the name
 */
private boolean flowcellLaneAdded(String taxonName) {
    Taxon rawTaxon = HDF5Utils.getTaxon(rawGenosReader, taxonName);
    if (rawTaxon == null) {
        throw new IllegalStateException("No corresponding Taxon found in the raw genotype file for the existing taxon name: "+taxonName);
    }
    Taxon impTaxon = HDF5Utils.getTaxon(impGenosWriter, taxonName);
    if (impTaxon == null) {
        return true;
    }
    String[] rawFlowCellLanes = rawTaxon.getAnnotation().getTextAnnotation("Flowcell_Lane");
    String[] impFlowCellLanes = impTaxon.getAnnotation().getTextAnnotation("Flowcell_Lane");
    for (String rawFlowCellLane : rawFlowCellLanes) {
        boolean found = false;
        for (String impFlowCellLane : impFlowCellLanes) {
            if (impFlowCellLane.equals(rawFlowCellLane)) {
                found = true;
                break;  // stop scanning once matched (the original 'continue' here was a no-op)
            }
        }
        if (!found) {
            return true;
        }
    }
    return false;
}
/**
 * Builds a temporary HDF5 genotype file under {@code tempPath} holding the raw
 * genotype calls of the given taxa, to serve as the FILLIN input.
 *
 * The PositionList comes from the position source file when that parameter is
 * set, otherwise from the raw genotype file. A reader opened on the position
 * source file is closed before returning.
 *
 * @param modifiedTaxa taxa whose raw calls are copied into the temporary file
 * @return the temporary file's name (without the {@code tempPath} prefix), of the
 *         form {@code tempRawGenos<timestamp>.h5}
 */
private String createTempInputFileForFILLIN(TaxaList modifiedTaxa) {
    myLogger.info("Creating temporary HDF5 file to hold raw genos for modified taxa (input for FILLIN)");
    String tempRawGenosFileName = "tempRawGenos" + new SimpleDateFormat("yyyy-MM-dd_HH-mm-ss_Z").format(new Date()) + ".h5";

    IHDF5Reader posListReader = null;
    try {
        PositionList positionList;
        if (positionSourceHDF5GenoFile() == null) {
            positionList = PositionListBuilder.getInstance(rawGenosReader);
        } else {
            posListReader = HDF5Factory.openForReading(positionSourceHDF5GenoFile());
            positionList = PositionListBuilder.getInstance(posListReader);
        }
        GenotypeTableBuilder gtb = GenotypeTableBuilder.getTaxaIncremental(positionList, tempPath+tempRawGenosFileName);
        for (Taxon modTaxon : modifiedTaxa) {
            gtb.addTaxon(modTaxon, HDF5Utils.getHDF5GenotypesCalls(rawGenosReader, modTaxon.getName()));
        }
        gtb.build();
    } finally {
        // Closed only after build(), in case the PositionList reads from the
        // reader on demand rather than eagerly.
        if (posListReader != null) {
            posListReader.close();
        }
    }
    return tempRawGenosFileName;
}
/**
 * Imputes the taxa in the temporary raw genotype file with
 * FILLINImputationPlugin, using the configured donor directory and preferred
 * haplotype size.
 *
 * @param tempInFile name (relative to {@code tempPath}) of the temporary raw genotype file
 * @return name (relative to {@code tempPath}) of the temporary imputed genotype
 *         file: the input name with its first "Raw" replaced by "Imp"
 */
private String runFILLIN(String tempInFile) {
    myLogger.info("Running FILLIN on the modified taxa using default paramenters (preferredHaplotypeSize:"+preferredHaplotypeSize()+")");

    final String tempImpGenosFileName = tempInFile.replaceFirst("Raw", "Imp");
    final String fillinInputPath = tempPath + tempInFile;
    final String fillinOutputPath = tempPath + tempImpGenosFileName;

    FILLINImputationPlugin fillin = new FILLINImputationPlugin()
            .targetFile(fillinInputPath)
            .outputFilename(fillinOutputPath)
            .donorDir(donorDir())
            .preferredHaplotypeSize(preferredHaplotypeSize());
    fillin.performFunction(null);

    return tempImpGenosFileName;
}
/**
 * Copies every taxon of the temporary imputed file into the target imputed
 * genotype file: taxa not yet present are added with their calls; taxa already
 * present get their annotations merged and their calls replaced.
 *
 * The reader on the temporary file is closed before returning, so the file is
 * no longer held open when it is deleted afterwards.
 *
 * @param tempImpFile name (relative to {@code tempPath}) of the temporary imputed genotype file
 */
private void replaceTaxaInImputedFile(String tempImpFile) {
    myLogger.info("Replacing modified taxa in the target file containing cumulative, imputed genotypes");
    IHDF5Reader impGenosReader = HDF5Factory.openForReading(tempPath+tempImpFile);
    try {
        List<String> impTaxaNames = HDF5Utils.getAllTaxaNames(impGenosReader);
        for (String taxonName : impTaxaNames) {
            Taxon impTaxon = HDF5Utils.getTaxon(impGenosReader, taxonName);
            byte[] genoCalls = HDF5Utils.getHDF5GenotypesCalls(impGenosReader, taxonName);
            Taxon origTaxon = HDF5Utils.getTaxon(impGenosWriter, taxonName);
            if (origTaxon == null) {
                // brand-new taxon: add it and its calls
                HDF5Utils.addTaxon(impGenosWriter, impTaxon);
                HDF5Utils.writeHDF5GenotypesCalls(impGenosWriter, taxonName, genoCalls);
            } else {
                // existing taxon: merge annotations, overwrite calls
                Taxon modTaxon = updateTaxonAnnotations(origTaxon, impTaxon);
                HDF5Utils.replaceTaxonAnnotations(impGenosWriter, modTaxon);
                HDF5Utils.replaceHDF5GenotypesCalls(impGenosWriter, taxonName, genoCalls);
            }
        }
    } finally {
        impGenosReader.close();
    }
}
/**
 * Builds a copy of {@code origTaxon} extended with every annotation key/value
 * pair of {@code newTaxon} that {@code origTaxon} does not already carry.
 *
 * A key may have several values (e.g. "Flowcell_Lane" is read as an array
 * elsewhere in this class), so the pairs to add are grouped as key to list of
 * values. A plain key-to-value map would keep only one new value per key.
 *
 * @param origTaxon taxon as currently stored in the target imputed file
 * @param newTaxon freshly imputed taxon carrying possibly additional annotations
 * @return a new Taxon based on {@code origTaxon} plus the missing annotations
 */
private Taxon updateTaxonAnnotations(Taxon origTaxon, Taxon newTaxon) {
    Map.Entry<String, String>[] allNewAnnos = newTaxon.getAnnotation().getAllAnnotationEntries();
    Map<String, List<String>> annosToAdd = new HashMap<>();
    for (Map.Entry<String, String> newAnno : allNewAnnos) {
        String key = newAnno.getKey();
        String value = newAnno.getValue();
        if (origTaxon.getAnnotation().isAnnotatedWithValue(key, value)) {
            continue;  // already present on the original taxon
        }
        List<String> values = annosToAdd.get(key);
        if (values == null) {
            values = new ArrayList<>();
            annosToAdd.put(key, values);
        }
        // identical key/value pairs are still added only once
        if (!values.contains(value)) {
            values.add(value);
        }
    }
    Taxon.Builder modTaxonBuilder = new Taxon.Builder(origTaxon);
    for (Map.Entry<String, List<String>> annoToAdd : annosToAdd.entrySet()) {
        for (String value : annoToAdd.getValue()) {
            modTaxonBuilder.addAnno(annoToAdd.getKey(), value);
        }
    }
    return modTaxonBuilder.build();
}
/**
 * Deletes the two temporary HDF5 files created during a run.
 *
 * @param tempInFile name (relative to {@code tempPath}) of the temporary raw genotype file
 * @param tempOutFile name (relative to {@code tempPath}) of the temporary imputed genotype file
 * @throws IllegalStateException if either file cannot be deleted; the underlying
 *         exception is attached as the cause
 */
private void deleteTemporaryFiles(String tempInFile, String tempOutFile) {
    myLogger.info("Deleting the temporary HDF5 files");
    try {
        Files.delete(Paths.get(tempPath+tempInFile));
    } catch (Exception e) {
        // keep the original failure as the cause so its stack trace is not lost
        throw new IllegalStateException("Can't delete temporary HDF5 raw geno file: "+e, e);
    }
    try {
        Files.delete(Paths.get(tempPath+tempOutFile));
    } catch (Exception e) {
        throw new IllegalStateException("Can't delete temporary HDF5 imputed geno file: "+e, e);
    }
}
/**
 * This plugin supplies no icon.
 *
 * @return always {@code null}
 */
@Override
public ImageIcon getIcon() {
return null;
}
/**
 * Short label for this plugin.
 *
 * @return the fixed text "Update imputed genotypes"
 */
@Override
public String getButtonName() {
return "Update imputed genotypes";
}
/**
 * One-line explanation of what this plugin does.
 *
 * @return the fixed tool tip text
 */
@Override
public String getToolTipText() {
return "Update imputed genotypes file based on modified/new taxa in raw genotypes file";
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// GeneratePluginCode.generate(ReImputeUpdatedTaxaByFILLINPlugin.class);
// }
/**
 * Convenience method to run plugin with one return object.
 *
 * {@code processData} in this class returns {@code null}; if
 * {@code performFunction} hands that back unchanged there is nothing to unwrap,
 * so {@code null} is returned instead of failing with a NullPointerException.
 *
 * @param input data set forwarded to {@code performFunction}
 * @return the data of the first element of the result, or {@code null} when
 *         {@code performFunction} produced no result
 */
public DataSet runPlugin(DataSet input) {
    DataSet result = performFunction(input);
    if (result == null) {
        return null;
    }
    return (DataSet) result.getData(0).getData();
}
/**
 * Current value of the {@code raw} parameter: the input, unfinished HDF5
 * (*.h5) file containing raw (unimputed) genotypes.
 *
 * @return Raw HDF5 Genotype File
 */
public String rawHDF5GenotypeFile() {
return rawHDF5GenotypeFile.value();
}
/**
 * Set Raw HDF5 Genotype File. Input, unfinished HDF5
 * (*.h5) file containing raw (unimputed) genotypes.
 *
 * The field is replaced by a new PluginParameter constructed from the
 * current one and {@code value}.
 *
 * @param value Raw HDF5 Genotype File
 *
 * @return this plugin, so calls can be chained
 */
public ReImputeUpdatedTaxaByFILLINPlugin rawHDF5GenotypeFile(String value) {
rawHDF5GenotypeFile = new PluginParameter<>(rawHDF5GenotypeFile, value);
return this;
}
/**
 * Current value of the {@code imp} parameter: the target, unfinished HDF5
 * (*.h5) file containing imputed genotypes to be updated. Its directory is
 * also where this plugin places its temporary files.
 *
 * @return Imputed HDF5 Genotype File
 */
public String imputedHDF5GenotypeFile() {
return imputedHDF5GenotypeFile.value();
}
/**
 * Set Imputed HDF5 Genotype File. Target, unfinished
 * HDF5 (*.h5) file containing imputed genotypes to be
 * updated.
 *
 * The field is replaced by a new PluginParameter constructed from the
 * current one and {@code value}.
 *
 * @param value Imputed HDF5 Genotype File
 *
 * @return this plugin, so calls can be chained
 */
public ReImputeUpdatedTaxaByFILLINPlugin imputedHDF5GenotypeFile(String value) {
imputedHDF5GenotypeFile = new PluginParameter<>(imputedHDF5GenotypeFile, value);
return this;
}
/**
 * Current value of the {@code d} parameter: the directory containing donor
 * haplotype files from output of the FILLINFindHaplotypesPlugin. All files
 * with '.gc' in the filename will be read in, only those with matching
 * sites are used. The value is handed to FILLINImputationPlugin.
 *
 * @return Donor Dir
 */
public String donorDir() {
return donorDir.value();
}
/**
 * Set Donor Dir. Directory containing donor haplotype
 * files from output of the FILLINFindHaplotypesPlugin.
 * All files with '.gc' in the filename will be read in,
 * only those with matching sites are used.
 *
 * The field is replaced by a new PluginParameter constructed from the
 * current one and {@code value}.
 *
 * @param value Donor Dir
 *
 * @return this plugin, so calls can be chained
 */
public ReImputeUpdatedTaxaByFILLINPlugin donorDir(String value) {
donorDir = new PluginParameter<>(donorDir, value);
return this;
}
/**
 * Current value of the {@code pos} parameter: a finished (built) HDF5 (*.h5)
 * file to be used as a PositionList source (containing a small number of
 * [ignored] taxa). When this is {@code null} the PositionList is taken from
 * the raw genotype file instead.
 *
 * @return Position Source HDF5 Geno File, possibly {@code null}
 */
public String positionSourceHDF5GenoFile() {
return positionSourceHDF5GenoFile.value();
}
/**
 * Set Position Source HDF5 Geno File. Finished (built)
 * HDF5 (*.h5) file to be used as a PositionList source
 * (containing a small number of [ignored] taxa).
 *
 * The field is replaced by a new PluginParameter constructed from the
 * current one and {@code value}.
 *
 * @param value Position Source HDF5 Geno File
 *
 * @return this plugin, so calls can be chained
 */
public ReImputeUpdatedTaxaByFILLINPlugin positionSourceHDF5GenoFile(String value) {
positionSourceHDF5GenoFile = new PluginParameter<>(positionSourceHDF5GenoFile, value);
return this;
}
/**
 * Current value of the {@code hapSize} parameter: the preferred haplotype
 * block size in sites (use same as in FILLINFindHaplotypesPlugin). The value
 * is handed to FILLINImputationPlugin.
 *
 * @return Preferred haplotype size
 */
public Integer preferredHaplotypeSize() {
return preferredHaplotypeSize.value();
}
/**
 * Set Preferred haplotype size. Preferred haplotype block
 * size in sites (use same as in FILLINFindHaplotypesPlugin).
 *
 * The field is replaced by a new PluginParameter constructed from the
 * current one and {@code value}.
 *
 * @param value Preferred haplotype size
 *
 * @return this plugin, so calls can be chained
 */
public ReImputeUpdatedTaxaByFILLINPlugin preferredHaplotypeSize(Integer value) {
preferredHaplotypeSize = new PluginParameter<>(preferredHaplotypeSize, value);
return this;
}
/**
 * Current value of the {@code kt} parameter: whether to keep the temporary
 * hdf5 genotype files generated by this plugin (raw and imputed) rather than
 * deleting them at the end of a run.
 *
 * @return Keep Temp Genotypes
 */
public Boolean keepTempGenotypes() {
return keepTempGenotypes.value();
}
/**
 * Set Keep Temp Genotypes. Keep the temporary hdf5 genotype
 * files generated by this plugin (raw and imputed) rather
 * than deleting them.
 *
 * The field is replaced by a new PluginParameter constructed from the
 * current one and {@code value}.
 *
 * @param value Keep Temp Genotypes
 *
 * @return this plugin, so calls can be chained
 */
public ReImputeUpdatedTaxaByFILLINPlugin keepTempGenotypes(Boolean value) {
keepTempGenotypes = new PluginParameter<>(keepTempGenotypes, value);
return this;
}
}