net.maizegenetics.pangenome.db_loading.AddRefRangeAsAssemblyPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.awt.Frame;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.ImageIcon;
import net.maizegenetics.pangenome.api.ConvertVariantContextToVariantInfo;
import net.maizegenetics.plugindef.GeneratePluginCode;
import org.apache.log4j.Logger;
import com.google.common.collect.Range;
import htsjdk.variant.variantcontext.VariantContext;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.pangenome.api.CreateGraphUtils;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.pangenome.processAssemblyGenomes.AssemblyProcessingUtils;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import static net.maizegenetics.pangenome.db_loading.DBLoadingUtils.createPathNodesForGameteGrp;
/**
* This class is fairly obsolete. It was used before the graph was able to take multiple methods
* as parameters.
*
* This class was created to load a reference genome as an assembly. The haplotype method name is
* supplied by the caller and should match the method used to load assemblies (e.g. "mummer4", which
* is the assembly method name at the time of this writing); the path method defaults to "mummer4_PATH".
* All the variants are reference records. The addition of these haplotypes allows for pulling
* the reference together with the assemblies under a single method name.
* (NOTE: the original sentence was left unfinished after "allows for pulling"; confirm the intended wording.)
*
* TODO: If it is deemed this class is still useful, it should be updated to create a gvcf file as is done in LoadAllIntervalsToPHGdbPlugin,
* store the gvcf file data to the genomeFileData table, and save the gvcfFileId to the anchorDataPHG object.
* @author lcj34
*
*/
public class AddRefRangeAsAssemblyPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(AddRefRangeAsAssemblyPlugin.class);

    // ---- plugin parameters (re-assigned by the fluent setters, so not final) ----

    private PluginParameter<String> refGenome = new PluginParameter.Builder<>("ref", null, String.class)
            .guiName("Reference Genome File").required(true)
            .description("Referemce Genome File for aligning against ").build();

    private PluginParameter<String> configFile = new PluginParameter.Builder<>("configFile", null, String.class)
            .guiName("Config File").required(true)
            .description("Path to config file for accessing/loading the DB")
            .build();

    private PluginParameter<String> haplotypeMethod = new PluginParameter.Builder<>("haplotypeMethod", null, String.class)
            .guiName("Haplotype Method Name").required(true)
            .description("Name of method used for processing the haplotypes. Should match the method used to load assemblies ")
            .build();

    private PluginParameter<String> pathMethod = new PluginParameter.Builder<>("pathMethod", "mummer4_PATH", String.class)
            .guiName("Path Method Name").required(false)
            .description("OPTIONAL: Name of method used to create PHG Path. Should match the assembly path method - that default is mummer4_PATH")
            .build();

    private PluginParameter<String> lineName = new PluginParameter.Builder<>("lineName", null, String.class)
            .guiName("Line Name").required(true)
            .description("Line name to be stored in the genotypes table, e.g. B73_Assembly")
            .build();

    // ---- fixed genotype/haplotype metadata: the same for every run of this plugin ----

    private final String line_data = "Reference genome stored as an assembly, identical to the reference";
    private final int ploidy = 1;
    private final int hapNumber = 0;
    private final boolean genesPhased = true;
    private final boolean chromsPhased = true;
    private final float conf = 1;

    // Reference sequence read from the "ref" parameter; (re)assigned on every processData() call.
    // Kept static and package-visible for backward compatibility with any existing users.
    static GenomeSequence myRefSequence = null;

    /** Creates a non-interactive plugin with no parent frame. */
    public AddRefRangeAsAssemblyPlugin() {
        super(null, false);
    }

    /**
     * Creates a non-interactive plugin.
     *
     * @param parentFrame parent frame handed to {@link AbstractPlugin}
     */
    public AddRefRangeAsAssemblyPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    /**
     * Creates the plugin.
     *
     * @param parentFrame   parent frame handed to {@link AbstractPlugin}
     * @param isInteractive interactive flag handed to {@link AbstractPlugin}
     */
    public AddRefRangeAsAssemblyPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    /**
     * Loads the reference sequence of every reference range in the db as a haplotype of
     * {@link #lineName()} under {@link #haplotypeMethod()}, then stores a path through those
     * haplotypes under {@link #pathName()}.
     *
     * The db access object is closed before this method returns, whether it succeeds or fails.
     *
     * @param input not used
     * @return always {@code null}
     * @throws IllegalStateException if no db connection could be obtained, or if loading the
     *                               haplotypes fails (the underlying exception is the cause)
     */
    @Override
    public DataSet processData(DataSet input) {
        long startTime = System.nanoTime();

        Connection dbConnect = DBLoadingUtils.connection(configFile(), false);
        if (dbConnect == null) {
            throw new IllegalStateException("AddRefRangeAsAssembly: no connection supplied!");
        }
        myLogger.info("AddRefRangeAsAssembly: have connection, create PHGdbAccess object");
        PHGDataWriter phg = new PHGdbAccess(dbConnect);

        try {
            myRefSequence = GenomeSequenceBuilder.instance(refGenome());
            myLogger.info("AddRefRangeAsAssembly: finished GenomeSequenceBuilder for ref genome");

            // get reference range map from API
            Map<Integer, ReferenceRange> refRangeMap = CreateGraphUtils.referenceRangeMap(dbConnect);

            // A failure here is propagated: creating a path for a gamete group whose
            // haplotypes were not (fully) loaded would only store inconsistent data.
            int gameteGrpId = loadReferenceHaplotypes(phg, refRangeMap);

            storePath(phg, dbConnect, gameteGrpId);
        } finally {
            try {
                ((PHGdbAccess) phg).close();
            } catch (Exception exc) {
                // Nothing more can be done at this point; record it and let any primary exception win.
                myLogger.error("AddRefRangeAsAssembly: error closing PHGdbAccess", exc);
            }
        }

        myLogger.info("\nFinished, TotalTime for AddRefRangeAsAssemblyPlugin was " + (System.nanoTime() - startTime) / 1e9 + " seconds");
        return null;
    }

    /**
     * Stores genotype, method, gamete-group and haplotype entries for the reference line.
     * Haplotypes are written in one putHaplotypesData call per run of consecutive reference
     * ranges that share a chromosome name.
     *
     * @param phg         db writer
     * @param refRangeMap reference range id to reference range, as returned by CreateGraphUtils
     * @return the gamete group id the haplotypes were stored under
     * @throws IllegalStateException wrapping any exception raised while loading
     */
    private int loadReferenceHaplotypes(PHGDataWriter phg, Map<Integer, ReferenceRange> refRangeMap) {
        try {
            GenoHaploData ghd = new GenoHaploData(ploidy, false, lineName(), line_data, genesPhased, chromsPhased, hapNumber, conf);
            phg.putGenoAndHaploTypeData(ghd);

            int methodId = phg.getMethodIdFromName(haplotypeMethod()); // might already be in the db
            if (methodId == 0) {
                myLogger.info("Adding method " + haplotypeMethod() + " to db methods table");
                // Assuming this will never be TEST_ASSEMBLY_HAPLOTYPES
                methodId = phg.putMethod(haplotypeMethod(), DBLoadingUtils.MethodType.ASSEMBLY_HAPLOTYPES, pluginParameters());
            }

            // Load the gamete_groups and gamete_haplotypes table
            List<String> gameteGroupList = new ArrayList<>();
            gameteGroupList.add(lineName() + "_" + hapNumber);
            phg.putGameteGroupAndHaplotypes(gameteGroupList);
            int gameteGrpId = phg.getGameteGroupIDFromTaxaList(gameteGroupList);

            // When ordered numerically, the refRangeIds are in order for chrom 1-10 for
            // all the anchors, followed by all the interanchors. So you'll have a stream of
            // them for chroms 1-10, then it will repeat (assuming you are on a species that has
            // 10 chromosomes).
            Map<Integer, AnchorDataPHG> anchorSequences = new HashMap<>();
            String prevChrom = "";
            String curChrom = "";
            for (Map.Entry<Integer, ReferenceRange> entry : refRangeMap.entrySet()) {
                int refRangeID = entry.getKey();
                ReferenceRange refRange = entry.getValue();
                int anchorStart = refRange.start();
                int anchorEnd = refRange.end();
                Chromosome chr = refRange.chromosome();
                curChrom = chr.getName();

                if (!curChrom.equals(prevChrom)) {
                    if (!anchorSequences.isEmpty()) {
                        // chromosome changed: flush what was collected for the previous one
                        myLogger.info("Calling putHaplotypesData for chrom " + prevChrom);
                        phg.putHaplotypesData(gameteGrpId, methodId, anchorSequences, prevChrom, -1, -1);
                        anchorSequences.clear();
                    }
                    prevChrom = curChrom;
                }

                String anchorString = myRefSequence.genotypeAsString(chr, anchorStart, anchorEnd);
                Position intervalStart = new GeneralPosition.Builder(chr, anchorStart).build();
                Position intervalEnd = new GeneralPosition.Builder(chr, anchorEnd).build();
                Range<Position> intervalRange = Range.closed(intervalStart, intervalEnd);

                // Changes for PHG-485: assembly coordinates are the same as the reference
                // coordinates. AsmFileId is the same as the reference asmFileId (genomeFileId)
                // which is always 1 as reference is loaded before anything else when the db is created.
                AnchorDataPHG adata = new AnchorDataPHG(intervalRange, chr.getName(), anchorStart, anchorEnd, ".",
                        refGenome(), anchorString, 1, -1);
                anchorSequences.put(refRangeID, adata);
            }

            if (!anchorSequences.isEmpty()) {
                // load haplotypes for the last chrom
                myLogger.info("calling putHaplotypesData for last chrom: " + curChrom);
                // If the reference has already been created, then perhaps the genomeFileId and gvcfFileId fields
                // should be determined by pulling the ref data. You'd need to get the ref genoid based on the
                // line_name from genotypes that has is_reference set to true. Then grab the id based on genoid
                // and type from the genomeFileData table.
                phg.putHaplotypesData(gameteGrpId, methodId, anchorSequences, curChrom, -1, -1);
            }
            return gameteGrpId;
        } catch (Exception exc) {
            throw new IllegalStateException(
                    "AddRefRangeAsAssembly: error loading reference haplotypes for line " + lineName(), exc);
        }
    }

    /**
     * Builds the path through the haplotypes of the given gamete group and stores it for
     * {@link #lineName()} under the path method {@link #pathName()}.
     *
     * @param phg         db writer
     * @param dbConnect   connection used to look up the haplotype ids
     * @param gameteGrpId gamete group whose haplotypes make up the path
     */
    private void storePath(PHGDataWriter phg, Connection dbConnect, int gameteGrpId) {
        List<Integer> hapidList = createPathNodesForGameteGrp(lineName(), dbConnect, gameteGrpId);
        byte[] pathBytes = DBLoadingUtils.encodePathsFromIntArray(hapidList);

        HashMap<String, String> methodParams = new HashMap<>();
        methodParams.put("notes", "path created when assembly was loaded");

        // Assuming here we never create refRanges as assemblies as a test
        phg.putPathsData(pathName(), methodParams, lineName(), null, pathBytes, false);
        myLogger.info("Paths added to db for reference as assembly");
    }

    /**
     * Developer utility: runs GeneratePluginCode for this plugin class.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        GeneratePluginCode.generate(AddRefRangeAsAssemblyPlugin.class);
    }

    /** @return always {@code null}; this plugin has no icon */
    @Override
    public ImageIcon getIcon() {
        return null;
    }

    /** @return label shown for this plugin */
    @Override
    public String getButtonName() {
        return ("Add reference ranges as assembly");
    }

    /** @return tool tip shown for this plugin */
    @Override
    public String getToolTipText() {
        return ("Load reference sequence for each reference range as assembly haplotypes");
    }

    /**
     * Reference Genome File for aligning against
     *
     * @return Reference Genome File
     */
    public String refGenome() {
        return refGenome.value();
    }

    /**
     * Set Reference Genome File. Reference Genome File for
     * aligning against
     *
     * @param value Reference Genome File
     *
     * @return this plugin
     */
    public AddRefRangeAsAssemblyPlugin refGenome(String value) {
        refGenome = new PluginParameter<>(refGenome, value);
        return this;
    }

    /**
     * Path to config file for accessing/loading the DB
     *
     * @return config file
     */
    public String configFile() {
        return configFile.value();
    }

    /**
     * Set config file for db access
     *
     * @param value config file
     *
     * @return this plugin
     */
    public AddRefRangeAsAssemblyPlugin configFile(String value) {
        configFile = new PluginParameter<>(configFile, value);
        return this;
    }

    /**
     * Name of the haplotype method to be stored in the db. Should match
     * the method used to load assemblies.
     *
     * @return Haplotype Method Name
     */
    public String haplotypeMethod() {
        return haplotypeMethod.value();
    }

    /**
     * Set Haplotype Method Name
     *
     * @param value Haplotype Method Name
     *
     * @return this plugin
     */
    public AddRefRangeAsAssemblyPlugin haplotypeMethod(String value) {
        haplotypeMethod = new PluginParameter<>(haplotypeMethod, value);
        return this;
    }

    /**
     * Name of method used to create PHG Path. Should match
     * the assembly path method - that default is mummer4_PATH
     *
     * @return Path Method Name
     */
    public String pathName() {
        return pathMethod.value();
    }

    /**
     * Set Path Method Name. Name of method used to create
     * PHG Path. Should match the assembly path method -
     * that default is mummer4_PATH
     *
     * @param value Path Method Name
     *
     * @return this plugin
     */
    public AddRefRangeAsAssemblyPlugin pathName(String value) {
        pathMethod = new PluginParameter<>(pathMethod, value);
        return this;
    }

    /**
     * Line name to be stored in the genotypes table, e.g.
     * B73_Assembly
     *
     * @return Line Name
     */
    public String lineName() {
        return lineName.value();
    }

    /**
     * Set Line Name. Line name to be stored in the genotypes
     * table, e.g. B73_Assembly
     *
     * @param value Line Name
     *
     * @return this plugin
     */
    public AddRefRangeAsAssemblyPlugin lineName(String value) {
        lineName = new PluginParameter<>(lineName, value);
        return this;
    }
}