net.maizegenetics.pangenome.processAssemblyGenomes.CreateContigFastaFromAssemblyGenomePlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
/**
*
*/
package net.maizegenetics.pangenome.processAssemblyGenomes;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
/**
* The sequence for each chromosome in the genome fasta file will be split based on
* N's. Read until an N is encountered, write the sequence as its own contig
* in the contig fasta output file. Skip past the N's, then start the next contig.
*
* This algorithm is subject to change during pipeline testing.
*
* Each sequence idline will be
* >assemblyChrom:assemblyStart:assemblyEnd
*
* INPUT:
* 1. The assembly genome file to process
*
* OUTPUT:
* 1. The newly created fasta file of contigs.
*
* @author lcj34
*
*/
public class CreateContigFastaFromAssemblyGenomePlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(CreateContigFastaFromAssemblyGenomePlugin.class);

    // The parameters are declared with an explicit String type argument so that
    // value() can be returned from the String getters below without a raw type.
    private PluginParameter<String> myGenomeFile = new PluginParameter.Builder<String>("genomeFile", null, String.class).guiName("Assembly Genome").required(true).inFile()
            .description("Input assembly genome file from which to pull sequence").build();
    private PluginParameter<String> myAssembly = new PluginParameter.Builder<String>("assembly", null, String.class).guiName("Assembly Name").required(true)
            .description("Name of assembly to append to output files").build();
    private PluginParameter<String> myOutputDir = new PluginParameter.Builder<String>("o", null, String.class).guiName("Output Directory").required(true).outDir()
            .description("Output directory including trailing / for writing fasta and fastq files to use as input for BWA-MEM and other tools").build();

    public CreateContigFastaFromAssemblyGenomePlugin() {
        super(null, false);
    }

    public CreateContigFastaFromAssemblyGenomePlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public CreateContigFastaFromAssemblyGenomePlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    /**
     * Reads the assembly genome fasta file and writes every N-free stretch of
     * each sequence as its own contig. The output file name is
     * {@code outputDir() + assembly() + "_asContigs.fa"}; the output directory
     * is concatenated as-is, so it must already end with a path separator.
     *
     * @param input not used by this plugin
     *
     * @return always {@code null}
     *
     * @throws IllegalStateException if anything fails while reading the genome
     *         file or writing the contig file; the original exception is
     *         attached as the cause
     */
    @Override
    public DataSet processData(DataSet input) {
        myLogger.info("In processData");
        String fastaFile = outputDir() + assembly() + "_asContigs.fa";
        try (BufferedReader rd = Utils.getBufferedReader(genomeFile());
             BufferedWriter fastaBW = Utils.getBufferedWriter(fastaFile)) {
            String fileLine;
            // Sequence lines seen before any id line are reported under "-1".
            String fastaChr = "-1";
            StringBuilder sequenceSB = new StringBuilder();
            // Current decision is to include scaffolds. If that decision changes,
            // filter them out here.
            while ((fileLine = rd.readLine()) != null) {
                if (fileLine.startsWith(">")) {
                    // A new id line closes the previous record: split it into contigs.
                    if (sequenceSB.length() > 0) { // all chroms
                        createContigs(fastaChr, sequenceSB.toString().toUpperCase(), fastaBW);
                    }
                    sequenceSB.setLength(0);
                    fastaChr = parseSequenceName(fileLine);
                } else {
                    sequenceSB.append(fileLine); // keep appending until we hit next id line.
                }
            }
            // process last one
            if (sequenceSB.length() > 0) {
                createContigs(fastaChr, sequenceSB.toString().toUpperCase(), fastaBW);
            }
        } catch (Exception exc) {
            myLogger.error(exc.getMessage(), exc);
            // Chain the cause so the original stack trace is not lost to callers.
            throw new IllegalStateException("CreateContigFastaFromAssemblyGenomePlugin: error reading or writing file " + exc.getMessage(), exc);
        }
        return null;
    }

    /**
     * Extracts the sequence name from a fasta id line: every '>' character is
     * removed and the text up to (not including) the first space is returned.
     * An id line with nothing before the first space yields an empty name
     * rather than an exception.
     *
     * @param idLine the fasta id line, starting with '>'
     *
     * @return the sequence name, possibly empty
     */
    private static String parseSequenceName(String idLine) {
        String stripped = idLine.replace(">", "");
        int firstSpace = stripped.indexOf(' ');
        return (firstSpace < 0) ? stripped : stripped.substring(0, firstSpace);
    }

    /**
     * Chops a sequence into contigs, splitting on runs of 'N', and writes each
     * contig to the fasta writer. Each id line has the form
     * {@code >chrom:start:end} where start and end are the 1-based, inclusive
     * positions of the contig within {@code sequence}.
     *
     * @param chrom name used as the first field of each id line
     * @param sequence the sequence to split; only the upper-case 'N' is
     *        treated as a separator, so callers upper-case the sequence first
     * @param fastaBW writer receiving the contigs
     */
    private static void createContigs(String chrom, String sequence,
            BufferedWriter fastaBW) {
        int seqLength = sequence.length();
        myLogger.info(" createContigs, processing chrom : " + chrom + ", length " + seqLength);
        int idx = 0;
        while (idx < seqLength) {
            // Skip past the N's separating contigs.
            while (idx < seqLength && sequence.charAt(idx) == 'N') {
                idx++;
            }
            if (idx >= seqLength) {
                break; // sequence ended in N's, nothing left to write
            }
            int contigStart = idx; // 0-based index of first base in this contig
            int contigEnd = sequence.indexOf('N', contigStart); // 0-based, exclusive
            if (contigEnd < 0) {
                contigEnd = seqLength; // contig runs to the end of the sequence
            }
            // +1 converts the 0-based start to 1-based; the exclusive 0-based end
            // is numerically equal to the 1-based inclusive end.
            String idline = ">" + chrom + ":" + (contigStart + 1) + ":" + contigEnd;
            writeFasta(idline, sequence.substring(contigStart, contigEnd), fastaBW);
            idx = contigEnd;
        }
    }

    /**
     * Writes one fasta record (id line, then sequence on a single line).
     *
     * @param idline the complete id line, including the leading '>'
     * @param sequence the contig sequence
     * @param fastaBW writer receiving the record
     *
     * @throws IllegalStateException if the write fails; the IOException is
     *         attached as the cause
     */
    private static void writeFasta(String idline, String sequence, BufferedWriter fastaBW) {
        try {
            fastaBW.write(idline + "\n" + sequence + "\n");
        } catch (IOException ioe) {
            myLogger.error(ioe.getMessage(), ioe);
            throw new IllegalStateException("CreateContigFastaFromAssemblyGenomePlugin: error writing fasta file " + ioe.getMessage(), ioe);
        }
    }

    @Override
    public String getButtonName() {
        return ("Contig Fasta from Assembly Genome");
    }

    @Override
    public String getToolTipText() {
        return ("Contig fasta from Assembly Genome");
    }

    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
    // public static void main(String[] args) {
    //     GeneratePluginCode.generate(CreateContigFastaFromAssemblyGenomePlugin.class);
    // }

    // Convenience method to run plugin with one return object.
    // TODO: Replace with specific type.
    // public <Type> runPlugin(DataSet input) {
    //     return (<Type>) performFunction(input).getData(0).getData();
    // }

    /**
     * Input assembly genome file from which to pull sequence
     *
     * @return Assembly Genome
     */
    public String genomeFile() {
        return myGenomeFile.value();
    }

    /**
     * Set Assembly Genome. Input assembly genome file from
     * which to pull sequence
     *
     * @param value Assembly Genome
     *
     * @return this plugin
     */
    public CreateContigFastaFromAssemblyGenomePlugin genomeFile(String value) {
        myGenomeFile = new PluginParameter<>(myGenomeFile, value);
        return this;
    }

    /**
     * Name of assembly to append to output files
     *
     * @return Assembly Name
     */
    public String assembly() {
        return myAssembly.value();
    }

    /**
     * Set Assembly Name. Name of assembly to append to
     * output files
     *
     * @param value Assembly Name
     *
     * @return this plugin
     */
    public CreateContigFastaFromAssemblyGenomePlugin assembly(String value) {
        myAssembly = new PluginParameter<>(myAssembly, value);
        return this;
    }

    /**
     * Output directory including trailing / for writing fasta
     * and fastq files to use as input for BWA-MEM and other tools
     *
     * @return Output Directory
     */
    public String outputDir() {
        return myOutputDir.value();
    }

    /**
     * Set Output Directory. Output directory including trailing /
     * for writing fasta and fastq files to use as input for
     * BWA-MEM and other tools
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public CreateContigFastaFromAssemblyGenomePlugin outputDir(String value) {
        myOutputDir = new PluginParameter<>(myOutputDir, value);
        return this;
    }

    /**
     * This plugin supplies no icon.
     *
     * @return always {@code null}
     */
    @Override
    public ImageIcon getIcon() {
        return null;
    }
}