/*
 * Source listing of net.maizegenetics.pangenome.db_loading.SplitFastaByChromPlugin,
 * from the "phg" artifact (PHG - Practical Haplotype Graph) as published for
 * Maven / Gradle / Ivy.
 */
/**
*
*/
package net.maizegenetics.pangenome.db_loading;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;
/**
 * Splits a fasta file into one output file per id line (normally one per chromosome).
 *
 * <p>By default the id line is assumed to name the chromosome in one of the forms
 * {@code >1}, {@code >chr1} or {@code >chromosome1} (case insensitive). All of these are
 * written out with an id line holding just the number, e.g. {@code >1} or {@code >2}.
 *
 * <p>The "name" parameter is the basis of each output file name. To it are appended
 * "chr", the parsed chromosome name and ".fa". For example, if the user gives "w22" as the
 * name, the plugin writes w22chr1.fa, w22chr2.fa, etc. into the output directory.
 *
 * <p>The isGca parameter: assemblies starting with GCA have long text in the id line, with
 * the chromosome stuck in the middle. When isGca is true these id lines are parsed with
 * {@link #parseGCA(String)}. Other unusual id lines are not supported and may need to be
 * corrected manually before running this plugin.
 *
 * <p>Each set of assemblies that arrives tends to have a different id line signature, so
 * this class also carries a few alternative parsers ({@link #parseBauerFlint(String)},
 * {@link #parseChromAsRomanNumerals(String)}, {@link #parseChromFirstTab(String)}) that are
 * not wired to a parameter. Consider this code the "base" plugin: if the id lines do not
 * follow the chr/chromosome/number or GCA conventions, either run an altered version of this
 * plugin or fix the id lines first.
 *
 * @author lcj34
 */
public class SplitFastaByChromPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(SplitFastaByChromPlugin.class);

    private PluginParameter<String> fasta = new PluginParameter.Builder<>("fasta", null, String.class)
            .guiName("Fasta File ")
            .required(true)
            .inFile()
            .description("Fasta File to split by chromosome ")
            .build();

    private PluginParameter<String> name = new PluginParameter.Builder<>("name", null, String.class)
            .guiName("Name ")
            .required(true)
            .description("Name to give each file, e.g w22. To this name will be appended 'chr' plus the chrom number plus .fa")
            .build();

    private PluginParameter<String> outputDir = new PluginParameter.Builder<>("outputDir", null, String.class)
            .guiName("Output Directory")
            .required(true)
            .outDir()
            .description("Path to write the split files")
            .build();

    private PluginParameter<Boolean> isGca = new PluginParameter.Builder<>("isGca", false, Boolean.class)
            .guiName("Is GCA")
            .required(false)
            .description("GCA fastas have long text as idLines. These fasta will have their id lines specially parsed to extract the chromosome number.")
            .build();

    // Used by parseGCA to skip the "chromosome" token and the whitespace that follows it.
    // Compiled once per plugin instance rather than once per id line.
    Pattern chrPattern = Pattern.compile("([^\\s]+)?(\\s)+");

    public SplitFastaByChromPlugin() {
        super(null, false);
    }

    public SplitFastaByChromPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    public SplitFastaByChromPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }

    /**
     * Reads the input fasta and writes one file per id line that is followed by sequence.
     *
     * <p>Sequence lines are concatenated, so each output file holds its sequence on a single
     * line. Any sequence that appears before the first id line is written under the
     * chromosome name "none".
     *
     * @param input not used
     * @return always {@code null}
     * @throws IllegalStateException if the fasta cannot be read or an output file cannot be
     *         written; the original exception is attached as the cause
     */
    @Override
    public DataSet processData(DataSet input) {
        String chromFileBase = buildFileBase(outputDir(), name());

        try (BufferedReader br = Utils.getBufferedReader(fasta())) {
            StringBuilder sequence = new StringBuilder();
            String currChrom = "none";
            String line;
            while ((line = br.readLine()) != null) {
                if (line.startsWith(">")) {
                    // A new id line closes out the previous record.
                    if (sequence.length() > 0) {
                        writeChromFasta(currChrom, chromFileBase, sequence.toString());
                        sequence.setLength(0);
                    }
                    // For assemblies with other id line conventions, swap in parseBauerFlint,
                    // parseChromAsRomanNumerals or parseChromFirstTab (e.g. for GWHAACS00000000) here.
                    currChrom = isGca() ? parseGCA(line) : parseChromosome(line);
                    myLogger.info("processing chrom with parsedName as " + currChrom);
                } else {
                    sequence.append(line);
                }
            }
            // Flush the final record; there is no trailing id line to trigger it.
            if (sequence.length() > 0) {
                writeChromFasta(currChrom, chromFileBase, sequence.toString());
            }
        } catch (Exception exc) {
            String message = "SplitFastaByChromPlugin:processData - error splitting fasta file "
                    + fasta() + ": " + exc.getMessage();
            myLogger.error(message, exc);
            throw new IllegalStateException(message, exc);
        }
        return null;
    }

    /**
     * Joins the output directory and the base file name, adding a path separator when the
     * directory does not already end with one. Without this a directory given as "/tmp/out"
     * would produce files named "/tmp/outw22chr1.fa" instead of "/tmp/out/w22chr1.fa".
     */
    private static String buildFileBase(String dir, String baseName) {
        if (dir == null || dir.isEmpty()) {
            return baseName;
        }
        boolean hasSeparator = dir.endsWith("/") || dir.endsWith("\\");
        return hasSeparator ? dir + baseName : dir + "/" + baseName;
    }

    /**
     * Converts an all-integer chromosome name to its canonical decimal form, which strips
     * leading zeros (so "01" matches the maize reference name "1"). Anything that is not an
     * integer is returned unchanged.
     */
    private static String stripLeadingZeros(String chrom) {
        try {
            return String.valueOf(Integer.parseInt(chrom));
        } catch (NumberFormatException ignored) {
            // Not a plain integer (e.g. a scaffold name): keep the name as it is.
            return chrom;
        }
    }

    /**
     * Parses an id line whose chromosome is given as {@code 1}, {@code chr1} or
     * {@code chromosome1} (case insensitive) down to just the number. Other id line formats
     * need pre-processing.
     *
     * <p>The line is upper-cased, every {@code >}, "CHROMOSOME" and "CHR" is removed, and the
     * result is cut at the first space. An all-integer result has its leading zeros removed;
     * any other result (e.g. a scaffold name) is returned in upper case.
     *
     * @param chromString the fasta id line, including the leading {@code >}
     * @return the parsed chromosome name
     */
    public String parseChromosome(String chromString) {
        // Locale.ROOT keeps the case mapping independent of the machine's default locale.
        String chrS = chromString.replace(">", "").toUpperCase(Locale.ROOT);
        chrS = chrS.replace("CHROMOSOME", "");
        chrS = chrS.replace("CHR", "");
        int spaceIndex = chrS.indexOf(" ");
        if (spaceIndex > 0) {
            chrS = chrS.substring(0, spaceIndex);
        }
        return stripLeadingZeros(chrS);
    }

    /**
     * Parses id lines whose chromosome name is a Roman numeral, as in Setaria:
     * <pre>
     * &gt;IV dna:chromosome chromosome:Setaria_italica_v2.0:IV:1:40407879:1 REF
     * &gt;KQ475381 dna:supercontig supercontig:Setaria_italica_v2.0:KQ475381:1:423243:1 REF
     * </pre>
     * The text before the first space is converted with {@link #romanToDecimal(String)}. If it
     * is not a Roman numeral it is returned as is. Note that any name made up solely of the
     * letters I, V, X, L, C, D and M is treated as a numeral (for example "X" becomes "10").
     *
     * @param chromString the fasta id line, including the leading {@code >}
     * @return the decimal value of the Roman numeral as a string, or the unconverted name
     */
    public String parseChromAsRomanNumerals(String chromString) {
        String chrS = chromString.replace(">", "");
        int spaceIndex = chrS.indexOf(" ");
        if (spaceIndex > 0) {
            chrS = chrS.substring(0, spaceIndex);
        }
        Integer intValue = romanToDecimal(chrS);
        return intValue == null ? chrS : String.valueOf(intValue);
    }

    /**
     * Parses id lines from Eva Bauer's 4 flint lines, which look like {@code DK105_chr_1} and
     * {@code DK105_scaffold701}. The name is the text after the last underscore, cut at the
     * first space, with leading zeros removed when it is an integer. Case is preserved because
     * the name must match what is stored in the fasta.
     *
     * @param chromString the fasta id line, including the leading {@code >}
     * @return the parsed chromosome or scaffold name, e.g. "1" or "scaffold701"
     */
    public String parseBauerFlint(String chromString) {
        String chrS = chromString.replace(">", "");
        chrS = chrS.substring(chrS.lastIndexOf("_") + 1);
        int spaceIndex = chrS.indexOf(" ");
        if (spaceIndex > 0) {
            chrS = chrS.substring(0, spaceIndex);
        }
        return stripLeadingZeros(chrS);
    }

    /**
     * Parses id lines of the style used by GWHAACS00000000
     * (http://bigd.big.ac.cn/gwh/Assembly/123/show), e.g.
     * <pre>
     * &gt;GWHAACS00000001 Chromosome 1 Complete=T Circular=F OriSeqID=chr1 Len=301378628
     * </pre>
     * The code treats the tab character as the field delimiter: everything up to and including
     * the first tab is dropped. If what remains does not start with "Chromosome" it is returned
     * unchanged. Otherwise the text between the first space and the next tab is taken as the
     * chromosome name, with leading zeros removed when it is an integer.
     *
     * @param chromString the fasta id line, including the leading {@code >}
     * @return the parsed chromosome name, or the remainder of the line for non-chromosome records
     */
    public String parseChromFirstTab(String chromString) {
        String chrS = chromString.replace(">", "");
        chrS = chrS.substring(chrS.indexOf("\t") + 1);
        // Only chromosome records get their name reduced to a number.
        if (!chrS.startsWith("Chromosome")) {
            return chrS;
        }
        int spaceIndex = chrS.indexOf(" ");
        if (spaceIndex > 0) {
            chrS = chrS.substring(spaceIndex + 1);
            int nextTab = chrS.indexOf("\t");
            if (nextTab > 0) {
                chrS = chrS.substring(0, nextTab);
            }
        }
        return stripLeadingZeros(chrS);
    }

    /**
     * Parses the id lines of GCA fasta files, where the chromosome follows the word
     * "chromosome" somewhere in the middle of the line and is terminated by a comma, e.g.
     * "... chromosome 1, whole genome shotgun sequence" yields "1".
     *
     * <p>If there is no comma after the chromosome, the first whitespace-delimited token after
     * "chromosome" is used. If the line does not contain "chromosome", or nothing usable follows
     * it, the id line is returned unchanged (including its leading {@code >}).
     *
     * @param chromString the fasta id line, including the leading {@code >}
     * @return the parsed chromosome name, or {@code chromString} itself when none can be extracted
     */
    public String parseGCA(String chromString) {
        int chromIndex = chromString.indexOf("chromosome");
        if (chromIndex <= 0) {
            return chromString;
        }

        String tail = chromString.substring(chromIndex);
        Matcher matcher = chrPattern.matcher(tail);
        if (!matcher.find()) {
            // "chromosome" is the last thing on the line: there is no name to extract.
            return chromString;
        }

        // Everything after the whitespace that follows the "chromosome" token.
        String chromName = tail.substring(matcher.end());
        int commaIndex = chromName.indexOf(',');
        if (commaIndex >= 0) {
            chromName = chromName.substring(0, commaIndex);
        } else {
            chromName = chromName.split("\\s", 2)[0];
            if (chromName.isEmpty()) {
                return chromString;
            }
        }
        return stripLeadingZeros(chromName);
    }

    /**
     * Writes one sequence to {@code outputFileBase + "chr" + chrom + ".fa"} with an id line of
     * {@code >chrom}. An existing file of the same name is replaced.
     *
     * @throws IllegalStateException if the file cannot be written
     */
    private void writeChromFasta(String chrom, String outputFileBase, String sequence) {
        String outFile = outputFileBase + "chr" + chrom + ".fa";
        try (BufferedWriter bw = Utils.getBufferedWriter(outFile)) {
            // Write the pieces directly; a chromosome is too large to copy into another buffer first.
            bw.write(">" + chrom + "\n");
            bw.write(sequence);
            bw.write("\n");
        } catch (Exception exc) {
            throw new IllegalStateException(
                    "SplitFastaByChrom:writeChromFasta - exception writing file " + outFile + ":" + exc.getMessage(),
                    exc);
        }
    }

    /**
     * Converts a Roman numeral to its decimal value. The numeral is read right to left and a
     * symbol smaller than the one to its right is subtracted (so "IV" is 4). Case is ignored.
     * The string is not checked for being a well-formed numeral beyond containing only the
     * symbols I, V, X, L, C, D and M.
     *
     * @param romanNumber the candidate Roman numeral
     * @return the decimal value, or {@code null} if the string is null, empty or contains any
     *         character that is not a Roman numeral symbol
     */
    public static Integer romanToDecimal(String romanNumber) {
        if (romanNumber == null || romanNumber.isEmpty()) {
            return null;
        }
        String romanNumeral = romanNumber.toUpperCase(Locale.ROOT);
        int decimal = 0;
        int lastNumber = 0;
        for (int x = romanNumeral.length() - 1; x >= 0; x--) {
            int value = romanSymbolValue(romanNumeral.charAt(x));
            if (value == 0) {
                // Not a roman numeral
                return null;
            }
            decimal = processDecimal(value, lastNumber, decimal);
            lastNumber = value;
        }
        return decimal;
    }

    /** Returns the value of a single upper-case Roman numeral symbol, or 0 if it is not one. */
    private static int romanSymbolValue(char symbol) {
        switch (symbol) {
            case 'M':
                return 1000;
            case 'D':
                return 500;
            case 'C':
                return 100;
            case 'L':
                return 50;
            case 'X':
                return 10;
            case 'V':
                return 5;
            case 'I':
                return 1;
            default:
                return 0;
        }
    }

    /**
     * Folds one Roman numeral symbol into a running total while scanning right to left.
     *
     * @param decimal value of the current symbol
     * @param lastNumber value of the symbol to its right (0 if there is none)
     * @param lastDecimal running total so far
     * @return the total minus {@code decimal} when the symbol to the right is larger,
     *         otherwise the total plus {@code decimal}
     */
    public static int processDecimal(int decimal, int lastNumber, int lastDecimal) {
        return lastNumber > decimal ? lastDecimal - decimal : lastDecimal + decimal;
    }

    // The getters and setters further down were auto-generated.
    // Run this main method to re-generate them.
    public static void main(String[] args) {
        GeneratePluginCode.generate(SplitFastaByChromPlugin.class);
    }

    @Override
    public ImageIcon getIcon() {
        // No icon is provided for this plugin.
        return null;
    }

    @Override
    public String getButtonName() {
        // No button name is provided for this plugin.
        return null;
    }

    @Override
    public String getToolTipText() {
        // No tool tip is provided for this plugin.
        return null;
    }

    /**
     * Fasta File to split by chromosome
     *
     * @return Fasta File
     */
    public String fasta() {
        return fasta.value();
    }

    /**
     * Set Fasta File. Fasta File to split by chromosome
     *
     * @param value Fasta File
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin fasta(String value) {
        fasta = new PluginParameter<>(fasta, value);
        return this;
    }

    /**
     * Name to give each file, e.g w22. To this name will
     * be appended 'chr' plus the chrom number plus .fa
     *
     * @return Name
     */
    public String name() {
        return name.value();
    }

    /**
     * Set Name. Name to give each file, e.g w22. To this
     * name will be appended 'chr' plus the chrom number plus
     * .fa
     *
     * @param value Name
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin name(String value) {
        name = new PluginParameter<>(name, value);
        return this;
    }

    /**
     * Path to write the split files
     *
     * @return Output Directory
     */
    public String outputDir() {
        return outputDir.value();
    }

    /**
     * Set Output Directory. Path to write the split files
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin outputDir(String value) {
        outputDir = new PluginParameter<>(outputDir, value);
        return this;
    }

    /**
     * GCA fastas have long text as idLines. These fasta
     * will have their id lines specially parsed to extract
     * the chromosome number.
     *
     * @return Is GCA
     */
    public Boolean isGca() {
        return isGca.value();
    }

    /**
     * Set Is GCA. GCA fastas have long text as idLines.
     * These fasta will have their id lines specially parsed
     * to extract the chromosome number.
     *
     * @param value Is GCA
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin isGca(Boolean value) {
        isGca = new PluginParameter<>(isGca, value);
        return this;
    }
}