/*
 * net.maizegenetics.pangenome.db_loading.SplitFastaByChromPlugin Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of phg Show documentation
 * PHG - Practical Haplotype Graph
 * There is a newer version: 1.10
 */
/**
 * 
 */
package net.maizegenetics.pangenome.db_loading;

import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.ImageIcon;

import org.apache.log4j.Logger;

import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.Utils;

/**
 * Splits a fasta file by chromosome.  It assumes the id line has a chromosome of
 * the form >1, >chr1 or >chromosome1.
 * All of the above forms (case insensitive) will be written with an id line of just
 * the number, e.g. >1 or >2
 * 
 * The "name" parameter is used as the basis of the output file names.  To this name
 * will be appended "chr", the chrom number and ".fa".
 * 
 * For example:  If the user gives "w22" as the name, the code will write files:
 *  w22chr1.fa, w22chr2.fa, etc.
 *  
 * The isGca parameter:  The assemblies starting with GCA have long text in the id line,
 * with the chromosome stuck in the middle.  This plugin will correctly parse these
 * lines.  Other unusual id lines are not supported and may need to be corrected
 * manually before running through this plugin.
 * 
 * It seems each set of assemblies that arrives has a different signature for the id line,
 * so this code keeps getting customized.  Consider this code to be the "base" plugin.  On
 * each run, if the id lines don't adhere to chr/chromosome/X or GCA, then the user should
 * run an altered version of this plugin, or fix the id lines first.
 * 
 * @author lcj34
 *
 */
public class SplitFastaByChromPlugin extends AbstractPlugin {
    private static final Logger myLogger = Logger.getLogger(SplitFastaByChromPlugin.class);

    // Plugin parameters.  The type argument on each PluginParameter is what lets the
    // accessor methods return String/Boolean from value() without a cast.
    private PluginParameter<String> fasta = new PluginParameter.Builder<>("fasta", null, String.class).guiName("Fasta File ").required(true).inFile()
            .description("Fasta File to split by chromosome ").build();
    private PluginParameter<String> name = new PluginParameter.Builder<>("name", null, String.class).guiName("Name ").required(true)
            .description("Name to give each file, e.g w22.  To this name will be appended 'chr' plus the chrom number plus .fa").build();
    private PluginParameter<String> outputDir = new PluginParameter.Builder<>("outputDir", null, String.class).guiName("Output Directory").required(true).outDir()
            .description("Path to write the split files")
            .build();
    private PluginParameter<Boolean> isGca = new PluginParameter.Builder<>("isGca", false, Boolean.class).guiName("Is GCA").required(false)
            .description("GCA fastas have long text as idLines.  These fasta will have their id lines specially parsed to extract the chromosome number.")
            .build();
    
    // Matches an optional run of non-whitespace followed by a run of whitespace.
    // Static and final so the pattern is compiled once per class load rather than once
    // per plugin instance.
    static final Pattern chrPattern = Pattern.compile("([^\\s]+)?(\\s)+");
    
    /**
     * Creates a non-interactive plugin with no parent frame.
     */
    public SplitFastaByChromPlugin() {
        this(null, false);
    }

    /**
     * Creates a non-interactive plugin attached to the given frame.
     *
     * @param parentFrame frame handed to the AbstractPlugin constructor
     */
    public SplitFastaByChromPlugin(Frame parentFrame) {
        this(parentFrame, false);
    }

    /**
     * Creates a plugin, forwarding both arguments to the AbstractPlugin constructor.
     *
     * @param parentFrame frame handed to the AbstractPlugin constructor
     * @param isInteractive interactive flag handed to the AbstractPlugin constructor
     */
    public SplitFastaByChromPlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    
    
    /**
     * Splits the fasta named by the {@code fasta} parameter into one output file per record.
     *
     * Sequence lines are concatenated until the next id line (or end of file) and then handed
     * to {@code writeChromFasta} under the name parsed from the preceding id line.  Id lines
     * are parsed with {@code parseGCA} when {@code isGca} is set, otherwise with
     * {@code parseChromosome}.  A record with no sequence lines produces no file; sequence
     * found before the first id line is written under the name "none".
     *
     * @param input not used
     * @return always {@code null}
     * @throws IllegalStateException if reading the fasta or writing a split file fails;
     *         the underlying exception is attached as the cause
     */
    @Override
    public DataSet processData(DataSet input) {

        // The output directory is joined to the file name by plain concatenation, so make
        // sure it ends with a separator.  Without this, "/data/out" + "w22" would write
        // "/data/outw22chr1.fa" next to the directory instead of inside it.
        String outDir = outputDir();
        if (outDir != null && !outDir.isEmpty() && !outDir.endsWith("/") && !outDir.endsWith("\\")) {
            outDir = outDir + "/";
        }
        String chromFileBase = outDir + name();

        try (BufferedReader br = Utils.getBufferedReader(fasta())) {
            String line;
            StringBuilder sb = new StringBuilder();
            String currChrom = "none";

            while ((line = br.readLine()) != null) {
                if (line.startsWith(">")) {
                    // A new id line closes the previous record: flush what was collected.
                    if (sb.length() > 0) {
                        writeChromFasta(currChrom, chromFileBase, sb.toString());
                        sb.setLength(0);
                    }
                    // Alternate id-line parsers for other assembly sources; swap one in here
                    // when the id lines do not follow the chr/chromosome or GCA layouts:
                    //currChrom = parseBauerFlint(line);
                    //currChrom = parseChromAsRomanNumerals(line);
                    //currChrom = parseChromFirstTab(line); // for Baoxing GWHAACS00000000
                    currChrom = isGca() ? parseGCA(line) : parseChromosome(line); // remove beginning ">" and trailing space
                    myLogger.info("processing chrom with parsedName as " + currChrom);
                } else {
                    sb.append(line);
                }
            }
            // Flush the final record, which has no following id line to trigger the write.
            if (sb.length() > 0) {
                writeChromFasta(currChrom, chromFileBase, sb.toString());
            }
        } catch (Exception exc) {
            // Keep the original exception as the cause so the full stack trace stays available.
            throw new IllegalStateException("Error splitting fasta file " + fasta() + ": " + exc.getMessage(), exc);
        }

        return null;
    }
    
    /**
     * Reduces a fasta id line to a bare chromosome name.
     *
     * The leading ">" is dropped, the text is upper-cased, every occurrence of "CHROMOSOME"
     * and "CHR" is removed, and anything from the first whitespace character onward (space,
     * tab, ...) is discarded.  If what remains is an integer it is returned without leading
     * zeros (e.g. "01" becomes "1"); otherwise the upper-cased remainder is returned as is.
     * Works for id lines of the form 1, chr1 or chromosome1; other layouts need pre-parsing.
     *
     * @param chromString the id line, including its leading ">"
     * @return the chromosome number as a string, or the upper-cased id token when it is not numeric
     */
    public String parseChromosome(String chromString) {

        // Locale.ROOT keeps the case mapping independent of the machine's default locale.
        String chrS = chromString.replace(">", "").toUpperCase(java.util.Locale.ROOT);
        chrS = chrS.replace("CHROMOSOME", "");
        chrS = chrS.replace("CHR", ""); // keep chromosome string, minus any "chr" or "chromosome"

        // Cut at the first whitespace of any kind; many id lines separate the description
        // with a tab rather than a space.  A leading whitespace character (index 0) is left
        // alone, as before.
        int end = 0;
        while (end < chrS.length() && !Character.isWhitespace(chrS.charAt(end))) {
            end++;
        }
        if (end > 0 && end < chrS.length()) {
            chrS = chrS.substring(0, end);
        }

        try { // strip off leading 0's to match maize reference chromosome names (1 vs 01)
            return String.valueOf(Integer.parseInt(chrS));
        } catch (NumberFormatException ignored) {
            // not numeric: return the name unchanged
            return chrS;
        }
    }

    /**
     * Parses id lines whose chromosome name is a Roman numeral, as in Setaria:
     *   >IV dna:chromosome chromosome:Setaria_italica_v2.0:IV:1:40407879:1 REF
     * Other id lines in the same file look like:
     *   >KQ475381 dna:supercontig supercontig:Setaria_italica_v2.0:KQ475381:1:423243:1 REF
     *
     * The ">" characters are removed and the text before the first space is taken as the id.
     * If {@code romanToDecimal} accepts the id, its decimal value is returned as a string;
     * otherwise the id itself is returned.
     *
     * @param chromString the id line, including its leading ">"
     * @return the decimal chromosome number, or the id token when it is not a Roman numeral
     */
    public String parseChromAsRomanNumerals(String chromString) {
        final String withoutMarker = chromString.replace(">", "");

        // Only cut when a space exists past the first character.
        final int firstSpace = withoutMarker.indexOf(" ");
        final String idToken = (firstSpace > 0) ? withoutMarker.substring(0, firstSpace) : withoutMarker;

        final Integer asNumber = romanToDecimal(idToken);
        return (asNumber == null) ? idToken : String.valueOf(asNumber);
    }
    
    /**
     * Parses id lines from Eva Bauer's 4 flint lines, which look like
     * DK105_chr_1 and DK105_scaffold701.
     *
     * The name is whatever follows the last underscore of the id token, where the id token
     * is the text between ">" and the first whitespace character.  The case of the name is
     * preserved.  A purely numeric name is returned without leading zeros.
     *
     * @param chromString the id line, including its leading ">"
     * @return e.g. "1" for ">DK105_chr_1" and "scaffold701" for ">DK105_scaffold701"
     */
    public String parseBauerFlint(String chromString) {
        // Need to pass the name case sensitive to what is stored in the fasta
        String chrS = chromString.replace(">", "");

        // Isolate the id token BEFORE looking for the last underscore; otherwise an
        // underscore inside a trailing description would be picked up instead.
        int end = 0;
        while (end < chrS.length() && !Character.isWhitespace(chrS.charAt(end))) {
            end++;
        }
        if (end > 0) {
            chrS = chrS.substring(0, end);
        }

        // lastIndexOf returns -1 when there is no underscore, which keeps the whole token.
        chrS = chrS.substring(chrS.lastIndexOf("_") + 1);

        try { // strip off leading 0's to match maize reference chromosome names (1 vs 01)
            return String.valueOf(Integer.parseInt(chrS));
        } catch (NumberFormatException ignored) {
            // not numeric: return the name unchanged
            return chrS;
        }
    }
    
    /**
     * Parses id lines of the GWHAACS00000000 assembly from
     * http://bigd.big.ac.cn/gwh/Assembly/123/show, which look like:
     * >GWHAACS00000001        Chromosome 1    Complete=T      Circular=F      OriSeqID=chr1   Len=301378628
     *
     * Everything up to and including the first tab is discarded.  If the remainder does not
     * start with "Chromosome" it is returned untouched.  Otherwise the text between the first
     * space and the following tab is taken as the name, and a purely numeric name is returned
     * without leading zeros.
     *
     * @param chromString the id line, including its leading ">"
     * @return the chromosome name, or the post-tab remainder for non-chromosome records
     */
    public String parseChromFirstTab(String chromString) {
        // Need to pass the name case sensitive to what is stored in the fasta
        final String idLine = chromString.replace(">", "");

        // With no tab present indexOf gives -1, so the +1 keeps the whole line.
        final String afterAccession = idLine.substring(idLine.indexOf("\t") + 1);

        // Only want chromosome alignments
        if (!afterAccession.startsWith("Chromosome")) {
            return afterAccession;
        }

        String chromName = afterAccession;
        final int spaceAt = afterAccession.indexOf(" ");
        if (spaceAt > 0) {
            chromName = afterAccession.substring(spaceAt + 1);
            final int tabAt = chromName.indexOf("\t");
            if (tabAt > 0) {
                chromName = chromName.substring(0, tabAt);
            }
        }

        try { // strip off leading 0's to match maize reference chromosome names (1 vs 01)
            return String.valueOf(Integer.parseInt(chromName));
        } catch (NumberFormatException notNumeric) {
            // name stays as parsed
            return chromName;
        }
    }
    /**
     * Parses the id lines from GCA fasta files, where the chromosome is embedded in long
     * text of the form "... chromosome 1, whole genome shotgun sequence".
     *
     * The name is the text after the word "chromosome" (and the whitespace following it) up
     * to the next comma.  When there is no comma, the first whitespace-delimited token is
     * used instead.  A purely numeric name is returned without leading zeros.
     *
     * If the line does not contain "chromosome" past its first character, or nothing usable
     * follows that word, the id line is returned unchanged (including its leading ">").
     *
     * @param chromString the id line, including its leading ">"
     * @return the chromosome name, or {@code chromString} itself when no name can be extracted
     */
    public String parseGCA(String chromString) {
        int chromIndex = chromString.indexOf("chromosome");
        if (chromIndex <= 0) {
            return chromString;
        }

        String fromKeyword = chromString.substring(chromIndex);
        final Matcher matcher = chrPattern.matcher(fromKeyword);
        if (!matcher.find()) {
            // No whitespace after "chromosome": calling matcher.end() here would throw.
            return chromString;
        }

        // Text starting at the first character after the white space
        String chromName = fromKeyword.substring(matcher.end());
        int commaIndex = chromName.indexOf(",");
        if (commaIndex >= 0) {
            chromName = chromName.substring(0, commaIndex);
        } else {
            // No comma to delimit the name: fall back to the first token rather than
            // failing on substring(0, -1).
            int end = 0;
            while (end < chromName.length() && !Character.isWhitespace(chromName.charAt(end))) {
                end++;
            }
            chromName = chromName.substring(0, end);
        }

        chromName = chromName.trim();
        if (chromName.isEmpty()) {
            return chromString;
        }

        try { // strip off leading 0's
            return String.valueOf(Integer.parseInt(chromName));
        } catch (NumberFormatException ignored) {
            // not numeric: return the name unchanged
            return chromName;
        }
    }
    
 
    /**
     * Writes a single-record fasta file named {@code outputFileBase + "chr" + chrom + ".fa"}.
     * The file holds the id line {@code ">" + chrom} followed by the sequence on one line.
     *
     * @param chrom name used both in the file name and as the id line
     * @param outputFileBase output path prefix (directory plus base name)
     * @param sequence the full sequence for this record
     * @throws IllegalStateException if the file cannot be written; the underlying
     *         exception is attached as the cause
     */
    private void writeChromFasta(String chrom, String outputFileBase, String sequence) {
        String outFile = outputFileBase + "chr" + chrom + ".fa";
        try (BufferedWriter bw = Utils.getBufferedWriter(outFile)) {
            // Write the pieces straight to the writer.  Assembling them in a StringBuilder
            // first would hold another full copy of a chromosome-sized sequence in memory.
            bw.write(">");
            bw.write(chrom);
            bw.write("\n");
            bw.write(sequence);
            bw.write("\n");
        } catch (Exception exc) {
            throw new IllegalStateException("SplitFastaByChrom:writeChromFasta - exception writing file " + outFile + ":" + exc.getMessage(), exc);
        }
    }


        /**
         * Converts a Roman numeral to its decimal value.
         *
         * The numeral is read right to left; a symbol smaller than the one to its right is
         * subtracted (the I in IV), any other symbol is added.  Letter case is ignored.  The
         * string is not checked for being a well-formed numeral beyond containing only the
         * letters M, D, C, L, X, V and I.
         *
         * @param romanNumber candidate Roman numeral
         * @return the decimal value, or {@code null} when the input is null, empty, or
         *         contains a character that is not a Roman numeral symbol
         */
        public static Integer romanToDecimal(String romanNumber) {
            // An empty string has no symbols at all, so it is not a numeral either.
            if (romanNumber == null || romanNumber.isEmpty()) {
                return null;
            }

            // Process all as upper case; Locale.ROOT keeps "i" mapping to "I" on every machine.
            String romanNumeral = romanNumber.toUpperCase(java.util.Locale.ROOT);

            int decimal = 0;
            int lastNumber = 0;
            for (int x = romanNumeral.length() - 1; x >= 0; x--) {
                int value = romanDigitValue(romanNumeral.charAt(x));
                if (value == 0) {
                    // Not a roman numeral - return null
                    return null;
                }
                decimal = (lastNumber > value) ? decimal - value : decimal + value;
                lastNumber = value;
            }
            return decimal;
        }

        // Value of one upper-case Roman numeral symbol, or 0 when the character is not one.
        private static int romanDigitValue(char symbol) {
            switch (symbol) {
                case 'M': return 1000;
                case 'D': return 500;
                case 'C': return 100;
                case 'L': return 50;
                case 'X': return 10;
                case 'V': return 5;
                case 'I': return 1;
                default:  return 0;
            }
        }

        /**
         * Applies one Roman numeral symbol to a running total while scanning right to left.
         *
         * @param decimal value of the symbol being applied
         * @param lastNumber value of the symbol immediately to its right
         * @param lastDecimal running total so far
         * @return {@code lastDecimal - decimal} when the symbol is smaller than the one to
         *         its right, otherwise {@code lastDecimal + decimal}
         */
        public static int processDecimal(int decimal, int lastNumber, int lastDecimal) {
            final boolean subtractive = decimal < lastNumber;
            return subtractive ? lastDecimal - decimal : lastDecimal + decimal;
        }

    // The parameter getters and setters further down were auto-generated.
    // Run this main method to re-generate them.
    /**
     * Invokes {@code GeneratePluginCode.generate} for this plugin class.
     * NOTE(review): presumably this emits the parameter getter/setter code for the
     * plugin - confirm against GeneratePluginCode.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        GeneratePluginCode.generate(SplitFastaByChromPlugin.class);
    }
    
    /**
     * Returns the icon for this plugin.
     *
     * @return always {@code null}; no icon is supplied
     */
    @Override
    public ImageIcon getIcon() {
        // Stub: no icon has been assigned to this plugin.
        return null;
    }

    /**
     * Returns the button name for this plugin.
     *
     * @return always {@code null}; no button name is supplied
     */
    @Override
    public String getButtonName() {
        // Stub: no button name has been assigned to this plugin.
        return null;
    }

    /**
     * Returns the tool tip text for this plugin.
     *
     * @return always {@code null}; no tool tip text is supplied
     */
    @Override
    public String getToolTipText() {
        // Stub: no tool tip text has been assigned to this plugin.
        return null;
    }
    
    /**
     * Fasta file to split by chromosome.
     *
     * @return the current value of the {@code fasta} parameter
     */
    public String fasta() {
        return fasta.value();
    }

    /**
     * Sets the fasta file to split by chromosome.  The {@code fasta} parameter is replaced
     * by a new PluginParameter built from the current one and the given value.
     *
     * @param value Fasta File
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin fasta(String value) {
        fasta = new PluginParameter<>(fasta, value);
        return this;
    }

    /**
     * Name to give each file, e.g. w22.  To this name will
     * be appended 'chr' plus the chrom number plus .fa
     *
     * @return the current value of the {@code name} parameter
     */
    public String name() {
        return name.value();
    }

    /**
     * Sets the name to give each file, e.g. w22.  To this name will be appended
     * 'chr' plus the chrom number plus .fa.  The {@code name} parameter is replaced
     * by a new PluginParameter built from the current one and the given value.
     *
     * @param value Name
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin name(String value) {
        name = new PluginParameter<>(name, value);
        return this;
    }

    /**
     * Path to write the split files.
     *
     * @return the current value of the {@code outputDir} parameter
     */
    public String outputDir() {
        return outputDir.value();
    }

    /**
     * Sets the path to write the split files.  The {@code outputDir} parameter is replaced
     * by a new PluginParameter built from the current one and the given value.
     *
     * @param value Output Directory
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin outputDir(String value) {
        outputDir = new PluginParameter<>(outputDir, value);
        return this;
    }

    /**
     * GCA fastas have long text as id lines.  These fastas
     * will have their id lines specially parsed to extract
     * the chromosome number.
     *
     * @return the current value of the {@code isGca} parameter
     */
    public Boolean isGca() {
        return isGca.value();
    }

    /**
     * Sets whether the input is a GCA fasta.  GCA fastas have long text as id lines;
     * these fastas will have their id lines specially parsed to extract the chromosome
     * number.  The {@code isGca} parameter is replaced by a new PluginParameter built
     * from the current one and the given value.
     *
     * @param value Is GCA
     *
     * @return this plugin
     */
    public SplitFastaByChromPlugin isGca(Boolean value) {
        isGca = new PluginParameter<>(isGca, value);
        return this;
    }
}