All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.analysis.monetdb.ColumnsToBinaryFullGenomeTablePlugin Maven / Gradle / Ivy

Go to download

TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

There is a newer version: 5.2.94
Show newest version
/**
 */
package net.maizegenetics.analysis.monetdb;

import java.awt.Frame;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;

import javax.swing.ImageIcon;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.GeneratePluginCode;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.DirectoryCrawler;
import net.maizegenetics.util.Utils;

/**
 * This plugin is copied from the ColumnsToBinarySNPOnlyTablePlugin (which was copied Kelly's
 * ColumnsToBinaryPlugin, which re-worked  various lynn methods) to create 
 * binary files for loading into the hmp321_snp table in the maizeFullGEnomeDB of the
 * Rare Alleles monetdb instance.  THe difference between this plugin and the 
 * ColumnsToBinarySNPOnlyTablePlugin is the latter creates entries only for identified hmp321 SNPs
 * This method (ColumnsToBinaryFullGenomePlugin) creates entries  for all positions listed in 
 * in the reference genome.
 * 
 * To be consistent with the existing columns in monetdb maize tables, the reference file must
 * be a link to a copy of the Zea_mays.AGPv3.20.dna.genome.fa file stored 
 * on andersonii in Research/Zea/Genotypes/Annotations/monetDB/refGenomeFiles.
 * 
 * The "inputFile" parameter can be either a single file containing data for all chromosomes or a 
 * directory of files split by chromosome.  If all chroms are in a single file, that file
 * must be sorted by chromosome and position, and must contain a header line that contains
 * the columns "chr" and "pos" along with user specified data columns.
 * 
 * If the "inputFile" parameter is a directory, the only *.txt files it holds must be files
 * intended for this processing.  These files must be split by chromosome and must be named
 * such that they will sort lexicographically in chromosome order.  For example:  files named
 * chr01.txt, chr02.txt ... chr09.txt, chr10.txt will sort from 1-10.  But files named
 * chr1.txt, chr2,txt ... chr9.txt, chr10.txt will not.  In the latter case, chr10.txt will
 * be processed before the other files.
 * 
 * This plugin may also be used to create binaries for the maizeChrom10DB.  In this case,
 * we still use the full reference genome, but only chrom 10 is processed.  The inputFile
 * paramaeter shoudl be just 1 file containing chromosome 10 data.
 * 
 * @author kelly
 * @author lcj34
 *
 */
public class ColumnsToBinaryFullGenomeTablePlugin extends AbstractPlugin {
    private PluginParameter inputFile= new PluginParameter.Builder<>("inputFile",null,String.class).guiName("Input File").required(true)
            .description("Input File containing a header line and entries sorted by chromosome and position, or Directory containing Tab-delimited files split by chromosome and sorted by position.  \nIf parameter is a directory, each file must contain a header line, and the files must end with .txt and be named in a maaner that sorts lexicographically by chromosome").build();
    private PluginParameter refFile= new PluginParameter.Builder<>("refFile",null,String.class).guiName("refFile").required(true)
            .description("A link to the species reference file. This must be the same reference file used by other columns in the monetdb table for your species.").build();
    private PluginParameter outBase= new PluginParameter.Builder<>("outBase",null,String.class).guiName("outBase").required(true)
            .description("Output directory and base filename to hold the binary files. Will make directory if neccesary").build();
    private PluginParameter colsFloat= new PluginParameter.Builder<>("colNamesFloat",null,String.class).guiName("Columns keep as Real (Float)").required(false)
            .description("Comma separated list of column names to generate real binaries for").build();
    private PluginParameter colsInt= new PluginParameter.Builder<>("colNamesInt",null,String.class).guiName("Columns keep as Int").required(false)
            .description("Comma separated list of column names to generate int  binaries for").build();
    private PluginParameter colsShort= new PluginParameter.Builder<>("colNamesShort",null,String.class).guiName("Columns keep as Short").required(false)
            .description("Comma separated list of column names to generate short binaries for").build();
    private PluginParameter colsLong= new PluginParameter.Builder<>("colNamesLong",null,String.class).guiName("Columns keep as Long").required(false)
            .description("Comma separated list of column names to generate long binaries for").build();
    private PluginParameter colsChar= new PluginParameter.Builder<>("colNamesChar",null,String.class).guiName("Columns keep as Char String").required(false)
            .description("Comma separated list of column names to generate character/text binaries for").build();
    private PluginParameter colsByte= new PluginParameter.Builder<>("colNamesByte",null,String.class).guiName("Columns keep as Byte").required(false)
            .description("Comma separated list of column names to generate byte binaries for").build();
    private PluginParameter colsAllele= new PluginParameter.Builder<>("colNamesAllele",null,String.class).guiName("Columns translate Alleles").required(false)
            .description("Comma separated list of column names to generate for single char alleles to be translated to 0-5 for A/C/G/T/+/-").build();
    private PluginParameter colsLog10= new PluginParameter.Builder<>("colNamesLog10",null,String.class).guiName("Columns to Keep and transform -log10").required(false)
            .description("Comma separated list of column names to first transform using -log10 then generate binaries for").build();
    private PluginParameter range= new PluginParameter.Builder<>("range",false,Boolean.class).guiName("Range information?").required(false)
            .description("Columns for range data. If true, will look for 'start' and 'end' (inclusive exclusive) or 'first' 'last' (inclusive inclusive) instead of 'Pos' ").build();
    private PluginParameter negToZero= new PluginParameter.Builder<>("negToZero",false,Boolean.class).guiName("Negative floats to zero?").required(false)
            .description("Will set negative column values to zero (otherwise they the negative value is stored").build();
    private PluginParameter missToZero= new PluginParameter.Builder<>("missToZero",false,Boolean.class).guiName("Missing float values to zero?").required(false)
            .description("Will set missing  column values to zero (otherwise they are set to null").build();
    private PluginParameter oneBased= new PluginParameter.Builder<>("oneBased",true,Boolean.class).guiName("positions are 1 based?").required(false)
            .description("Will assume all positions and ranges are 1-based unless this value is set to false").build();
    
    private HashMap log10HM= null;
    private HashMap log10Writers= null;
    private HashMap realHM= null;
    private HashMap realWriters= null;
    private HashMap shortHM= null;
    private HashMap shortWriters= null;
    private HashMap longHM= null;
    private HashMap longWriters= null;
    ///PUT CHARACTER BACK (figure out what monetDB wants)
    private HashMap charHM= null; //Strings written as Strings, because that's what MonetDB likes
    private HashMap charWriters= null;
    private HashMap intHM= null;
    private HashMap intWriters= null;
    private HashMap byteHM= null;
    private HashMap byteWriters= null;
    private HashMap alleleHM= null;
    private HashMap alleleWriters= null;
    private int chrCol= -1;
    private String chrName= "CHR";
    private int posCol= -1;
    private int totalLines;
    private int totalZeroLines;
    private int totalNonZeroLines;
    private int linesForChr;
    private FileInputStream fis= null;
    private Scanner scanner= null;
    private boolean inclusive= false;
    private int startCol= -1;
    private int endCol= -1;
    
    private GenomeSequence myRefSequence = null;
    
    List infiles;
    
    public ColumnsToBinaryFullGenomeTablePlugin(Frame parentFrame, boolean isInteractive) {
        super(parentFrame, isInteractive);
    }
    public ColumnsToBinaryFullGenomeTablePlugin() {
        super(null, false);
    }
    
    @Override
    protected void preProcessParameters(DataSet input) {
    }
    
    @Override
    protected void postProcessParameters() {
 
        File out= new File(outBase.value());
        if (!out.getParentFile().exists()) out.getParentFile().mkdirs();
        
        // create list of directory files
        File dirList = new File(inputFile());
        if (!(dirList.exists())) {
            throw new IllegalStateException("Input file or directory not found !!") ;
        }
 
        if (dirList.isDirectory()) {
            String inputFileGlob="glob:*{.txt}";
            infiles= DirectoryCrawler.listPaths(inputFileGlob, Paths.get(inputFile.value()).toAbsolutePath());
            if (infiles.size() < 1) {
                throw new IllegalStateException("no .txt files found in input directory !!");
            }
        } else {
            infiles=new ArrayList<>();
            Path filePath= Paths.get(inputFile()).toAbsolutePath(); 
            infiles.add(filePath);
        }
        Collections.sort(infiles); // files should be chr01.txt ...chr10.txt so they will sort correctly
        // all chroms should have same file headers - get columns from first file
        // This is a requirement to have headers in the files.
        FindColumns(infiles.get(0).toString());
    }

    @Override
    public DataSet processData(DataSet input) {
        long time= System.nanoTime();
        Initialize();
        
        // Create reference genome
        myRefSequence = GenomeSequenceBuilder.instance(refFile());
        try {
            String currChr = null;
            Chromosome currChrom = null; // for getting reference sequence
            System.out.println("processData: number of infiles: " + infiles.size());
            for (Path filepath: infiles) {
                String inputFile = filepath.toString();
                int refChromPosSize = 0;
                int lastStart= -1; int lastEnd= -1;
                // Full genome lastLinePos defaults to 1, not 0, as our ref genome is 1-based.
                int lastLinePos = 1; //lastLinePos holds the first position not already written. 
                boolean first = true;
                fis = new FileInputStream(inputFile);
                System.out.println("Processing file: " + inputFile);
                scanner = new Scanner(fis);  
 
                while (scanner.hasNextLine()) {
                    String line = scanner.nextLine();
                    if ((line.isEmpty() || line.startsWith("#")) && first) continue; // skip comment lines
                    if (line.isEmpty() && !first) break;
                    String[] next = line.split("\t");
                    if (next[chrCol].toUpperCase().equals(chrName)) {continue;} // this is header line - skip it
                    if (first) { //initialize the reference position list                       
                        currChr = next[chrCol];
                        System.out.println("...working on chr "+currChr);
                        currChrom = new Chromosome(currChr);
                        refChromPosSize = myRefSequence.chromosomeSize(currChrom);
                        first= false;
                    }
 
                    //When the current line changes chromosome, finish writing end of last, reinitialize for new
                    // The code below is not hit when files are broken up by chromosome.  It IS hit if all
                    // chromosome data is contained in a single file.
                    if (!currChr.equals(next[chrCol])) {
                        // Write everything to null between the last line and the end of the chromosome
                        writeNull(lastLinePos,refChromPosSize);
                        System.out.println("Chrom within file changed, " + linesForChr+" lines output for chr "+currChr);
                        linesForChr= 0; lastLinePos = 1;
                        //if chromosomes not ordered 1-10, shut everything down and throw exception
                        if (Integer.parseInt(currChr)!=(Integer.parseInt(next[chrCol])-1)) {
                            Shutdown();
                            String message = "Chromosomes files must be named so they sort by chromosomes in ascending order !!\nYour chromosome " +
                               currChr + " came before chromosome " + next[chrCol];
                            throw new IllegalStateException(message);
                        }
                        currChr= next[chrCol]; lastStart= -1; lastEnd= -1;
                        currChrom = new Chromosome(currChr);
                        refChromPosSize = myRefSequence.chromosomeSize(currChrom);
                        System.out.println("...working on chr "+currChr);
                    }
                    int position= -1; //problems if in exponential notation (sometimes R does this) so do it as a try as double and cast if problems
                    try {
                        position= (posCol>-1)?Integer.parseInt(next[posCol]):Integer.parseInt(next[startCol]);
                        if (!oneBased()) { 
                            position++; // increment position so it matches the 1-based stored in reference files
                        }
                    } catch(Exception e) {
                        System.out.println(line); 
                        position= Double.valueOf(next[posCol]).intValue();
                        if (!oneBased()) position++; // increment if input file has 0-based positions
                    }
                    int currPosOnChr= position; // 1-based position
                    //Only take the first one, if position/chromosome is duplicated, as long as lines are (if range, just checks start)
                    if (lastStart==currPosOnChr) {
                        System.out.println("WARNING: Already recorded value for this position. Excluded\n"+line); continue;
                    }
                    if (range.value() && inclusive && lastEnd==currPosOnChr) { //if inclusive and lastEnd position same as start of next throw exception
                        throw new IllegalStateException("Ranges are overlapping and supposed to be inclusive!!!!!. If not inclusive, denote ranges by start, end");
                    }
                    // Write everything to null between the last line and the current line
                    // This only writes null if the start (lastLinePos) is less than the end (currPosOnChr)
                    // Because we are doing this all as 1-based, it must be set to 1 initially, not 0.
                    // SNP-only uses 0 as it is indexing an array, which is 0-based.
                    writeNull(lastLinePos,currPosOnChr-1); 
                    //Write out the current line for debugging
//                    if (currPosOnChr==refChromPosList.length-1 || linesForChr>currPosOnChr || (currPosOnChr>9584780 && currPosOnChr<9584800)) {
//                        //System.out.println(currPosOnChr+"\t"+(refChromPosList.length-1)+"\t"+totalLines);
//                        System.out.println(line);
//                    }
                    if (posCol>-1) { // process position-based data file
                        WriteValues(next); //Write out values for this line  ("next" is the line we're processing)
                        lastLinePos= currPosOnChr+1; lastStart= currPosOnChr;
                        totalLines++; linesForChr++; totalNonZeroLines++;
                    } else { // processing ranges
                        // Looking for the value from the refChromPosList array for this chromosome.
                        int end = Integer.parseInt(next[endCol]);
                        if (!oneBased()) end++;
                        end= (inclusive?end+1:end);
                        for (int i = currPosOnChr; i < end; i++) {
                            WriteValues(next); //Write out values for this line
                            totalLines++; linesForChr++; totalNonZeroLines++;
                        }
                        lastLinePos= end; //end, because even if inclusive has been transformed to exclusive
                        lastStart= currPosOnChr; //last is just for making sure that lines aren't duplicated, and checks start of range
                        lastEnd= end-1;
                    }
                } //  end while loop for processing this chrom file
                // Write everything to null between the last line and the end of the chromosome when it hits the end of the file
                // add +1 because writeNull writes until " < end", assuming 0-based, but this is 1-based.
                writeNull(lastLinePos,refChromPosSize); 
                System.out.println("File changed, " + linesForChr+"  lines output for chr "+currChr  
                        + " lastLinePos " + lastLinePos + " refChromPosSize:" + refChromPosSize);
                linesForChr = 0;
            } //  end for loop processing each file
 
            Shutdown(); // close out writers
        } catch (Exception e) {
            e.printStackTrace();
            Shutdown();
            return null;
        }
        System.out.println("Process took " + (System.nanoTime() - time)/1e9 + " seconds.");
        System.out.println("Wrote the files with totalReads: " + totalLines 
                + " totalZeroLines written: " + totalZeroLines + " totalNonZeroLines written: " + totalNonZeroLines);
        return null;
    } 
    
    
    // NOTE:  This method altered from the SNPOnly version, which used "end"
    // as "exclusive" (ie, it looped until posIdx was < end)
    private void writeNull(int start, int end) {
        try {
        for (int posIdx=start;posIdx <= end; posIdx++) {
            if (realHM!=null) {for (Integer i:realWriters.keySet()) {
                realWriters.get(i).writeFloat(missToZero.value()?0:-Float.MAX_VALUE);
            }}
            if (log10HM!=null) {for (Integer i:log10Writers.keySet()) {
                log10Writers.get(i).writeFloat(missToZero()?0:-Float.MAX_VALUE);
            }}
            if (shortHM!=null) {for (Integer i:shortWriters.keySet()) {
                shortWriters.get(i).writeShort(missToZero() ? 0:Short.MIN_VALUE);
            }}
            if (longHM!=null) {for (Integer i:longWriters.keySet()) {
                longWriters.get(i).writeLong(missToZero() ? 0 :Long.MIN_VALUE);
            }}
            // You ALWAYS need the \n, this worked in TE_LTRSuperFamily ... for MIchelle
            //String fam = sampLine.substring(tabPos[12] + 1, tabPos[13]) + "\n";
            //writerFam.writeChars(fam.getBytes()); 
            if (charHM!=null) {
                for (Integer i:charWriters.keySet()) {
                    String nullString = "NULL\n";
                  charWriters.get(i).writeChars(nullString.getBytes()); // LCJ - added NULL - see if it worksZZ
                }
            }
          
            if (intHM!=null) {for (Integer i:intWriters.keySet()) {
                intWriters.get(i).writeInt(missToZero() ? 0 : Integer.MIN_VALUE);
            }}
            if (byteHM!=null) {for (Integer i:byteWriters.keySet()) {
                byteWriters.get(i).writeByte(missToZero() ? 0:Byte.MIN_VALUE);
            }}
            if (alleleHM!=null) {for (Integer i:alleleWriters.keySet()) {
                alleleWriters.get(i).writeByte(Byte.MIN_VALUE); // for missing alleles write null
            }}
            totalLines++; linesForChr++; totalZeroLines++;
        }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Problem writing nulls between "+start+" and "+end);
        }
    }
    
    private void WriteValues(String[] next) {
        try {
            if (realHM!=null) {for (Integer i:realWriters.keySet()) {
                double val= Double.MIN_VALUE; float storedVal;
                try {
                    val = Double.parseDouble(next[i.intValue()]);
                } catch (NumberFormatException nfe) {
                    val = Double.MIN_VALUE;
                }
                if (val==Double.MIN_VALUE) storedVal= missToZero.value()?0:-Float.MAX_VALUE;
                else if (negToZero.value() && val<=0) storedVal= 0;
                else storedVal= (float) val;
                realWriters.get(i).writeFloat(storedVal);
            }}
            if (log10HM!=null) {for (Integer i:log10Writers.keySet()) {
                double val= Double.MIN_VALUE; double storedVal;
                try {
                    val = Double.parseDouble(next[i.intValue()]);
                } catch (NumberFormatException nfe) {
                    val = Double.MIN_VALUE;
                }
                if (val==Double.MIN_VALUE) storedVal= missToZero.value()?0:-Float.MAX_VALUE;
                else if (negToZero.value() && val<=0) storedVal= 0;
                else storedVal = val==Double.MIN_VALUE?-Float.MAX_VALUE:(float)(-Math.log10(val));
                log10Writers.get(i).writeFloat((float)storedVal);
                //log10Writers.get(i).writeFloat(val==Double.MIN_VALUE?-Float.MAX_VALUE:(float)(-Math.log10(val)));
            }}
            if (shortHM!=null) {for (Integer i:shortWriters.keySet()) {
                short val= Short.MIN_VALUE; short storedVal;
                try {
                    val = Short.parseShort(next[i.intValue()]);
                } catch (NumberFormatException nfe) {
                    val = Short.MIN_VALUE;
                }
                if (val==Short.MIN_VALUE) storedVal= missToZero.value()?0:Short.MIN_VALUE;
                else if (negToZero.value() && val<=0) storedVal= 0;
                else storedVal = val;
                shortWriters.get(i).writeShort(storedVal);
                //shortWriters.get(i).writeShort(val);
            }}
            if (longHM!=null) {for (Integer i:longWriters.keySet()) {
                long val= Long.MIN_VALUE;long storedVal;
                try {
                    val = Long.parseLong(next[i.intValue()]);
                } catch (NumberFormatException nfe) {
                    val = Long.MIN_VALUE;
                }
                if (val==Long.MIN_VALUE) storedVal= missToZero.value()?0:Long.MIN_VALUE;
                else if (negToZero.value() && val<=0) storedVal= 0;
                else storedVal = val;
                longWriters.get(i).writeLong(storedVal);
                //longWriters.get(i).writeLong(val);
            }}
            // always want \n !!
            if (charHM!=null) {
                for (Integer i:charWriters.keySet()) {
                  String charData = next[i.intValue()] + "\n";
                  charWriters.get(i).writeChars(charData.getBytes());
                }
            }
            if (intHM!=null) {for (Integer i:intWriters.keySet()) {
                int val= Integer.MIN_VALUE;int storedVal;
                try {val = Integer.parseInt(next[i.intValue()]);
                } catch (NumberFormatException nfe) {val = Integer.MIN_VALUE;}
                if (val==Integer.MIN_VALUE) storedVal= missToZero.value()?0:Integer.MIN_VALUE;
                else if (negToZero.value() && val<=0) storedVal= 0;
                else storedVal = val;
                intWriters.get(i).writeInt(storedVal);
                //intWriters.get(i).writeInt(val);
            }}
            if (byteHM!=null) {
                for (Integer i:byteWriters.keySet()) {
                  byte val= Byte.MIN_VALUE;byte storedVal;                 
                  try {
                    val = (byte)(Integer.parseInt(next[i.intValue()]));                    
                  } catch (NumberFormatException nfe) {
                    val = Byte.MIN_VALUE;
                  }
                  if (val==Byte.MIN_VALUE) storedVal= missToZero.value()?0:Byte.MIN_VALUE;
                  else if (negToZero.value() && val<=0) storedVal= 0;
                  else storedVal = val;
                  byteWriters.get(i).writeByte(val);
                }
             }
            if (alleleHM!=null){
                // Not checking missToZero or negToZero - these are character alleles, missing is null
                // otherwise stored as is done in TASSEL as 0-5 for A/G/C/T/+/-
                for (Integer idx:alleleWriters.keySet()) {
                    byte val = Byte.MIN_VALUE;// monetdb stores Byte.MIN_VALUE as null
                    try {
                        String allele = next[idx.intValue()];
                        int alleleInt = ColumnsToBinaryUtils.getIntFromSeq(allele);
                        // getIntFromSeq allows for ins/del as 4/5. 
                        if (alleleInt >= 0 || alleleInt <= 5) val =(byte) alleleInt; 
                        else 
                            throw new IllegalStateException("Illegal allele char, must be A/C/G/T/+/- but found " + allele);
                        alleleWriters.get(idx).writeByte(val); // store null
                    } catch (Exception exc) {
                        exc.printStackTrace();
                        throw new IllegalStateException("Error parsing  allele column " + alleleWriters.get(idx));
                    }
                }
            }
                
        } catch (Exception e) {
            for (String i:next) {System.out.println(i);}
            e.printStackTrace();
        }
    }
    
    private void FindColumns(String filename) {
        // When broken by chromosome, column names shoudl be the
        // same for all files.  Send in any chromosome file and use
        // it to set the columns names
        String[] next= null;
        try { 
            FileInputStream fis = new FileInputStream(filename);
            Scanner scanner = new Scanner(fis); boolean first= true;
            while (scanner.hasNextLine()) {
                if (!first) break;
                String line = scanner.nextLine();
                if (line.isEmpty() || line.startsWith("#")) {continue;}
                next = line.split("\t");
                if (first) {
                    first= false;
                }
            }
            scanner.close();
            fis.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        //get index of chromosome
        int search= -1;
        for (int col = 0; col < next.length; col++) {
            if (next[col].toLowerCase().startsWith("chr")) {search= col; break;}
        }
        if (search<0) {throw new IllegalStateException("Couldn't find chromosome in first non-empty line of input file");
        } else {
            chrCol= search;
            //chrName= next[search]; // now defaulting this to "CHR"
            System.out.println("Chr found in column "+search);
        }
        //Get position
        search= -1;
        if (range.value()) { //if range, looks for first last (inclusive) or start end (exclusive)
            for (int col = 0; col < next.length; col++) {
                if (next[col].equalsIgnoreCase("start")) {
                    search= col; break;
                }
            }
            if (search<0) {
                inclusive= true;
                for (int col = 0; col < next.length; col++) {
                    if (next[col].equalsIgnoreCase("first")) {
                        search= col; break;
                    }
                }
                if (search<0) throw new IllegalStateException("Couldn't find first or start in first line of input file to indicate range");
                else startCol= search;
            } else startCol= search;
            search= -1;
            // look for "last" or "end" for end of range
            for (int col = 0; col < next.length; col++) {
                if (next[col].equalsIgnoreCase(inclusive?"last":"end")) {
                    search= col; break;
                }
            }
            if (search<0) {
                if (search<0) throw new IllegalStateException("Couldn't find "+(inclusive?"last":"end")+" in first line of input file to indicate range");
            } else endCol= search;
            System.out.println("Using columns "+startCol+" and "+endCol+" to indicate ends of ranges, "+(inclusive?"inclusive":"exclusive"));
        } else {
            for (int col = 0; col < next.length; col++) {
                if (next[col].toLowerCase().startsWith("pos")) {
                    search= col; 
                    break;
                }
            }
            if (search<0) {System.out.println("Cannot find column for pos");
            } else {posCol= search;System.out.println("Pos found in column "+search);}
            if (chrCol<0 || posCol<0) throw new IllegalStateException("Couldn't find chromosome or position in first line of input file");
        }
        int ncols= 0;
        //Get indices of column names to keep and transform with -log10, if not null. Don't use binary search, because not sorted and won't work
        if (colsLog10.value()!=null && !colsLog10.value().equalsIgnoreCase("null") && !colsLog10.value().isEmpty()) {
            log10HM= new HashMap<>(); search= -1;
            for (String s:colsLog10.value().split(",")) {
                search= -1;
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {search= col; break;}
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {
                    log10HM.put(search, s);
                    log10Writers= new HashMap<>();
                    ncols++;
                    System.out.println("Found "+s+" in column "+search+" as -log10 transformed float");
                }
            }
        }
        //Get indices of column names to keep as float
        if (colsFloat.value()!=null && !colsFloat.value().equalsIgnoreCase("null") &&!colsFloat.value().isEmpty()) {
        realHM= new HashMap<>(); realWriters= new HashMap<>(); search= -1;
            for (String s:colsFloat.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {search= col; break;}
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {realHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as real (float)");}
            }
        }
        //Get indices of column names to keep as int
        if (colsInt.value()!=null && !colsInt.value().equalsIgnoreCase("null") && !colsInt.value().isEmpty()) {
        intHM= new HashMap<>(); intWriters= new HashMap<>(); search= -1;
            for (String s:colsInt.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {search= col; break;}
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {intHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as int");}
            }
        }
        //Get indices of column names to keep as short
        if (colsShort.value()!=null && !colsShort.value().equalsIgnoreCase("null") && !colsShort.value().isEmpty()) {
        shortHM= new HashMap<>(); shortWriters= new HashMap<>(); search= -1;
            for (String s:colsShort.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {search= col; break;}
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {shortHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as short");}
            }
        }
        //Get indices of column names to keep as long
        if (colsLong.value()!=null && !colsLong.value().equalsIgnoreCase("null") && !colsLong.value().isEmpty()) {
        longHM= new HashMap<>(); longWriters= new HashMap<>(); search= -1;
            for (String s:colsLong.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {search= col; break;}
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {longHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as long");}
            }
        }
        //Get indices of column names to keep as char
        if (colsChar.value()!=null && !colsChar.value().equalsIgnoreCase("null") && !colsChar.value().isEmpty()) {
        charHM= new HashMap<>(); charWriters= new HashMap<>(); search= -1;
            for (String s:colsChar.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {
                        search= col; break;
                    }
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {charHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as char");}
            }
        }  
        //Get indices of column names to keep as char
        if (colsByte.value()!=null && !colsByte.value().equalsIgnoreCase("null") && !colsByte.value().isEmpty()) {
        byteHM= new HashMap<>(); byteWriters= new HashMap<>(); search= -1;
            for (String s:colsByte.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {
                        search= col; break;
                    }
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {byteHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as byte");}
            }
        } 
        //Get indices of column names to keep as alleles translated to 0-5
        if (colsAllele.value()!=null && !colsAllele.value().equalsIgnoreCase("null") && !colsAllele.value().isEmpty()) {
        alleleHM= new HashMap<>(); alleleWriters= new HashMap<>(); search= -1;
            for (String s:colsAllele.value().split(",")) {
                for (int col = 0; col < next.length; col++) {
                    if (s.equalsIgnoreCase(next[col])) {
                        search= col; break;
                    }
                }
                if (search<0) {System.out.println("Cannot find column "+s);
                } else {alleleHM.put(search, s);ncols++;System.out.println("Found "+s+" in column "+search+" as allele");}
            }
        }  
         if (ncols<1) throw new IllegalStateException("No valid columns to read in!");   
    }
    
    private void Initialize() {
        // This method creates the .sql files for creating the table and copying into the table.
        // These values can be appended to the larger table.  Table name must be filled in by
        // user, but the skeleton of the sql command is there to feed into monetdb.
        totalLines = 0;
        totalZeroLines = 0;
        totalNonZeroLines = 0;
        linesForChr = 0;
        try {
            System.out.println("\nColumnsToBInaryFullGenomeTable: outBase is : " + outBase() + "\n\n");
            DataOutputStream outCopy = Utils.getDataOutputStream(outBase.value()+"copy.sql", 1040);
            DataOutputStream outCreate = Utils.getDataOutputStream(outBase.value()+"create.sql", 1040);
            outCopy.writeBytes("COPY binary into ... from ('snpChrFile.bin','snpPosFile.bin',"); boolean first= true;
            outCreate.writeBytes("CREATE TABLE ... (chr int,pos int,"); 
            String lastChar = outBase().substring(outBase().length()-1,outBase().length());
            String out = null;
            if (!(lastChar.equals("/"))) {
                out = new File(outBase.value()).getName();
            }
            //String outDir= new File(outBase.value()).getParent();
            if (realHM!=null) {
                for (Integer i:realHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out + realHM.get(i) + "_real.bin";
                        createFile = out +  realHM.get(i) + " real";
                    } else {
                        copyFile = realHM.get(i) + "_real.bin";
                        createFile = realHM.get(i) + " real";
                    }
                    String outFile = outBase() + realHM.get(i) + "_real.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'"); first= false;
                    outCreate.writeBytes( createFile );
                   // outCreate.writeBytes(out + realHM.get(i)+" real"); 
                    realWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                    //realWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outDir+"/"+outFile))));
                }
            }
            if (intHM!=null) {
                for (Integer i:intHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out +  intHM.get(i) + "_int.bin";
                        createFile = out +  intHM.get(i) + " int";
                    } else {
                        copyFile = intHM.get(i) + "_int.bin";
                        createFile = intHM.get(i) + " int";
                    }
                    
                    String outFile = outBase() +  intHM.get(i) + "_int.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'"); 
                    outCreate.writeBytes(createFile );
                    intWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
            if (byteHM!=null) { // tinyint is a byte
                for (Integer i:byteHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out + byteHM.get(i) + "_byte.bin";
                        createFile = out + byteHM.get(i) + " byte";
                    } else {
                        copyFile = byteHM.get(i) + "_byte.bin";
                        createFile = byteHM.get(i) + " byte";
                    }
                    String outFile = outBase() + byteHM.get(i) + "_byte.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'"); 
                    outCreate.writeBytes(createFile); // monetdb doesn't have "byte", tinyint instead
                    byteWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
            if (alleleHM!=null) { // alleles are stored as bytes, here as tinyints
                for (Integer i:alleleHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out + alleleHM.get(i) + "_allelebyte.bin";
                        createFile = out + alleleHM.get(i) + " tinyint";
                    } else {
                        copyFile = alleleHM.get(i) + "_allelebyte.bin";
                        createFile = alleleHM.get(i) + " tinyint";
                    }
                    String outFile = outBase() + alleleHM.get(i) + "_allelebyte.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'"); 
                    outCreate.writeBytes(createFile); // monetdb doesn't have "byte", tinyint instead
                    alleleWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
            if (log10HM!=null) {
                for (Integer i:log10HM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out +  log10HM.get(i) + "_neglog10_real.bin";
                        createFile = out +  log10HM.get(i) + "_neglog10 real";
                    } else {
                        copyFile = log10HM.get(i) + "_neglog10_real.bin";
                        createFile = log10HM.get(i) + "_neglog10 real";
                    }
                    String outFile = outBase() +  log10HM.get(i) + "_neglog10_real.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'");
                    outCreate.writeBytes(createFile); 
                    log10Writers.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
            if (shortHM!=null) {
                for (Integer i:shortHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out + shortHM.get(i) + "_short.bin";
                        createFile = out + shortHM.get(i) + " short";
                    } else {
                        copyFile = shortHM.get(i) + "_short.bin";
                        createFile = shortHM.get(i) + " short";
                    }
                    String outFile = outBase() + shortHM.get(i) + "_short.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'");
                    outCreate.writeBytes(createFile); 
                    shortWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
            if (longHM!=null) {
                for (Integer i:longHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out + "_"+longHM.get(i) + "_long.bin";
                        createFile = out + "_"+longHM.get(i) + " long";
                    } else {
                        copyFile = longHM.get(i) + "_long.bin";
                        createFile = longHM.get(i) + " long";
                    }
                    String outFile = outBase() + longHM.get(i) + "_long.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'");
                    outCreate.writeBytes(createFile); 
                    longWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
            if (charHM!=null) {
                for (Integer i:charHM.keySet()) {
                    String copyFile;
                    String createFile;
                    if (out != null) {
                        copyFile = out + charHM.get(i) + "_char.bin";
                        createFile = out + charHM.get(i) + " char";
                    } else {
                        copyFile = charHM.get(i) + "_char.bin";
                        createFile = charHM.get(i) + " char";
                    }
                    String outFile = outBase() + charHM.get(i) + "_char.bin";
                    if (!first) {outCopy.writeBytes(",");outCreate.writeBytes(",");}
                    first= false;
                    outCopy.writeBytes("'"+copyFile+"'");
                    outCreate.writeBytes(createFile); 
                    charWriters.put(i,new LittleEndianDataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))));
                }
            }
        outCopy.writeBytes(");"); outCopy.close();
        outCreate.writeBytes(");"); outCreate.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

    }
    
    private void Shutdown() {
        try {
            scanner.close();
            fis.close();
            if (realHM!=null) {for (Integer i:realWriters.keySet()) {realWriters.get(i).close();}}
            if (intHM!=null) {for (Integer i:intWriters.keySet()) {intWriters.get(i).close();}}
            if (shortHM!=null) {for (Integer i:shortWriters.keySet()) {shortWriters.get(i).close();}}
            if (longHM!=null) {for (Integer i:longWriters.keySet()) {longWriters.get(i).close();}}
            if (charHM!=null) {for (Integer i:charWriters.keySet()) {charWriters.get(i).close();}}
            if (log10HM!=null) {for (Integer i:log10Writers.keySet()) {log10Writers.get(i).close();}}
            if (byteHM!=null) {for (Integer i:byteWriters.keySet()) {byteWriters.get(i).close();}}

        } catch (Exception e) {
            System.out.println("Problem with shutdown");
            e.printStackTrace();
        }
    }
    @Override
    public ImageIcon getIcon() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getButtonName() {
        // TODO Auto-generated method stub
        return null;
    }

    @Override
    public String getToolTipText() {
        // TODO Auto-generated method stub
        return null;
    }
    
    // The following getters and setters were auto-generated.
    // Please use this method to re-generate.
    //
     public static void main(String[] args) {
         GeneratePluginCode.generate(ColumnsToBinaryFullGenomeTablePlugin.class);
     }
     /**
      * Convenience method to run plugin with one return object.
      */
     // TODO: Replace  with specific type.
//     public  runPlugin(DataSet input) {
//         return () performFunction(input).getData(0).getData();
//     }

     /**
      * File or a Directory containing Tab-delimited files with data
      * to add to the database. Files must be named chr01.txt,
      * chr02.txt etc!
      *
      * @return Input File 
      */
     public String inputFile() {
         return inputFile.value();
     }

     /**
      * Set Input Directory. Directory containing Tab-delimited
      * files with data to add to the database. Files must
      * be named chr01.txt, chr02.txt etc!
      *
      * @param value Input Directory or File
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin inputFile(String value) {
         inputFile = new PluginParameter<>(inputFile, value);
         return this;
     }

     /**
      * Link to maize reference file.
      *
      * @return refFile
      */
     public String refFile() {
         return refFile.value();
     }

     /**
      * Set refFile.
      *
      * @param value refFile
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin refFile(String value) {
         refFile = new PluginParameter<>(refFile, value);
         return this;
     }

     /**
      * Output directory and base filename to hold the binary
      * files. Will make directory if neccesary
      *
      * @return outBase
      */
     public String outBase() {
         return outBase.value();
     }

     /**
      * Set outBase. Output directory and base filename to
      * hold the binary files. Will make directory if neccesary
      *
      * @param value outBase
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin outBase(String value) {
         outBase = new PluginParameter<>(outBase, value);
         return this;
     }

     /**
      * Comma separated list of column names to generate real
      * binaries for
      *
      * @return Columns keep as Real (Float)
      */
     public String colsFloat() {
         return colsFloat.value();
     }

     /**
      * Set Columns keep as Real (Float). Comma separated list
      * of column names to generate real binaries for
      *
      * @param value Columns keep as Real (Float)
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsFloat(String value) {
         colsFloat = new PluginParameter<>(colsFloat, value);
         return this;
     }

     /**
      * Comma separated list of column names to generate int
      *  binaries for
      *
      * @return Columns keep as Int
      */
     public String colsInt() {
         return colsInt.value();
     }

     /**
      * Set Columns keep as Int. Comma separated list of column
      * names to generate int  binaries for
      *
      * @param value Columns keep as Int
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsInt(String value) {
         colsInt = new PluginParameter<>(colsInt, value);
         return this;
     }

     /**
      * Comma separated list of column names to generate short
      * binaries for
      *
      * @return Columns keep as Short
      */
     public String colsShort() {
         return colsShort.value();
     }

     /**
      * Set Columns keep as Short. Comma separated list of
      * column names to generate short binaries for
      *
      * @param value Columns keep as Short
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsShort(String value) {
         colsShort = new PluginParameter<>(colsShort, value);
         return this;
     }

     /**
      * Comma separated list of column names to generate long
      * binaries for
      *
      * @return Columns keep as Long
      */
     public String colsLong() {
         return colsLong.value();
     }

     /**
      * Set Columns keep as Long. Comma separated list of column
      * names to generate long binaries for
      *
      * @param value Columns keep as Long
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsLong(String value) {
         colsLong = new PluginParameter<>(colsLong, value);
         return this;
     }

     /**
      * Comma separated list of column names to generate byte
      * binaries for
      *
      * @return Columns keep as Byte
      */
     public String colsByte() {
         return colsByte.value();
     }

     /**
      * Set Columns keep as Byte. Comma separated list of column
      * names to generate byte binaries for
      *
      * @param value Columns keep as bhte
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colByte(String value) {
         colsByte = new PluginParameter<>(colsByte, value);
         return this;
     }
     /**
      * Comma separated list of column names to generate byte
      * binaries for
      *
      * @return Columns keep as Byte
      */
     public String colsAllele() {
         return colsAllele.value();
     }

     /**
      * Set Columns translate allele values to 0-5. Comma separated list of column
      * names of single character alleles to be translated from A/C/G/T/+/- to
      * 0-5.
      *
      * @param value Columns keep for translated alleles stored as bytes
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsAllele(String value) {
         colsAllele = new PluginParameter<>(colsAllele, value);
         return this;
     }     
     /**
      * Comma separated list of column names to generate char
      * binaries for
      *
      * @return Columns keep as Char
      */
     public String colsChar() {
         return colsChar.value();
     }

     /**
      * Set Columns keep as Char. Comma separated list of column
      * names to generate char binaries for
      *
      * @param value Columns keep as Char
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsChar(String value) {
         colsChar = new PluginParameter<>(colsChar, value);
         return this;
     }

     /**
      * Comma separated list of column names to first transform
      * using -log10 then generate binaries for
      *
      * @return Columns to Keep and transform -log10
      */
     public String colsLog10() {
         return colsLog10.value();
     }

     /**
      * Set Columns to Keep and transform -log10. Comma separated
      * list of column names to first transform using -log10
      * then generate binaries for
      *
      * @param value Columns to Keep and transform -log10
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin colsLog10(String value) {
         colsLog10 = new PluginParameter<>(colsLog10, value);
         return this;
     }

     /**
      * Columns for range data. If true, will look for 'start'
      * and 'end' (inclusive exclusive) or 'first' 'last' (inclusive
      * inclusive) instead of 'Pos' 
      *
      * @return Range information?
      */
     public Boolean range() {
         return range.value();
     }

     /**
      * Set Range information?. Columns for range data. If
      * true, will look for 'start' and 'end' (inclusive exclusive)
      * or 'first' 'last' (inclusive inclusive) instead of
      * 'Pos' 
      *
      * @param value Range information?
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin range(Boolean value) {
         range = new PluginParameter<>(range, value);
         return this;
     }

     /**
      * Will set negative float denoted column values to zero
      * (otherwise they are set to Float.MIN
      *
      * @return Negative floats to zero?
      */
     public Boolean negToZero() {
         return negToZero.value();
     }

     /**
      * Set Negative floats to zero?. Will set negative float
      * denoted column values to zero (otherwise they are set
      * to Float.MIN
      *
      * @param value Negative floats to zero?
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin negToZero(Boolean value) {
         negToZero = new PluginParameter<>(negToZero, value);
         return this;
     }

     /**
      * Will set missing float denoted column values to zero
      * (otherwise they are set to Float.MIN
      *
      * @return Missing float values to zero?
      */
     public Boolean missToZero() {
         return missToZero.value();
     }

     /**
      * Set Missing float values to zero?. Will set missing
      * float denoted column values to zero (otherwise they
      * are set to Float.MIN
      *
      * @param value Missing float values to zero?
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin missToZero(Boolean value) {
         missToZero = new PluginParameter<>(missToZero, value);
         return this;
     }
     /**
      * Will assume positions and ranges are 1-based
      * unless otherwise set.
      * 
      * @return positions are 1-based?
      */
     public Boolean oneBased() {
         return oneBased.value();
     }

     /**
      * Set positions to 1-based unless this variable
      * is "false".  
      *
      * @param value 1-based positions?
      *
      * @return this plugin
      */
     public ColumnsToBinaryFullGenomeTablePlugin oneBased(Boolean value) {
         oneBased = new PluginParameter<>(oneBased, value);
         return this;
     }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy