// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
package net.maizegenetics.util;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.ImageIcon;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.google.common.collect.ImmutableMap;
import net.maizegenetics.analysis.gbs.Barcode;
import net.maizegenetics.analysis.gbs.v2.BarcodeTrie;
import net.maizegenetics.analysis.gbs.v2.EnzymeList;
import net.maizegenetics.analysis.gbs.v2.GBSUtils;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
/**
* This plugin takes a single fastq or a directory of fastq files
* in the "old" format and converts them to the "modern" illumina format.
*
* The old format had multiple taxa combined in a single file, with
* barcodes attaached to each read sequence.
*
* The new fastQ format has the taxa (samples) each in individual files.
*
* THis method takes the old fastq and for each read block, subsets
* the sequence to start after the barcode (but including the initial cut site)
* and end after the ending cut site. The quality score is subsetted
* identically to the sequence so the 2 match.
*
* Files are written to a user defined directory.
*
* @author lcj34
*
*/
public class ConvertOldFastqToModernFormatPlugin extends AbstractPlugin {
private static final Logger myLogger = LogManager.getLogger(ConvertOldFastqToModernFormatPlugin.class);
private PluginParameter inputFile = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true)
.description("Input file or directory containing FASTQ files in text or gzipped text.\n"
+ " NOTE: Directory will be searched recursively").build();
private PluginParameter keyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
.description("Key file listing barcodes distinguishing the samples").build();
private PluginParameter projectName = new PluginParameter.Builder<>("p", null, String.class).guiName("Project File").required(true)
.description("Name for this project. Project name becomes part of the newly created fastq file name").build();
private PluginParameter outputDir = new PluginParameter.Builder<>("o", null, String.class).guiName("Output Directory").required(true).outDir()
.description("Output directory where new fastQ files will be written").build();
private PluginParameter myEnzyme = new PluginParameter.Builder<>("e", null, String.class).guiName("Enzyme").required(true)
.description("Enzyme used to create the GBS library").build();
public static final String inputFileGlob="glob:*{.fq,fq.gz,fastq,fastq.txt,fastq.gz,fastq.txt.gz,_sequence.txt,_sequence.txt.gz}";
public static final String sampleNameField="FullSampleName";
public static final String flowcellField="Flowcell";
public static final String laneField="Lane";
public static final String barcodeField="Barcode";
public static final String tissueNameField = "Tissue";
public static final String fileNameField = "FileName";
static List infiles;
static String[] likelyReadEndStrings;
protected static int readEndCutSiteRemnantLength;
public ConvertOldFastqToModernFormatPlugin() {
super(null, false);
}
public ConvertOldFastqToModernFormatPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
protected void postProcessParameters() {
// create list of directory files
File dirList = new File(inputFile());
if (!(dirList.exists())) {
throw new IllegalStateException("Input file or directory not found !!") ;
}
if (dirList.isDirectory()) {
infiles = DirectoryCrawler.listPaths(GBSUtils.inputFileGlob, Paths.get(inputFile.value()).toAbsolutePath());
if(infiles.isEmpty()) {
myLogger.warn("No files matching:"+ GBSUtils.inputFileGlob);
throw new IllegalStateException("no .txt files found in input directory !!");
}
} else {
infiles=new ArrayList<>();
Path filePath= Paths.get(inputFile()).toAbsolutePath();
infiles.add(filePath);
}
if (!myEnzyme.isEmpty()) {
// Add likelyReadEnds for later processing
EnzymeList.Enzyme enzyme = EnzymeList.defaultCache.getEnzyme(enzyme());
likelyReadEndStrings = enzyme.likelyReadEnd; // for removeSecondCutSiteIndexOf()
readEndCutSiteRemnantLength = enzyme.readEndCutSiteRemnantLength;
}
}
@Override
public DataSet processData(DataSet input) {
// Check if keyFile contains FullSampleName field.
// If not, create masterTaxaList using
// sample:Flowcell:lane:barcode concatenated columns as header
myLogger.info("Checking Key format");
boolean keyFormatIsNew = determineKeyFormat(keyFile());
TaxaList masterTaxaList;
if (keyFormatIsNew) {
masterTaxaList= TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), GBSUtils.sampleNameField,
new HashMap<>(),true);
} else {
masterTaxaList = readOldTaxaAnnotationFile(keyFile(), new HashMap<>(),true);
}
myLogger.info("Created masterTaxaList - start processing fastq files");
int batchSize = 8;
for (int idx = 0; idx < infiles.size(); idx+=batchSize) {
int end = idx+batchSize;
if (end > infiles.size()) end = infiles.size();
ArrayList sub = new ArrayList();
for (int j = idx; j < end; j++) sub.add(infiles.get(j));
myLogger.info("\nStart processing batch " + String.valueOf(idx/batchSize+1));
sub.parallelStream()
.forEach(inputSeqFile -> {
try {
//
Path keyPath= Paths.get(keyFile()).toAbsolutePath();
myLogger.info(" - processing file " + inputSeqFile);
Map> taxaWriterMap = new HashMap>();
processFastQFile(keyPath, inputSeqFile, enzyme(), masterTaxaList,taxaWriterMap,
outputDir(), projectName(), keyFormatIsNew);
// End of file - close the writers;
myLogger.info("Calling Shutdown to close the writers for " + inputSeqFile);
Shutdown(taxaWriterMap);
} catch (StringIndexOutOfBoundsException oobe) {
return;
}
});
}
return null;
}
// Determine the key file format. IF headers do not include "FullSampleName",
// it is an old format.
public static boolean determineKeyFormat(String keyFile){
try (BufferedReader br = Utils.getBufferedReader(keyFile)) {
String header = br.readLine();
if (header.contains(sampleNameField)) return true;
else return false;
} catch (IOException ioe) {
ioe.printStackTrace();
}
return false;
}
public static ArrayList getLaneAnnotatedTaxaList(Path keyPath, Path fastQpath, boolean useNew) {
String[] filenameField = fastQpath.getFileName().toString().split("_");
ArrayList annoTL;
if (filenameField.length == 3) {
if (useNew) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(), sampleNameField,
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[1]));
} else {
annoTL = readOldFormatKeyFile(keyPath.toAbsolutePath().toString(),
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[1]), true);
}
} else if (filenameField.length == 4) {
if (useNew) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(),sampleNameField,
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[2]));
} else {
annoTL = readOldFormatKeyFile(keyPath.toAbsolutePath().toString(),
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[2]),true);
}
}
else if (filenameField.length == 5) {
if (useNew) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(),sampleNameField,
ImmutableMap.of(flowcellField, filenameField[1], laneField, filenameField[3]));
} else {
annoTL = readOldFormatKeyFile(keyPath.toAbsolutePath().toString(),
ImmutableMap.of(flowcellField, filenameField[1], laneField, filenameField[3]),true);
}
} else {
myLogger.error("Error in parsing file name: " + fastQpath.toString());
myLogger.error(" The filename does not contain either 3, 4, or 5 underscore-delimited values.");
myLogger.error(" Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
return null;
}
return annoTL;
}
public static TaxaList readOldTaxaAnnotationFile(String fileName, Map filters, boolean mergeSameNames) {
// create list
ArrayList taxaAL = readOldFormatKeyFile( fileName, filters, mergeSameNames);
if (taxaAL == null) return null;
TaxaListBuilder tlb = new TaxaListBuilder();
taxaAL.stream().forEach(taxa -> {
if (mergeSameNames) {
tlb.addOrMerge(taxa);
} else {
tlb.add(taxa);
}
});
return tlb.sortTaxaAlphabetically().build();
}
// Call used for old format of files
public static ArrayList readOldFormatKeyFile(String fileName, Map filters, boolean mergeSameNames) {
try {
BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000);
fileIn.mark(1 << 16);
String line = fileIn.readLine();
ArrayList taxaAL = new ArrayList();
// Name becomes concatenation of 4 fields: Sample, Flowcell, Lane, Barcode - in that order
int indexOfSample = 0;
int indexOfFlowcell = 0;
int indexOfLane = 0;
int indexOfBarcode = 0;
//parse headers
List headers = new ArrayList<>();
List isQuant = new ArrayList<>();
if (line.contains("Flowcell")) {
int idx = 0;
for (String header : line.split("\\t")) {
if (header.equalsIgnoreCase("Sample")) {
indexOfSample= idx;
}
if (header.equalsIgnoreCase("Flowcell")) {
indexOfFlowcell= idx;
}
if (header.equalsIgnoreCase("Lane")) {
indexOfLane= idx;
}
if (header.equalsIgnoreCase("Barcode")) {
indexOfBarcode= idx;
}
isQuant.add(header.startsWith("#") || header.startsWith("<#"));
headers.add(header.replace(">", "").replace("<", "").replace("#", ""));
idx++;
}
} else {
fileIn.reset();
}
//parse taxa rows
while ((line = fileIn.readLine()) != null) {
String[] stokens = line.split("\\t");
StringBuilder taxonSB = new StringBuilder();
taxonSB.append(stokens[indexOfSample]).append(":");
taxonSB.append(stokens[indexOfFlowcell]).append(":");
taxonSB.append(stokens[indexOfLane]).append(":");
taxonSB.append(stokens[indexOfBarcode]);
String aTaxonName = taxonSB.toString();
Taxon.Builder anID = new Taxon.Builder(aTaxonName);
for (int idx = 0; idx < stokens.length; idx++) {
String[] cs = stokens[idx].split(";");
for (String ta : cs) {
if (ta == null || ta.isEmpty()) {
continue;
}
if (isQuant.get(idx)) {
if (ta.equals("NA")) {
anID.addAnno(headers.get(idx), Double.NaN);
} else {
anID.addAnno(headers.get(idx), Double.parseDouble(ta));
}
} else {
anID.addAnno(headers.get(idx), ta);
}
}
}
Taxon myTaxon = anID.build();
if (TaxaListIOUtils.doesTaxonHaveAllAnnotations(myTaxon, filters)) {
taxaAL.add(myTaxon);
}
}
// Sort alphabetically based on name. This is to remain consistent
// with the readTaxaAnnotationFile(), which alphabetizes the taxaList
Collections.sort(taxaAL, new Comparator() {
@Override
public int compare(Taxon taxa1, Taxon taxa2) {
return taxa1.getName().compareTo(taxa2.getName());
}
});
return taxaAL;
} catch (Exception e) {
System.err.println("Error in Reading Annotated Taxon File:" + fileName);
e.printStackTrace();
}
return null;
}
// THis method will take the taxon/outputWriter map, a fastqfile, and an enzyme name.
// From this data a barcode trie is created. Then for each "read" in the fastq file,
// the taxa is identified from the barcode. The barcode is stripped, and the read
// is written to the appropriate file based on the taxaFileMap
public static void processFastQFile(Path keyPath, Path fastQPath, String enzymeName,
TaxaList masterTaxaList,Map> taxaFileMap,
String outputDir, String projectName, boolean keyFormatIsNew){
int allReads=0, goodBarcodedReads = 0, barCodeNotFound = 0, badEndReads = 0;
int currentTotal = 0;
int first20hasN = 0;
ArrayList tl = getLaneAnnotatedTaxaList(keyPath, fastQPath, keyFormatIsNew);;
// Creating a hashmap of taxon name to taxon object. The file name we want to
// write to has flowcell and lane in the name. These annotations are stored with
// the tl arraylist taxons, but NOT in the barcode taxons. It is necessary to
// take the taxon name from the barcode, find the taxon in the tl list, then
// retrieve the flowcell and lane.
Map nameToTaxon = new HashMap();
for (Taxon taxon : tl) {
String name = taxon.getName();
nameToTaxon.put(name, taxon);
}
if (tl.size() == 0) return;
BarcodeTrie barcodeTrie=GBSUtils.initializeBarcodeTrie(tl, masterTaxaList, EnzymeList.defaultCache.getEnzyme(enzymeName));
String[] seqAndQual;
BufferedReader br = Utils.getBufferedReader(fastQPath.toString(), 1 << 22);
long time=System.nanoTime();
try {
while ((seqAndQual=readAllFastQBlock(br,allReads)) != null) {
// Unlike GBS, we read all 4 lines, as all 4 lines will be written to new file
allReads++;
currentTotal++;
//After quality score is read, decode barcode using the current sequence & quality score
Barcode barcode=barcodeTrie.longestPrefix(seqAndQual[1]);
if(barcode==null) {
if (seqAndQual[1].contains("N")) first20hasN++;
barCodeNotFound++;
continue; // couldn't find taxa - skip this read
}
int barcodeLen = barcode.getBarLength();
String adjustedSeq = seqAndQual[1].substring(barcodeLen);
String adjustedQual = seqAndQual[3].substring(barcodeLen);
Tuple stringStartEnd = truncateToSecondCutSite(adjustedSeq,enzymeName);
if (stringStartEnd == null) {
badEndReads++;
continue;
}
adjustedSeq = adjustedSeq.substring(stringStartEnd.x, stringStartEnd.y);
adjustedQual = adjustedQual.substring(stringStartEnd.x, stringStartEnd.y);
goodBarcodedReads++;
// write entries to file for this taxon
Taxon readTaxon = barcode.getTaxon();
Tuple origTuple = taxaFileMap.get(readTaxon);
if (origTuple == null) {
// add entry
Taxon taxonWithAnno = nameToTaxon.get(readTaxon.getName());
GeneralAnnotation annotation = taxonWithAnno.getAnnotation();
String outFile = outputDir + "/" + projectName + "_" + readTaxon.getName() + "_" +
annotation.getTextAnnotation(flowcellField)[0] + "_" + annotation.getTextAnnotation("Lane")[0] +
"_" + barcode.getBarcodeString() + ".fastq";
taxaFileMap.put(readTaxon,new Tuple(null,outFile));
origTuple = taxaFileMap.get(readTaxon);
}
BufferedWriter bw = origTuple.x;
if (bw == null) {
// First time we've written this file - create writer and store it
bw = Utils.getBufferedWriter(origTuple.y);
taxaFileMap.put(readTaxon, new Tuple(bw,origTuple.y));
}
bw.write(seqAndQual[0]);
bw.write("\n");
bw.write(adjustedSeq);
bw.write("\n");
bw.write(seqAndQual[2]);
bw.write("\n");
bw.write(adjustedQual);
bw.write("\n");
if (currentTotal > 1000000) {
System.out.println("Reads processed so far: " + allReads);
currentTotal = 0;
}
}
myLogger.info("Summary for "+fastQPath.toString()+"\n"+
"Total number of reads in lane=" + allReads +"\n"+
"Total number of good barcoded reads=" + goodBarcodedReads+"\n"+
"Total number of bar codes not found=" + barCodeNotFound+"\n"+
"Total number where first 20 bps of seq has N =" + first20hasN+"\n"+
"Total number of read End not found=" + badEndReads+"\n"+
"Timing process (parsing, collapsing, and writing readBlock to file)."+"\n"+
"Process took " + (System.nanoTime() - time)/1e9 + " seconds.");
} catch (Exception exc) {
exc.printStackTrace();
}
}
public static String[] readAllFastQBlock(BufferedReader bw, int currentRead) throws IOException {
//consider converting this into a stream of String[]
String[] result=new String[4];
try{
result[0]=bw.readLine();
result[1] = bw.readLine();
if(result[0]==null) {
return null;
}
result[2]=bw.readLine();
result[3] = bw.readLine();
return result;
} catch (IOException e) {
e.printStackTrace();
myLogger.error("Unable to correctly parse the fastQ read nead line: " + currentRead*4
+ " from fastq file. Your fastq file may have been corrupted.");
return null;
}
}
// This method determines the start and total length of the passed sequence
// based on the enzyme and its likely read-end strings.
// The start position may change based on handling of the overlapping
// cut sites. The end site should change based on likely read ends.
// These values are used in calling method to adjust both the sequence
// and the quality strings.
private static Tuple truncateToSecondCutSite(String seq,String enzyme) {
int seqStart = 0;
// handle overlapping cutsite for ApeKI enzyme
if (enzyme.equalsIgnoreCase("ApeKI")) {
if (seq.startsWith("CAGCTGC") || seq.startsWith("CTGCAGC")) {
seq = seq.substring(3,seq.length());
seqStart = 3;
}
}
int indexOfReadEnd = -1;
// Starts at 20 so don't have overlapping start/end cutsites.
String shortSeq = seq.substring(20); // substr from 20 to end of sequence
for (String readEnd: likelyReadEndStrings){
int indx = shortSeq.indexOf(readEnd);
if (indx > 0 ) {
if (indexOfReadEnd < 0 || indx < indexOfReadEnd) {
indexOfReadEnd = indx;
}
}
}
if (indexOfReadEnd < 0) {
return new Tuple(seqStart,seq.length()-seqStart);
} else {
// Add back the 20 from the beginning that we chopped off.
int tagLen = indexOfReadEnd + 20 + readEndCutSiteRemnantLength;
return new Tuple(seqStart,tagLen);
}
}
private static void Shutdown(Map> taxaWriterMap) {
if (taxaWriterMap == null) return;
try {
for (Map.Entry> entry: taxaWriterMap.entrySet()) {
BufferedWriter bw = entry.getValue().x;
if (bw != null) bw.close();
}
} catch (Exception e) {
System.out.println("Problem with shutdown");
e.printStackTrace();
}
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// //GeneratePluginCode.generate(ConvertOldFastqPlugin.class);
// }
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return ("ConvertOldFastqToModernFormat");
}
@Override
public String getToolTipText() {
return ("Fastq files formatted with multiple taxa and bar codes are transformed into Illumina format with 1 taxa per file and no bar codes in the read sequence");
}
/**
* Input file or directory containing FASTQ files in text
* or gzipped text.
* NOTE: Directory will be searched recursively
*
* @return Input Directory
*/
public String inputFile() {
return inputFile.value();
}
/**
* Set Input Directory. Input file or directory containing
* FASTQ files in text or gzipped text.
* NOTE: Directory will be searched recursively
*
* @param value Input Directory
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin inputFile(String value) {
inputFile = new PluginParameter<>(inputFile, value);
return this;
}
/**
* Key file listing barcodes distinguishing the samples
*
* @return Key File
*/
public String keyFile() {
return keyFile.value();
}
/**
* Set Key File. Key file listing barcodes distinguishing
* the samples
*
* @param value Key File
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin keyFile(String value) {
keyFile = new PluginParameter<>(keyFile, value);
return this;
}
/**
* Name for this project. Project name becomes part of
* the newly created fastq file name
*
* @return Project File
*/
public String projectName() {
return projectName.value();
}
/**
* Set Project File. Name for this project. Project name
* becomes part of the newly created fastq file name
*
* @param value Project File
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin projectName(String value) {
projectName = new PluginParameter<>(projectName, value);
return this;
}
/**
* Output directory where new fastQ files will be written
*
* @return Output Directory
*/
public String outputDir() {
return outputDir.value();
}
/**
* Set Output Directory. Output directory where new fastQ
* files will be written
*
* @param value Output Directory
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin outputDir(String value) {
outputDir = new PluginParameter<>(outputDir, value);
return this;
}
/**
* Enzyme used to create the GBS library
*
* @return Enzyme
*/
public String enzyme() {
return myEnzyme.value();
}
/**
* Set Enzyme. Enzyme used to create the GBS library
*
* @param value Enzyme
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin enzyme(String value) {
myEnzyme = new PluginParameter<>(myEnzyme, value);
return this;
}
}