net.maizegenetics.util.ConvertOldFastqToModernFormatPlugin Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
package net.maizegenetics.util;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.swing.ImageIcon;
import org.apache.log4j.Logger;
import com.google.common.collect.ImmutableMap;
import net.maizegenetics.analysis.gbs.Barcode;
import net.maizegenetics.analysis.gbs.v2.BarcodeTrie;
import net.maizegenetics.analysis.gbs.v2.EnzymeList;
import net.maizegenetics.analysis.gbs.v2.GBSUtils;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
/**
* This plugin takes a single fastq or a directory of fastq files
* in the "old" format and converts them to the "modern" illumina format.
*
* The old format had multiple taxa combined in a single file, with
* barcodes attaached to each read sequence.
*
* The new fastQ format has the taxa (samples) each in individual files.
*
* THis method takes the old fastq and for each read block, subsets
* the sequence to start after the barcode (but including the initial cut site)
* and end after the ending cut site. The quality score is subsetted
* identically to the sequence so the 2 match.
*
* Files are written to a user defined directory.
*
* @author lcj34
*
*/
public class ConvertOldFastqToModernFormatPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(ConvertOldFastqToModernFormatPlugin.class);
private PluginParameter inputFile = new PluginParameter.Builder<>("i", null, String.class).guiName("Input Directory").required(true)
.description("Input file or directory containing FASTQ files in text or gzipped text.\n"
+ " NOTE: Directory will be searched recursively").build();
private PluginParameter keyFile = new PluginParameter.Builder<>("k", null, String.class).guiName("Key File").required(true).inFile()
.description("Key file listing barcodes distinguishing the samples").build();
private PluginParameter projectName = new PluginParameter.Builder<>("p", null, String.class).guiName("Project File").required(true)
.description("Name for this project. Project name becomes part of the newly created fastq file name").build();
private PluginParameter outputDir = new PluginParameter.Builder<>("o", null, String.class).guiName("Output Directory").required(true).outDir()
.description("Output directory where new fastQ files will be written").build();
private PluginParameter myEnzyme = new PluginParameter.Builder<>("e", null, String.class).guiName("Enzyme").required(true)
.description("Enzyme used to create the GBS library").build();
public static final String inputFileGlob="glob:*{.fq,fq.gz,fastq,fastq.txt,fastq.gz,fastq.txt.gz,_sequence.txt,_sequence.txt.gz}";
public static final String sampleNameField="FullSampleName";
public static final String flowcellField="Flowcell";
public static final String laneField="Lane";
public static final String barcodeField="Barcode";
public static final String tissueNameField = "Tissue";
public static final String fileNameField = "FileName";
static List infiles;
static String[] likelyReadEndStrings;
protected static int readEndCutSiteRemnantLength;
public ConvertOldFastqToModernFormatPlugin() {
super(null, false);
}
public ConvertOldFastqToModernFormatPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
protected void postProcessParameters() {
// create list of directory files
File dirList = new File(inputFile());
if (!(dirList.exists())) {
throw new IllegalStateException("Input file or directory not found !!") ;
}
if (dirList.isDirectory()) {
infiles = DirectoryCrawler.listPaths(GBSUtils.inputFileGlob, Paths.get(inputFile.value()).toAbsolutePath());
if(infiles.isEmpty()) {
myLogger.warn("No files matching:"+ GBSUtils.inputFileGlob);
throw new IllegalStateException("no .txt files found in input directory !!");
}
} else {
infiles=new ArrayList<>();
Path filePath= Paths.get(inputFile()).toAbsolutePath();
infiles.add(filePath);
}
if (!myEnzyme.isEmpty()) {
// Add likelyReadEnds for later processing
EnzymeList.Enzyme enzyme = EnzymeList.defaultCache.getEnzyme(enzyme());
likelyReadEndStrings = enzyme.likelyReadEnd; // for removeSecondCutSiteIndexOf()
readEndCutSiteRemnantLength = enzyme.readEndCutSiteRemnantLength;
}
}
@Override
public DataSet processData(DataSet input) {
// Check if keyFile contains FullSampleName field.
// If not, create masterTaxaList using
// sample:Flowcell:lane:barcode concatenated columns as header
myLogger.info("Checking Key format");
boolean keyFormatIsNew = determineKeyFormat(keyFile());
TaxaList masterTaxaList;
if (keyFormatIsNew) {
masterTaxaList= TaxaListIOUtils.readTaxaAnnotationFile(keyFile(), GBSUtils.sampleNameField,
new HashMap<>(),true);
} else {
masterTaxaList = readOldTaxaAnnotationFile(keyFile(), new HashMap<>(),true);
}
myLogger.info("Created masterTaxaList - start processing fastq files");
int batchSize = 8;
for (int idx = 0; idx < infiles.size(); idx+=batchSize) {
int end = idx+batchSize;
if (end > infiles.size()) end = infiles.size();
ArrayList sub = new ArrayList();
for (int j = idx; j < end; j++) sub.add(infiles.get(j));
myLogger.info("\nStart processing batch " + String.valueOf(idx/batchSize+1));
sub.parallelStream()
.forEach(inputSeqFile -> {
try {
//
Path keyPath= Paths.get(keyFile()).toAbsolutePath();
myLogger.info(" - processing file " + inputSeqFile);
Map> taxaWriterMap = new HashMap>();
processFastQFile(keyPath, inputSeqFile, enzyme(), masterTaxaList,taxaWriterMap,
outputDir(), projectName(), keyFormatIsNew);
// End of file - close the writers;
myLogger.info("Calling Shutdown to close the writers for " + inputSeqFile);
Shutdown(taxaWriterMap);
} catch (StringIndexOutOfBoundsException oobe) {
return;
}
});
}
return null;
}
// Determine the key file format. IF headers do not include "FullSampleName",
// it is an old format.
public static boolean determineKeyFormat(String keyFile){
try (BufferedReader br = Utils.getBufferedReader(keyFile)) {
String header = br.readLine();
if (header.contains(sampleNameField)) return true;
else return false;
} catch (IOException ioe) {
ioe.printStackTrace();
}
return false;
}
public static ArrayList getLaneAnnotatedTaxaList(Path keyPath, Path fastQpath, boolean useNew) {
String[] filenameField = fastQpath.getFileName().toString().split("_");
ArrayList annoTL;
if (filenameField.length == 3) {
if (useNew) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(), sampleNameField,
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[1]));
} else {
annoTL = readOldFormatKeyFile(keyPath.toAbsolutePath().toString(),
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[1]), true);
}
} else if (filenameField.length == 4) {
if (useNew) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(),sampleNameField,
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[2]));
} else {
annoTL = readOldFormatKeyFile(keyPath.toAbsolutePath().toString(),
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[2]),true);
}
}
else if (filenameField.length == 5) {
if (useNew) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(),sampleNameField,
ImmutableMap.of(flowcellField, filenameField[1], laneField, filenameField[3]));
} else {
annoTL = readOldFormatKeyFile(keyPath.toAbsolutePath().toString(),
ImmutableMap.of(flowcellField, filenameField[1], laneField, filenameField[3]),true);
}
} else {
myLogger.error("Error in parsing file name: " + fastQpath.toString());
myLogger.error(" The filename does not contain either 3, 4, or 5 underscore-delimited values.");
myLogger.error(" Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
return null;
}
return annoTL;
}
public static TaxaList readOldTaxaAnnotationFile(String fileName, Map filters, boolean mergeSameNames) {
// create list
ArrayList taxaAL = readOldFormatKeyFile( fileName, filters, mergeSameNames);
if (taxaAL == null) return null;
TaxaListBuilder tlb = new TaxaListBuilder();
taxaAL.stream().forEach(taxa -> {
if (mergeSameNames) {
tlb.addOrMerge(taxa);
} else {
tlb.add(taxa);
}
});
return tlb.sortTaxaAlphabetically().build();
}
// Call used for old format of files
public static ArrayList readOldFormatKeyFile(String fileName, Map filters, boolean mergeSameNames) {
try {
BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000);
fileIn.mark(1 << 16);
String line = fileIn.readLine();
ArrayList taxaAL = new ArrayList();
// Name becomes concatenation of 4 fields: Sample, Flowcell, Lane, Barcode - in that order
int indexOfSample = 0;
int indexOfFlowcell = 0;
int indexOfLane = 0;
int indexOfBarcode = 0;
//parse headers
List headers = new ArrayList<>();
List isQuant = new ArrayList<>();
if (line.contains("Flowcell")) {
int idx = 0;
for (String header : line.split("\\t")) {
if (header.equalsIgnoreCase("Sample")) {
indexOfSample= idx;
}
if (header.equalsIgnoreCase("Flowcell")) {
indexOfFlowcell= idx;
}
if (header.equalsIgnoreCase("Lane")) {
indexOfLane= idx;
}
if (header.equalsIgnoreCase("Barcode")) {
indexOfBarcode= idx;
}
isQuant.add(header.startsWith("#") || header.startsWith("<#"));
headers.add(header.replace(">", "").replace("<", "").replace("#", ""));
idx++;
}
} else {
fileIn.reset();
}
//parse taxa rows
while ((line = fileIn.readLine()) != null) {
String[] stokens = line.split("\\t");
StringBuilder taxonSB = new StringBuilder();
taxonSB.append(stokens[indexOfSample]).append(":");
taxonSB.append(stokens[indexOfFlowcell]).append(":");
taxonSB.append(stokens[indexOfLane]).append(":");
taxonSB.append(stokens[indexOfBarcode]);
String aTaxonName = taxonSB.toString();
Taxon.Builder anID = new Taxon.Builder(aTaxonName);
for (int idx = 0; idx < stokens.length; idx++) {
String[] cs = stokens[idx].split(";");
for (String ta : cs) {
if (ta == null || ta.isEmpty()) {
continue;
}
if (isQuant.get(idx)) {
if (ta.equals("NA")) {
anID.addAnno(headers.get(idx), Double.NaN);
} else {
anID.addAnno(headers.get(idx), Double.parseDouble(ta));
}
} else {
anID.addAnno(headers.get(idx), ta);
}
}
}
Taxon myTaxon = anID.build();
if (TaxaListIOUtils.doesTaxonHaveAllAnnotations(myTaxon, filters)) {
taxaAL.add(myTaxon);
}
}
// Sort alphabetically based on name. This is to remain consistent
// with the readTaxaAnnotationFile(), which alphabetizes the taxaList
Collections.sort(taxaAL, new Comparator() {
@Override
public int compare(Taxon taxa1, Taxon taxa2) {
return taxa1.getName().compareTo(taxa2.getName());
}
});
return taxaAL;
} catch (Exception e) {
System.err.println("Error in Reading Annotated Taxon File:" + fileName);
e.printStackTrace();
}
return null;
}
// THis method will take the taxon/outputWriter map, a fastqfile, and an enzyme name.
// From this data a barcode trie is created. Then for each "read" in the fastq file,
// the taxa is identified from the barcode. The barcode is stripped, and the read
// is written to the appropriate file based on the taxaFileMap
public static void processFastQFile(Path keyPath, Path fastQPath, String enzymeName,
TaxaList masterTaxaList,Map> taxaFileMap,
String outputDir, String projectName, boolean keyFormatIsNew){
int allReads=0, goodBarcodedReads = 0, barCodeNotFound = 0, badEndReads = 0;
int currentTotal = 0;
int first20hasN = 0;
ArrayList tl = getLaneAnnotatedTaxaList(keyPath, fastQPath, keyFormatIsNew);;
// Creating a hashmap of taxon name to taxon object. The file name we want to
// write to has flowcell and lane in the name. These annotations are stored with
// the tl arraylist taxons, but NOT in the barcode taxons. It is necessary to
// take the taxon name from the barcode, find the taxon in the tl list, then
// retrieve the flowcell and lane.
Map nameToTaxon = new HashMap();
for (Taxon taxon : tl) {
String name = taxon.getName();
nameToTaxon.put(name, taxon);
}
if (tl.size() == 0) return;
BarcodeTrie barcodeTrie=GBSUtils.initializeBarcodeTrie(tl, masterTaxaList, EnzymeList.defaultCache.getEnzyme(enzymeName));
String[] seqAndQual;
BufferedReader br = Utils.getBufferedReader(fastQPath.toString(), 1 << 22);
long time=System.nanoTime();
try {
while ((seqAndQual=readAllFastQBlock(br,allReads)) != null) {
// Unlike GBS, we read all 4 lines, as all 4 lines will be written to new file
allReads++;
currentTotal++;
//After quality score is read, decode barcode using the current sequence & quality score
Barcode barcode=barcodeTrie.longestPrefix(seqAndQual[1]);
if(barcode==null) {
if (seqAndQual[1].contains("N")) first20hasN++;
barCodeNotFound++;
continue; // couldn't find taxa - skip this read
}
int barcodeLen = barcode.getBarLength();
String adjustedSeq = seqAndQual[1].substring(barcodeLen);
String adjustedQual = seqAndQual[3].substring(barcodeLen);
Tuple stringStartEnd = truncateToSecondCutSite(adjustedSeq,enzymeName);
if (stringStartEnd == null) {
badEndReads++;
continue;
}
adjustedSeq = adjustedSeq.substring(stringStartEnd.x, stringStartEnd.y);
adjustedQual = adjustedQual.substring(stringStartEnd.x, stringStartEnd.y);
goodBarcodedReads++;
// write entries to file for this taxon
Taxon readTaxon = barcode.getTaxon();
Tuple origTuple = taxaFileMap.get(readTaxon);
if (origTuple == null) {
// add entry
Taxon taxonWithAnno = nameToTaxon.get(readTaxon.getName());
GeneralAnnotation annotation = taxonWithAnno.getAnnotation();
String outFile = outputDir + "/" + projectName + "_" + readTaxon.getName() + "_" +
annotation.getTextAnnotation(flowcellField)[0] + "_" + annotation.getTextAnnotation("Lane")[0] +
"_" + barcode.getBarcodeString() + ".fastq";
taxaFileMap.put(readTaxon,new Tuple(null,outFile));
origTuple = taxaFileMap.get(readTaxon);
}
BufferedWriter bw = origTuple.x;
if (bw == null) {
// First time we've written this file - create writer and store it
bw = Utils.getBufferedWriter(origTuple.y);
taxaFileMap.put(readTaxon, new Tuple(bw,origTuple.y));
}
bw.write(seqAndQual[0]);
bw.write("\n");
bw.write(adjustedSeq);
bw.write("\n");
bw.write(seqAndQual[2]);
bw.write("\n");
bw.write(adjustedQual);
bw.write("\n");
if (currentTotal > 1000000) {
System.out.println("Reads processed so far: " + allReads);
currentTotal = 0;
}
}
myLogger.info("Summary for "+fastQPath.toString()+"\n"+
"Total number of reads in lane=" + allReads +"\n"+
"Total number of good barcoded reads=" + goodBarcodedReads+"\n"+
"Total number of bar codes not found=" + barCodeNotFound+"\n"+
"Total number where first 20 bps of seq has N =" + first20hasN+"\n"+
"Total number of read End not found=" + badEndReads+"\n"+
"Timing process (parsing, collapsing, and writing readBlock to file)."+"\n"+
"Process took " + (System.nanoTime() - time)/1e9 + " seconds.");
} catch (Exception exc) {
exc.printStackTrace();
}
}
public static String[] readAllFastQBlock(BufferedReader bw, int currentRead) throws IOException {
//consider converting this into a stream of String[]
String[] result=new String[4];
try{
result[0]=bw.readLine();
result[1] = bw.readLine();
if(result[0]==null) {
return null;
}
result[2]=bw.readLine();
result[3] = bw.readLine();
return result;
} catch (IOException e) {
e.printStackTrace();
myLogger.error("Unable to correctly parse the fastQ read nead line: " + currentRead*4
+ " from fastq file. Your fastq file may have been corrupted.");
return null;
}
}
// This method determines the start and total length of the passed sequence
// based on the enzyme and its likely read-end strings.
// The start position may change based on handling of the overlapping
// cut sites. The end site should change based on likely read ends.
// These values are used in calling method to adjust both the sequence
// and the quality strings.
private static Tuple truncateToSecondCutSite(String seq,String enzyme) {
int seqStart = 0;
// handle overlapping cutsite for ApeKI enzyme
if (enzyme.equalsIgnoreCase("ApeKI")) {
if (seq.startsWith("CAGCTGC") || seq.startsWith("CTGCAGC")) {
seq = seq.substring(3,seq.length());
seqStart = 3;
}
}
int indexOfReadEnd = -1;
// Starts at 20 so don't have overlapping start/end cutsites.
String shortSeq = seq.substring(20); // substr from 20 to end of sequence
for (String readEnd: likelyReadEndStrings){
int indx = shortSeq.indexOf(readEnd);
if (indx > 0 ) {
if (indexOfReadEnd < 0 || indx < indexOfReadEnd) {
indexOfReadEnd = indx;
}
}
}
if (indexOfReadEnd < 0) {
return new Tuple(seqStart,seq.length()-seqStart);
} else {
// Add back the 20 from the beginning that we chopped off.
int tagLen = indexOfReadEnd + 20 + readEndCutSiteRemnantLength;
return new Tuple(seqStart,tagLen);
}
}
private static void Shutdown(Map> taxaWriterMap) {
if (taxaWriterMap == null) return;
try {
for (Map.Entry> entry: taxaWriterMap.entrySet()) {
BufferedWriter bw = entry.getValue().x;
if (bw != null) bw.close();
}
} catch (Exception e) {
System.out.println("Problem with shutdown");
e.printStackTrace();
}
}
// The following getters and setters were auto-generated.
// Please use this method to re-generate.
//
// public static void main(String[] args) {
// //GeneratePluginCode.generate(ConvertOldFastqPlugin.class);
// }
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return ("ConvertOldFastqToModernFormat");
}
@Override
public String getToolTipText() {
return ("Fastq files formatted with multiple taxa and bar codes are transformed into Illumina format with 1 taxa per file and no bar codes in the read sequence");
}
/**
* Input file or directory containing FASTQ files in text
* or gzipped text.
* NOTE: Directory will be searched recursively
*
* @return Input Directory
*/
public String inputFile() {
return inputFile.value();
}
/**
* Set Input Directory. Input file or directory containing
* FASTQ files in text or gzipped text.
* NOTE: Directory will be searched recursively
*
* @param value Input Directory
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin inputFile(String value) {
inputFile = new PluginParameter<>(inputFile, value);
return this;
}
/**
* Key file listing barcodes distinguishing the samples
*
* @return Key File
*/
public String keyFile() {
return keyFile.value();
}
/**
* Set Key File. Key file listing barcodes distinguishing
* the samples
*
* @param value Key File
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin keyFile(String value) {
keyFile = new PluginParameter<>(keyFile, value);
return this;
}
/**
* Name for this project. Project name becomes part of
* the newly created fastq file name
*
* @return Project File
*/
public String projectName() {
return projectName.value();
}
/**
* Set Project File. Name for this project. Project name
* becomes part of the newly created fastq file name
*
* @param value Project File
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin projectName(String value) {
projectName = new PluginParameter<>(projectName, value);
return this;
}
/**
* Output directory where new fastQ files will be written
*
* @return Output Directory
*/
public String outputDir() {
return outputDir.value();
}
/**
* Set Output Directory. Output directory where new fastQ
* files will be written
*
* @param value Output Directory
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin outputDir(String value) {
outputDir = new PluginParameter<>(outputDir, value);
return this;
}
/**
* Enzyme used to create the GBS library
*
* @return Enzyme
*/
public String enzyme() {
return myEnzyme.value();
}
/**
* Set Enzyme. Enzyme used to create the GBS library
*
* @param value Enzyme
*
* @return this plugin
*/
public ConvertOldFastqToModernFormatPlugin enzyme(String value) {
myEnzyme = new PluginParameter<>(myEnzyme, value);
return this;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy