net.maizegenetics.analysis.gbs.v2.GBSUtils Maven / Gradle / Ivy
/**
*
*/
package net.maizegenetics.analysis.gbs.v2;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import net.maizegenetics.analysis.gbs.Barcode;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.GeneralAnnotation;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Ordering;
/**
* This class contains methods and constants used by various classes in the GBSv2
* pipeline.
* @author lcj34
*
*/
public class GBSUtils {
private static final Logger myLogger = Logger.getLogger(GBSUtils.class);
public static final String inputFileGlob="glob:*{.fq,fq.gz,fastq,fastq.txt,fastq.gz,fastq.txt.gz,_sequence.txt,_sequence.txt.gz}";
public static final String sampleNameField="FullSampleName";
public static final String flowcellField="Flowcell";
public static final String laneField="Lane";
public static final String barcodeField="Barcode";
public static final String tissueNameField = "Tissue";
public static final String fileNameField = "FileName";
private GBSUtils() {
}
/**
* Method for reading FastQ four line structure, and returning a string array with [sequence, qualityScore]
*/
public static String[] readFastQBlock(BufferedReader bw, int currentRead) throws IOException {
//consider converting this into a stream of String[]
String[] result=new String[2];
try{
bw.readLine();
result[0]=bw.readLine();
bw.readLine();
result[1]=bw.readLine();
if(result[0]==null) {
return null;
}
return result;
} catch (IOException e) {
e.printStackTrace();
myLogger.error("Unable to correctly parse the sequence and quality score near line: " + currentRead*4
+ " from fastq file. Your fastq file may have been corrupted.");
return null;
}
}
/**
* Method for reading FastQ four line structure, and returning a string array with [sequence, qualityScore]
*/
public static String[] readDeMultiPlexFastQBlock(BufferedReader bw, int currentRead) throws IOException {
//consider converting this into a stream of String[]
String[] result=new String[3];
try{
// Grab the barcode from the first line of the fastq sequence
String barCode = bw.readLine();
if (barCode == null) {
return null;
}
int index = barCode.lastIndexOf(":");
result[2] = barCode.substring(index+1);
// sequence
result[0]=bw.readLine();
bw.readLine();//quality header - thrown away
// Second entry is the quality score
result[1]=bw.readLine();
return result;
} catch (IOException e) {
e.printStackTrace();
myLogger.error("Unable to correctly parse the sequence and quality score near line: " + currentRead*4
+ " from fastq file. Your fastq file may have been corrupted.");
return null;
}
}
/**
* Method for reading FastQ four line structure, and returning a string array with [sequence, qualityScore]
*/
public static int determineQualityScoreBase(Path fastqFile) throws IOException {
try{BufferedReader bw = Utils.getBufferedReader(fastqFile.toString());
int headerParts=bw.readLine().split(":").length;
int base=(headerParts<5)?64:33;
myLogger.info(fastqFile.toString()+": Quality score base:"+base);
return base;
} catch (IOException e) {
e.printStackTrace();
myLogger.error("Unable to correctly parse the quality score base from fastq file. " +
"Your fastq file may have been corrupted.");
return 0;
}
}
/**
* Returns an annotated taxaList based on a Keyfile for GBS
* @param keyPath
* @param fastQpath
* @return
*/
public static ArrayList getLaneAnnotatedTaxaList(Path keyPath, Path fastQpath) {
String[] filenameField = fastQpath.getFileName().toString().split("_");
ArrayList annoTL;
if (filenameField.length == 3) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(), sampleNameField,
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[1]));
} else if (filenameField.length == 4) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(),sampleNameField,
ImmutableMap.of(flowcellField, filenameField[0], laneField, filenameField[2]));
}
else if (filenameField.length == 5) {
annoTL = TaxaListIOUtils.readTaxaAnnotationFileAL(keyPath.toAbsolutePath().toString(),sampleNameField,
ImmutableMap.of(flowcellField, filenameField[1], laneField, filenameField[3]));
} else {
myLogger.error("Error in parsing file name: " + fastQpath.toString());
myLogger.error(" The filename does not contain either 3, 4, or 5 underscore-delimited values.");
myLogger.error(" Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
return null;
}
return annoTL;
}
/**
* Produces a trie for sorting the read
* @param taxaList the taxaList of the current flowcell lanes that is annotated with barcode information
* @param masterTaxaList the mastertaxaList provides the taxaIndex
* @param myEnzyme
* @return Barcode trie for examining the prefixes
*/
public static BarcodeTrie initializeBarcodeTrie(ArrayList taxaList, TaxaList masterTaxaList,
EnzymeList.Enzyme myEnzyme){
BarcodeTrie aTrie=new BarcodeTrie();
for (Taxon taxon : taxaList) {
int masterIndex=masterTaxaList.indexOf(taxon.getName());
GeneralAnnotation annotation = taxon.getAnnotation();
String[] myTissues = annotation.getTextAnnotation("Tissue");
// Tissue should be stored as annotation against the taxon
Barcode theBC = null;
theBC = new Barcode(annotation.getTextAnnotation(barcodeField)[0], myEnzyme.initialCutSiteRemnant(), taxon.getName(),
masterIndex,annotation.getTextAnnotation(flowcellField)[0],annotation.getTextAnnotation("Lane")[0]);
aTrie.addBarcode(theBC);
}
return aTrie;
}
// THis one sends in a tissue list - is called from the RNASeq pipeline
public static BarcodeTrie initializeBarcodeTrie(ArrayList taxaList, TaxaList masterTaxaList,
ArrayList masterTissueList, EnzymeList.Enzyme myEnzyme){
BarcodeTrie aTrie=new BarcodeTrie();
for (Taxon taxon : taxaList) {
int masterIndex=masterTaxaList.indexOf(taxon.getName());
GeneralAnnotation annotation = taxon.getAnnotation();
String[] myTissues = annotation.getTextAnnotation("Tissue");
// Tissue should be stored as annotation against the taxon
Barcode theBC = null;
if (myTissues.length > 0) {
int masterTissueIndex = masterTissueList.indexOf(myTissues[0]);
// keyfile had Tissue column: tissues were added to taxon annotations
theBC = new Barcode(annotation.getTextAnnotation(barcodeField)[0], myEnzyme.initialCutSiteRemnant(), taxon.getName(),
masterIndex,myTissues[0], masterTissueIndex,
annotation.getTextAnnotation(flowcellField)[0],annotation.getTextAnnotation("Lane")[0]);
} else { // no tissue variables in the taxon annotations
theBC = new Barcode(annotation.getTextAnnotation(barcodeField)[0], myEnzyme.initialCutSiteRemnant(), taxon.getName(),
masterIndex,annotation.getTextAnnotation(flowcellField)[0],annotation.getTextAnnotation("Lane")[0]);
}
aTrie.addBarcode(theBC);
}
return aTrie;
}
/**
* Produces a list of fastq files that are represented by the plugin's keyfile
* @param directoryFiles: List of all the files in the directory
* @return filesToProcess: List of only those files that should be processed
*/
public static List culledFiles(ListdirectoryFiles,Path keyFile ) {
List filesToProcess = new ArrayList();
// Get map of flowcell/lanes from the key file
String keyFileName = keyFile.toString();
ListMultimap keyFileValues = parseKeyfileIntoMap(keyFileName);
if (keyFileValues.isEmpty()) return filesToProcess; // no entries
// for each file in the directory, check if the flowcell and lane are represented
// The directoryFile list is in alphabetical order. It is quicker to run a non-parallel
// stream and skip sorting than run with parallel and have to sort at the end (entries
// in filesToProcess are not in alphabetical order when parallelStream is used).
// Alphabetical order is necessary to ensure consistency of tags removed by
// "removeTagsWithoutReplication" when multiple runs are performed.
directoryFiles.stream()
.forEach(directoryFile -> {
String[] filenameField = directoryFile.getFileName().toString().split("_");
if (filenameField.length == 3) {
if (keyFileValues.containsEntry(filenameField[0],filenameField[1])) {
filesToProcess.add(directoryFile);
}
} else if (filenameField.length == 4) {
if (keyFileValues.containsEntry(filenameField[0],filenameField[2])) {
filesToProcess.add(directoryFile);
}
}
else if (filenameField.length == 5) {
if (keyFileValues.containsEntry(filenameField[1],filenameField[3])) {
filesToProcess.add(directoryFile);
}
}
else {
myLogger.error("Error in parsing file name: " + directoryFile.toString());
myLogger.error(" The filename does not contain either 3, 4, or 5 underscore-delimited values.");
myLogger.error(" Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
myLogger.error(" " + directoryFile.toString() + " will not be processed.");
}
});
return filesToProcess;
}
/**
* Parses a tab-delimited keyFile storing the flow cell and lane values into a multimap.
* The flow cell is the key, which may have multiple associated lanes.
*
* @param
* @return
*/
public static ListMultimap parseKeyfileIntoMap(String fileName) {
if (fileName == null) {
return null;
}
ImmutableListMultimap.Builder mMap = new ImmutableListMultimap.Builder()
.orderKeysBy(Ordering.natural()).orderValuesBy(Ordering.natural());
try {
BufferedReader fileIn = Utils.getBufferedReader(fileName, 1000000);
fileIn.mark(1 << 16);
String line = fileIn.readLine();
int indexOfFlowcell = 0, indexOfLane = 0;
//parse headers
if (line.contains(flowcellField)) {
int idx = 0;
for (String header : line.split("\\t")) {
if (header.equals(flowcellField)) {
indexOfFlowcell = idx;
}
if (header.equals(laneField)) {
indexOfLane = idx;
}
idx++;
}
} else {
fileIn.reset();
}
// create list of flowcells and lanes
while ((line = fileIn.readLine()) != null) {
String[] myString = line.split("\\t");
String myFlowCell = myString[indexOfFlowcell];
String myLane = myString[indexOfLane];
mMap.put(myFlowCell,myLane);
}
} catch (Exception e) {
System.err.println("Error in Reading Parsing Key File:" + fileName);
e.printStackTrace();
}
return mMap.build();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy