net.maizegenetics.analysis.data.AdjustPhasingPlugin Maven / Gradle / Ivy
/*
* AdjustPhasingPlugin
*
* Created on Feb 17, 2017
*/
package net.maizegenetics.analysis.data;
import java.awt.Frame;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import javax.swing.ImageIcon;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.plugindef.PluginParameter;
import net.maizegenetics.util.BitSet;
import net.maizegenetics.util.OpenBitSet;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
/**
*
* @author Terry Casstevens
*/
public class AdjustPhasingPlugin extends AbstractPlugin {
private static final Logger myLogger = Logger.getLogger(AdjustPhasingPlugin.class);
private static final int NUM_VCF_HEADER_COLUMNS = 9;
private PluginParameter myInputVCFFile = new PluginParameter.Builder<>("inputVCFFile", null, String.class)
.description("Input VCF file")
.inFile()
.guiName("Input VCF File")
.required(true)
.build();
private PluginParameter myHapcutDir = new PluginParameter.Builder<>("hapcutDir", null, String.class)
.description("Directory containing Hapcut output files.")
.inDir()
.required(true)
.build();
private PluginParameter myOutputVCFFile = new PluginParameter.Builder<>("outputVCFFile", null, String.class)
.description("Output VCF file")
.outFile()
.guiName("Output VCF File")
.required(true)
.build();
private int myNumTaxa = 0;
private String[] myTaxaNames = null;
private int myNumSites = 0;
public AdjustPhasingPlugin(Frame parentFrame, boolean isInteractive) {
super(parentFrame, isInteractive);
}
@Override
public DataSet processData(DataSet input) {
myNumSites = Utils.getNumberLinesNotHashOrBlank(inputVCFFile());
try {
ExecutorService pool = Executors.newWorkStealingPool();
List> futures = new ArrayList<>();
File temp = new File(hapcutDir());
for (File current : temp.listFiles()) {
String name = current.getCanonicalPath();
if (name.endsWith("_haplotypes")) {
ProcessHapcut process = new ProcessHapcut(name, myNumSites);
futures.add(pool.submit(process));
}
}
String[] positionsFromHapcut = new String[myNumSites];
Map processedHapcutFiles = new HashMap<>();
for (Future future : futures) {
ProcessHapcut processed = future.get();
for (int s = 0; s < myNumSites; s++) {
if (positionsFromHapcut[s] == null) {
positionsFromHapcut[s] = processed.myPositions[s];
} else if (processed.myPositions[s] != null && !positionsFromHapcut[s].equals(processed.myPositions[s])) {
throw new IllegalStateException("AdjustPhasingPlugin: position at site: " + s + " in: " + processed.myFilename + ": " + processed.myPositions[s] + " doesn't match other Hapcut files: " + positionsFromHapcut[s]);
}
}
processedHapcutFiles.put(processed.taxaName(), processed);
myLogger.info("finished processing: " + processed.filename() + " taxa: " + processed.taxaName());
}
try (BufferedReader reader = Utils.getBufferedReader(inputVCFFile());
BufferedWriter writer = Utils.getBufferedWriter(outputVCFFile())) {
String line = reader.readLine();
while (line != null && line.startsWith("##")) {
writer.write(line);
line = reader.readLine();
}
if (line == null || !line.startsWith("#CHROM")) {
throw new IllegalArgumentException("AdjustPhasingPlugin: processData: First line after ## lines should be header #CHROM...");
}
writer.write(line);
writer.write('\n');
// Taxa Names
String[] tokens = line.split("\t");
myNumTaxa = tokens.length - NUM_VCF_HEADER_COLUMNS;
myLogger.info("Number of Taxa: " + myNumTaxa);
myLogger.info("Number of Sites: " + myNumSites);
myTaxaNames = new String[myNumTaxa];
Switch[] switches = new Switch[myNumTaxa];
int numIntersectingTaxa = 0;
int numTaxaWithOutHapcut = 0;
for (int i = NUM_VCF_HEADER_COLUMNS; i < tokens.length; i++) {
myTaxaNames[i - NUM_VCF_HEADER_COLUMNS] = tokens[i];
ProcessHapcut current = processedHapcutFiles.remove(tokens[i]);
if (current == null) {
switches[i - NUM_VCF_HEADER_COLUMNS] = FALSE_SWITCH;
numTaxaWithOutHapcut++;
} else {
switches[i - NUM_VCF_HEADER_COLUMNS] = current.result();
numIntersectingTaxa++;
}
}
int numHapcutNotInVCF = processedHapcutFiles.size();
myLogger.info("Number intersecting taxa between Hapcut files and VCF: " + numIntersectingTaxa);
myLogger.info("Number Taxa in VCF without Hapcut files: " + numTaxaWithOutHapcut);
myLogger.info("Number Hapcut files without taxa in VCF: " + numHapcutNotInVCF);
BlockingQueue> queue = new LinkedBlockingQueue<>();
Future> readFuture = pool.submit(new ReadLines(reader, queue, switches, pool));
Future> writeFuture = pool.submit(new WriteLines(writer, queue));
readFuture.get();
writeFuture.get();
} catch (Exception ex) {
myLogger.debug(ex.getMessage(), ex);
throw new IllegalStateException("AdjustPhasingPlugin: processData: problem converting file: " + inputVCFFile() + " to file: " + outputVCFFile() + " error: " + ex.getMessage());
}
pool.shutdown();
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
}
return null;
}
/**
* Input VCF file
*
* @return Input VCF File
*/
public String inputVCFFile() {
return myInputVCFFile.value();
}
/**
* Set Input VCF File. Input file
*
* @param value Input VCF File
*
* @return this plugin
*/
public AdjustPhasingPlugin inputVCFFile(String value) {
myInputVCFFile = new PluginParameter<>(myInputVCFFile, value);
return this;
}
/**
* Directory containing Hapcut output files.
*
* @return Hapcut Dir
*/
public String hapcutDir() {
return myHapcutDir.value();
}
/**
* Set Hapcut Dir. Directory containing Hapcut output files.
*
* @param value Hapcut Dir
*
* @return this plugin
*/
public AdjustPhasingPlugin hapcutDir(String value) {
myHapcutDir = new PluginParameter<>(myHapcutDir, value);
return this;
}
/**
* Output VCF file
*
* @return Output VCF File
*/
public String outputVCFFile() {
return myOutputVCFFile.value();
}
/**
* Set Output VCF File. Output VCF file
*
* @param value Output VCF File
*
* @return this plugin
*/
public AdjustPhasingPlugin outputVCFFile(String value) {
myOutputVCFFile = new PluginParameter<>(myOutputVCFFile, value);
return this;
}
@Override
public ImageIcon getIcon() {
return null;
}
@Override
public String getButtonName() {
return "Adjust Phasing";
}
@Override
public String getToolTipText() {
return "Adjust Phasing";
}
private class ProcessHapcut implements Callable {
private final String myFilename;
private final String myLogfile;
private final String myTaxaName;
private final BitSet myResult;
private final String[] myPositions;
private final List myChromosomes = new ArrayList<>();
private final List myOffsets = new ArrayList<>();
public ProcessHapcut(String filename, int numSites) {
myFilename = filename;
myLogfile = filename + ".log";
myTaxaName = Utils.getFilename(filename).split("_")[0];
myResult = new OpenBitSet(numSites);
myPositions = new String[numSites];
}
@Override
public ProcessHapcut call() throws Exception {
try (BufferedReader reader = Utils.getBufferedReader(myFilename);
BufferedWriter log = Utils.getBufferedWriter(myLogfile)) {
String line = reader.readLine();
if (line == null) {
myLogger.warn("Hapcut file: " + myFilename + " is empty.");
}
int blockNum = 0;
while (line != null) {
if (line.startsWith("BLOCK")) {
blockNum++;
processBlock(reader, line, blockNum, log);
} else {
throw new IllegalStateException("AdjustPhasingPlugin: expected BLOCK statement: " + line);
}
line = reader.readLine();
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("AdjustPhasingPlugin: ProcessHapcut: problem reading file: " + myFilename + ": " + e.getMessage());
}
return this;
}
private void processBlock(BufferedReader reader, String blockLine, int blockNum, BufferedWriter log) {
String currentChr = null;
int highestChrIndex = -1;
List[] phased = new List[2];
phased[0] = new ArrayList<>();
phased[1] = new ArrayList<>();
Map blockLines = new HashMap<>();
try {
String line = reader.readLine();
while (line != null && !line.startsWith("********")) {
String[] tokens = line.split("\t");
if (currentChr == null) {
// tokens[3] is chromosome
currentChr = tokens[3];
} else if (!currentChr.equals(tokens[3])) {
throw new IllegalStateException("AdjustPhasingPlugin: ProcessHapcut: Different chr: " + tokens[3] + " within block that started with chr: " + currentChr);
}
// tokens[0] site number plus 1
int tempIndex = Integer.parseInt(tokens[0]) - 1;
blockLines.put(tempIndex, line);
if (highestChrIndex >= tempIndex) {
throw new IllegalStateException("AdjustPhasingPlugin: ProcessHapcut: index out of order: " + tempIndex);
} else {
highestChrIndex = tempIndex;
}
// tokens[4] is physical position
myPositions[tempIndex] = tokens[4];
// tokens[1] haploid A
// tokens[2] haploid B
// tokens[7] VCF genotype field
String[] fromVCF = tokens[7].split(":")[0].split("\\|");
if (tokens[1].equals(fromVCF[0]) || tokens[2].equals(fromVCF[1])) {
phased[0].add(tempIndex);
} else {
phased[1].add(tempIndex);
}
line = reader.readLine();
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("AdjustPhasingPlugin: ProcessHapcut: problem reading file: " + myFilename + ": " + e.getMessage());
}
int chrIndex = myChromosomes.indexOf(currentChr);
if (chrIndex == -1) {
myChromosomes.add(currentChr);
myOffsets.add(highestChrIndex);
} else if (myOffsets.get(chrIndex) < highestChrIndex) {
myOffsets.add(chrIndex, highestChrIndex);
}
if (!phased[0].isEmpty() || !phased[1].isEmpty()) {
List smallest = null;
if (phased[0].size() < phased[1].size()) {
smallest = phased[0];
} else {
smallest = phased[1];
}
for (Integer index : smallest) {
myResult.fastSet(index);
try {
log.write(String.valueOf(blockNum));
log.write("\t");
log.write(blockLines.get(index));
log.write("\n");
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
}
}
}
}
public String filename() {
return myFilename;
}
public String taxaName() {
return myTaxaName;
}
public Switch result() {
if (myResult.cardinality() == 0) {
return FALSE_SWITCH;
} else {
return new SwitchBitSet(myResult);
}
}
public String[] positions() {
return myPositions;
}
public List chromosomes() {
return myChromosomes;
}
public List offsets() {
return myOffsets;
}
}
private final Switch FALSE_SWITCH = new Switch();
private class Switch {
public boolean alleles(int site) {
return false;
}
}
private class SwitchBitSet extends Switch {
private final BitSet myBitSet;
public SwitchBitSet(BitSet bitSet) {
myBitSet = bitSet;
}
public boolean alleles(int site) {
return myBitSet.fastGet(site);
}
}
private class ProcessLines implements Callable {
private final int myStartSite;
private final String[] myLines;
private final Switch[] mySwitches;
private final StringBuilder myBuilder = new StringBuilder();
private final boolean myIsFinalProcessLines;
private final int[] myNumSwitchesTaxa;
private final int[] myNumSwitchesSites;
public ProcessLines(int startSite, String[] lines, Switch[] switches) {
myStartSite = startSite;
myLines = lines;
mySwitches = switches;
myIsFinalProcessLines = false;
myNumSwitchesTaxa = new int[myNumTaxa];
myNumSwitchesSites = new int[myLines.length];
}
public ProcessLines() {
myStartSite = 0;
myLines = new String[0];
mySwitches = null;
myIsFinalProcessLines = true;
myNumSwitchesTaxa = new int[0];
myNumSwitchesSites = new int[0];
}
@Override
public ProcessLines call() throws Exception {
int currentSite = myStartSite;
for (String current : myLines) {
char[] temp = current.toCharArray();
int numChars = temp.length;
int copyToNthTab = NUM_VCF_HEADER_COLUMNS;
int numTabsFound = 0;
int startIndex = 0;
int charIndex = 0;
for (int t = 0; t < myNumTaxa; t++) {
if (mySwitches[t].alleles(currentSite)) {
myNumSwitchesTaxa[t]++;
myNumSwitchesSites[currentSite - myStartSite]++;
while (true) {
if (temp[charIndex] == '\t') {
numTabsFound++;
if (numTabsFound == copyToNthTab) {
myBuilder.append(temp, startIndex, charIndex - startIndex + 1);
charIndex++;
char first = temp[charIndex];
charIndex++;
if (temp[charIndex] != '|') {
throw new IllegalStateException("AdjustPhasingPlugin: ProcessLines: | char expected.");
}
charIndex++;
myBuilder.append(temp[charIndex]);
myBuilder.append('|');
myBuilder.append(first);
charIndex++;
while (charIndex < numChars && temp[charIndex] != '\t') {
myBuilder.append(temp[charIndex]);
charIndex++;
}
if (charIndex < numChars) {
myBuilder.append('\t');
} else {
myBuilder.append('\n');
}
startIndex = charIndex + 1;
break;
}
}
charIndex++;
}
copyToNthTab++;
} else {
copyToNthTab++;
}
}
if (startIndex < numChars) {
myBuilder.append(temp, startIndex, numChars - startIndex);
myBuilder.append('\n');
}
currentSite++;
}
return this;
}
}
private static final int NUM_LINES_PER_BLOCK = 100;
private class ReadLines implements Runnable {
private final BufferedReader myReader;
private final BlockingQueue> myQueue;
private final Switch[] mySwitches;
private final ExecutorService myPool;
public ReadLines(BufferedReader reader, BlockingQueue> queue, Switch[] switches, ExecutorService pool) {
myReader = reader;
myQueue = queue;
mySwitches = switches;
myPool = pool;
}
@Override
public void run() {
try {
List temp = new ArrayList<>();
String line = myReader.readLine();
int count = 0;
int startSite = 0;
while (line != null) {
temp.add(line);
count++;
if (count == NUM_LINES_PER_BLOCK) {
String[] lines = new String[NUM_LINES_PER_BLOCK];
myQueue.add(myPool.submit(new ProcessLines(startSite, temp.toArray(lines), mySwitches)));
temp.clear();
count = 0;
startSite += NUM_LINES_PER_BLOCK;
}
line = myReader.readLine();
}
if (!temp.isEmpty()) {
String[] lines = new String[temp.size()];
myQueue.add(myPool.submit(new ProcessLines(startSite, temp.toArray(lines), mySwitches)));
}
myQueue.add(myPool.submit(new ProcessLines()));
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("AdjustPhasingPlugin: ReadLines: problem reading file: " + inputVCFFile());
}
}
}
private class WriteLines implements Runnable {
private final BufferedWriter myWriter;
private final BlockingQueue> myQueue;
public WriteLines(BufferedWriter writer, BlockingQueue> queue) {
myWriter = writer;
myQueue = queue;
}
@Override
public void run() {
int[] numSwitchesTaxa = new int[myNumTaxa];
int[] numSwitchesSites = new int[myNumSites];
try {
Future future = myQueue.take();
ProcessLines processed = future.get();
while (!processed.myIsFinalProcessLines) {
myWriter.write(processed.myBuilder.toString());
int percent = Math.min(Math.abs((int) ((double) processed.myStartSite / (double) myNumSites * 100.0)), 100);
progress(percent, this);
for (int t = 0; t < myNumTaxa; t++) {
numSwitchesTaxa[t] += processed.myNumSwitchesTaxa[t];
}
System.arraycopy(processed.myNumSwitchesSites, 0, numSwitchesSites, processed.myStartSite, processed.myLines.length);
future = myQueue.take();
processed = future.get();
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("AdjustPhasingPlugin: WriteLines: problem writing file: " + outputVCFFile());
}
for (int t = 0; t < myNumTaxa; t++) {
System.out.println(t + ": " + myTaxaNames[t] + ": alleles switched: " + numSwitchesTaxa[t]);
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy