package net.maizegenetics.analysis.data;

import net.maizegenetics.dna.snp.ExportUtils;
import net.maizegenetics.dna.snp.FilterGenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.plugindef.AbstractPlugin;
import net.maizegenetics.plugindef.DataSet;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.util.Utils;

import org.apache.log4j.Logger;

import javax.swing.*;
import java.awt.*;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.Collections;
import java.util.LinkedList;
import java.util.TreeSet;
import java.util.regex.Pattern;
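
/**
 * Extracts a subset of taxa, and optionally only sites with a minimum number
 * of alleles, from a hapmap genotype file. The input, output, and taxa
 * (pedigree) files are supplied as command-line parameters; an input file
 * ending in ".h5" is handled through the HDF5 code path instead of the text
 * parser.
 */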
public class ExtractHapmapSubsetPlugin extends AbstractPlugin {

    private static final Logger myLogger = Logger.getLogger(ExtractHapmapSubsetPlugin.class);
    private static final Pattern tab = Pattern.compile("\t");

    private int minAlleleCount = 0;     // minimum number of alleles a site must have to be written (-a)
    private String inputFilename = "";  // full hapmap (or .h5) data set (-h)
    private String outputFilename = ""; // destination for the extracted subset (-o)
    private String pedFilename = "";    // taxa list, one taxon name per line (-p)

    public ExtractHapmapSubsetPlugin(Frame parentFrame) {
        super(parentFrame, false);
    }

    @Override
    public DataSet performFunction(DataSet input) {
        extractSubset();
        return null;
    }
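
    /**
     * Parses the command-line arguments. Each flag consumes the token that
     * follows it: -h/-inputfile, -o/-outputfile, -p/-pedfile, and the optional
     * -a/-minalleles, whose value must be an integer.
     */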
    @Override
    public void setParameters(String[] args) {
        if (args == null || args.length < 3) {
            myLogger.error(getUsage());
            return;
        }
        int narg = args.length;
        for (int i = 0; i < narg - 1; i++) {
            if (args[i].equals("-h") || args[i].equalsIgnoreCase("-inputfile")) {
                inputFilename = args[++i];
            } else if (args[i].equals("-o") || args[i].equalsIgnoreCase("-outputfile")) {
                outputFilename = args[++i];
            } else if (args[i].equals("-p") || args[i].equalsIgnoreCase("-pedfile")) {
                pedFilename = args[++i];
            } else if (args[i].equals("-a") || args[i].equalsIgnoreCase("-minalleles")) {
                minAlleleCount = Integer.parseInt(args[++i]);
            }
        }
    }

    public String getUsage() {
        StringBuilder usage = new StringBuilder("The ExtractHapmapSubsetPlugin requires the following parameters:\n");
        usage.append("-h or -inputfile : a hapmap file containing the full data set.\n");
        usage.append("-o or -outputfile : the name of the file that will contain the subset specified by the pedfile.\n");
        usage.append("-p or -pedfile : the pedigree file containing a list of taxa to be extracted.\n");
        usage.append("The following parameter is optional:\n");
        usage.append("-a or -minalleles : only SNPs with at least the minimum number of alleles will be extracted. Default = 0.\n\n");
        return usage.toString();
    }

    @Override
    public ImageIcon getIcon() {
        return null;
    }

    @Override
    public String getButtonName() {
        return null;
    }

    @Override
    public String getToolTipText() {
        return null;
    }
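
    /**
     * Extracts the requested taxa from a tab-delimited hapmap text file. The
     * first 11 columns of a hapmap file hold site metadata (rs#, alleles,
     * chrom, pos, strand, and so on), so taxon name columns start at index 11.
     * Inputs ending in ".h5" are delegated to extractSubsetFromHDF5().
     */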
    public void extractSubset() {
        if (inputFilename.length() == 0 || outputFilename.length() == 0 || pedFilename.length() == 0) {
            myLogger.info("Extract taxa requires three parameters: the input hapmap file, the output hapmap file, and the pedigree file.");
            myLogger.info(getUsage());
            System.exit(-1);
        }
        if (inputFilename.endsWith(".h5")) {
            extractSubsetFromHDF5();
            return;
        }
        try {
            // read the list of taxa to keep, one name per line
            LinkedList<String> pedlist = new LinkedList<String>();
            System.out.println("Reading taxa file, " + pedFilename);
            BufferedReader br = Utils.getBufferedReader(pedFilename);
            String input;
            while ((input = br.readLine()) != null) {
                pedlist.add(input);
            }
            System.out.println(pedlist.size() + " taxa read from the taxa file");
            br.close();

            // find the header columns whose taxon names match the requested list
            br = Utils.getBufferedReader(inputFilename);
            System.out.println("Reading hapmap file, " + inputFilename);
            input = br.readLine();
            String[] info = tab.split(input);
            int n = info.length;
            LinkedList<Integer> colList = new LinkedList<Integer>();
            Collections.sort(pedlist); // binarySearch below requires a sorted list
            for (int i = 11; i < n; i++) {
                if (Collections.binarySearch(pedlist, info[i]) > -1) colList.add(i);
            }
            System.out.println(colList.size() + " matching taxa found in the HapMap file");

            // write the header: the 11 metadata columns, then the matching taxa
            BufferedWriter bw = Utils.getBufferedWriter(outputFilename);
            bw.write(info[0]);
            for (int i = 1; i < 11; i++) {
                bw.write("\t");
                bw.write(info[i]);
            }
            for (Integer col : colList) {
                bw.write("\t");
                bw.write(info[col]);
            }
            bw.newLine();
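
            // With -a set, collect the distinct genotype strings seen at each
            // site across the selected taxa. "N" marks missing data, so when it
            // is present the threshold is raised by one so that "N" itself does
            // not count toward the minimum allele count.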
            if (minAlleleCount > 0) {
                while ((input = br.readLine()) != null) {
                    info = tab.split(input);
                    TreeSet<String> alleles = new TreeSet<String>();
                    StringBuilder sb = new StringBuilder(info[0]);
                    for (int i = 1; i < 11; i++) sb.append("\t").append(info[i]);
                    for (Integer col : colList) {
                        alleles.add(info[col]);
                        sb.append("\t").append(info[col]);
                    }
                    int nAlleles = alleles.size();
                    boolean hasN = alleles.contains("N");
                    if ((hasN && nAlleles >= minAlleleCount + 1) || (!hasN && nAlleles >= minAlleleCount)) {
                        bw.write(sb.toString());
                        bw.newLine();
                    }
                }
            } else {
                // no site filter: copy every row, keeping only the selected columns
                while ((input = br.readLine()) != null) {
                    info = tab.split(input);
                    StringBuilder sb = new StringBuilder(info[0]);
                    for (int i = 1; i < 11; i++) sb.append("\t").append(info[i]);
                    for (Integer col : colList) {
                        sb.append("\t").append(info[col]);
                    }
                    bw.write(sb.toString());
                    bw.newLine();
                }
            }
            br.close();
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }
        System.out.println("Finished. Output written to " + outputFilename);
    }
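
    /**
     * HDF5 code path: loads the genotypes via FileLoadPlugin, filters them to
     * the taxa named in the pedigree file, and writes the result with
     * ExportUtils.writeGenotypeHDF5(). Note that the result is only written
     * when the output file name contains ".h5"; otherwise this path reports
     * timings but writes nothing.
     */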
    public void extractSubsetFromHDF5() {
        long start = System.currentTimeMillis();
        FileLoadPlugin flp = new FileLoadPlugin(null, false);
        flp.setOpenFiles(new String[]{inputFilename});
        DataSet ds = flp.performFunction(null);
        GenotypeTable a = (GenotypeTable) ds.getData(0).getData();
        System.out.format("Time elapsed for reading hdf5 file = %d\n", System.currentTimeMillis() - start);

        // read the list of taxa to keep, one name per line
        LinkedList<String> pedlist = new LinkedList<String>();
        System.out.println("Reading taxa file, " + pedFilename);
        try {
            BufferedReader br = Utils.getBufferedReader(pedFilename);
            String input;
            while ((input = br.readLine()) != null) {
                pedlist.add(input);
            }
            System.out.println(pedlist.size() + " taxa read from the taxa file");
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(-1);
        }

        String[] taxanames = new String[pedlist.size()];
        pedlist.toArray(taxanames);
        TaxaList ids = new TaxaListBuilder().addAll(taxanames).build();

        start = System.currentTimeMillis();
        GenotypeTable b = FilterGenotypeTable.getInstance(a, ids);
        System.out.format("Time elapsed for filtering alignment = %d\n", System.currentTimeMillis() - start);

        start = System.currentTimeMillis();
        if (outputFilename.contains(".h5")) ExportUtils.writeGenotypeHDF5(b, outputFilename);
        System.out.format("Time elapsed for writing new file = %d\n", System.currentTimeMillis() - start);
    }
}
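
// A minimal usage sketch, not part of the original source: the file names
// below are hypothetical placeholders. The plugin is built with a null parent
// frame for headless use, configured through setParameters(), and run through
// performFunction().
class ExtractHapmapSubsetExample {

    public static void main(String[] args) {
        ExtractHapmapSubsetPlugin plugin = new ExtractHapmapSubsetPlugin(null);
        plugin.setParameters(new String[]{
                "-h", "fullDataSet.hmp.txt", // hypothetical input hapmap file
                "-o", "subset.hmp.txt",      // hypothetical output file
                "-p", "taxaToKeep.txt",      // hypothetical taxa list, one name per line
                "-a", "2"                    // keep only sites with at least two alleles
        });
        plugin.performFunction(null);
    }
}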