net.maizegenetics.dna.snp.io.BuilderFromPLINK Maven / Gradle / Ivy
package net.maizegenetics.dna.snp.io;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTable;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.Utils;
import org.apache.log4j.Logger;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
import static net.maizegenetics.dna.snp.NucleotideAlignmentConstants.*;
/**
* @author Terry Casstevens
* @author Ed Buckler
*/
public class BuilderFromPLINK {
private static final Logger myLogger = Logger.getLogger(BuilderFromPLINK.class);
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
private static final int NUM_PLINK_NON_SITE_HEADERS = 6;
private final String myPedFile;
private final String myMapFile;
private final ProgressListener myProgressListener;
private boolean mySortTaxaAlphabetically = false;
private boolean mySortPositions = false;
private BuilderFromPLINK(String pedfile, String mapfile, ProgressListener listener) {
myPedFile = pedfile;
myMapFile = mapfile;
myProgressListener = listener;
}
public static BuilderFromPLINK getBuilder(String pedfile, String mapfile, ProgressListener listener) {
return new BuilderFromPLINK(pedfile, mapfile, listener);
}
public GenotypeTable build() {
GenotypeTable result = null;
try {
int numThreads = Runtime.getRuntime().availableProcessors();
ExecutorService pool = Executors.newFixedThreadPool(numThreads);
myLogger.info("Reading: " + myPedFile + " and " + myMapFile);
PositionListBuilder posBuild = processSites(myMapFile);
myLogger.info("Number of sites: " + posBuild.size());
int numOfTaxa = Utils.getNumberLinesNotHashOrBlank(myPedFile);
myLogger.info("Number of taxa: " + numOfTaxa);
GenotypeCallTableBuilder genotypeCallTableBuilder = GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(numOfTaxa, posBuild.size());
int linesAtTime = (int) Math.ceil((1 << 24) / posBuild.size());
ArrayList textLines = new ArrayList<>(linesAtTime);
List> futures = new ArrayList<>();
int numLines = 0;
try (BufferedReader reader = Utils.getBufferedReader(myPedFile)) {
String currLine = reader.readLine();
while (currLine != null) {
textLines.add(currLine);
numLines++;
if (numLines % linesAtTime == 0) {
ProcessPLINKBlock processBlock = new ProcessPLINKBlock(textLines, genotypeCallTableBuilder, numLines - linesAtTime, numLines * 100 / numOfTaxa);
futures.add(pool.submit(processBlock));
textLines = new ArrayList<>(linesAtTime);
}
currLine = reader.readLine();
}
}
if (textLines.size() > 0) {
ProcessPLINKBlock processBlock = new ProcessPLINKBlock(textLines, genotypeCallTableBuilder, numLines - textLines.size(), 100);
futures.add(pool.submit(processBlock));
}
TaxaListBuilder taxaBuild = new TaxaListBuilder();
for (Future future : futures) {
ProcessPLINKBlock pb = future.get();
taxaBuild.addAll(pb.getBlockTaxa());
}
TaxaList taxaList = taxaBuild.build();
pool.shutdown();
// Check that result is in correct order. If not, either try to sort
// or just throw an error (determined by what was passed to fullSort)
if (posBuild.validateOrdering() == false) {
if (mySortPositions) {
posBuild.sortPositions(genotypeCallTableBuilder);
// Double-check post-sort ordering. Should never happen, but just to be safe
if (posBuild.validateOrdering() == false) {
throw new IllegalStateException("BuilderFromPLINK: Ordering of PLINK failed.");
}
} else {
throw new IllegalStateException("BuilderFromPLINK: Ordering incorrect. PLINK must be ordered by position.");
}
}
GenotypeCallTable g = genotypeCallTableBuilder.build();
result = GenotypeTableBuilder.getInstance(g, posBuild.build(), taxaList);
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("BuilderFromPLINK: build: problem processing: " + e.getMessage());
}
return result;
}
/**
* Set the builder so that when built it will sort the taxa
*/
public BuilderFromPLINK sortTaxa() {
mySortTaxaAlphabetically = true;
return this;
}
/**
* Set the builder so that when built it will sort the positions.
*/
public BuilderFromPLINK sortPositions() {
mySortPositions = true;
return this;
}
// chromosome (1-22, X, Y or 0 if unplaced)
// rs# or snp identifier
// Genetic distance (morgans)
// Base-pair position (bp units)
private static final int PLINK_MAP_CHROMOSOME_INDEX = 0;
private static final int PLINK_MAP_SND_ID_INDEX = 1;
private static final int PLINK_MAP_GENETIC_DISTANCE_INDEX = 2;
private static final int PLINK_MAP_POSITION_INDEX = 3;
private static final int NUM_PLINK_MAP_COLUMNS = 4;
private static PositionListBuilder processSites(String mapfile) {
Map chromosomes = new HashMap<>();
List positions = new ArrayList<>();
int lineNum = 1;
try (BufferedReader reader = Utils.getBufferedReader(mapfile)) {
String line = reader.readLine();
while (line != null) {
String[] tokens = WHITESPACE_PATTERN.split(line);
if (tokens.length < NUM_PLINK_MAP_COLUMNS) {
throw new IllegalStateException("BuilderFromPLINK: processSites: Not all columns defined line : \"" + line + "\" of file: " + mapfile);
}
Chromosome chr = chromosomes.get(tokens[PLINK_MAP_CHROMOSOME_INDEX]);
if (chr == null) {
chr = Chromosome.instance(new String(tokens[PLINK_MAP_CHROMOSOME_INDEX]));
chromosomes.put(tokens[PLINK_MAP_CHROMOSOME_INDEX], chr);
}
GeneralPosition current = new GeneralPosition.Builder(chr, Integer.parseInt(tokens[PLINK_MAP_POSITION_INDEX]))
.snpName(new String(tokens[PLINK_MAP_SND_ID_INDEX])).build();
positions.add(current);
line = reader.readLine();
lineNum++;
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("BuilderFromPLINK: processSites: problem with: " + mapfile + " line: " + lineNum);
}
PositionListBuilder result = new PositionListBuilder();
result.addAll(positions);
return result;
}
// Family ID
// Individual ID
// Paternal ID
// Maternal ID
// Sex (1=male; 2=female; other=unknown)
// Phenotype
private static final int PLINK_PED_FAMILY_ID_INDEX = 0;
private static final int PLINK_PED_INDIVIDUAL_ID_INDEX = 1;
private static final int PLINK_PED_PATERNAL_ID_INDEX = 2;
private static final int PLINK_PED_MATERNAL_ID_INDEX = 3;
private static final int PLINK_PED_SEX_INDEX = 4;
private static final int PLINK_PED_PHENOTYPE_INDEX = 5;
private class ProcessPLINKBlock implements Callable {
private final int myNumTaxaToProcess;
private ArrayList myTextLines;
private final ArrayList myBlockTaxaList;
private final GenotypeCallTableBuilder myBuilder;
private final int myStartTaxon;
private final int myProgress;
private ProcessPLINKBlock(ArrayList textLines, GenotypeCallTableBuilder builder, int startTaxon, int progress) {
myNumTaxaToProcess = textLines.size();
myTextLines = textLines;
myBlockTaxaList = new ArrayList<>(myNumTaxaToProcess);
myBuilder = builder;
myStartTaxon = startTaxon;
myProgress = progress;
}
@Override
public ProcessPLINKBlock call() throws Exception {
for (int t = 0; t < myNumTaxaToProcess; t++) {
String input = myTextLines.get(t);
try {
String[] tokens = WHITESPACE_PATTERN.split(input, NUM_PLINK_NON_SITE_HEADERS + 1);
String taxonName = new String(tokens[PLINK_PED_INDIVIDUAL_ID_INDEX].trim());
Taxon taxon = new Taxon.Builder(taxonName).build();
myBlockTaxaList.add(taxon);
int taxonIndex = t + myStartTaxon;
for (int i = 0, n = tokens[NUM_PLINK_NON_SITE_HEADERS].length(); i < n; i += 4) {
myBuilder.setBase(taxonIndex, i / 4, GenotypeTableUtils.getDiploidValue(getPLINKAlleleByte(tokens[NUM_PLINK_NON_SITE_HEADERS].charAt(i)),
getPLINKAlleleByte(tokens[NUM_PLINK_NON_SITE_HEADERS].charAt(i + 2))));
}
} catch (ArrayIndexOutOfBoundsException e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("BuilderFromPLINK: ProcessPLINKBlock: problem: to many genotypes in file: " + myPedFile + " line: " + input.substring(0, Math.min(input.length(), 50)));
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("BuilderFromPLINK: ProcessPLINKBlock: problem: " + e.getMessage() + " file: " + myPedFile + " line: " + input.substring(0, Math.min(input.length(), 50)));
}
}
myTextLines = null;
if (myProgressListener != null) {
myProgressListener.progress(myProgress, null);
}
return this;
}
List getBlockTaxa() {
return myBlockTaxaList;
}
}
private static final Map PLINK_ALLELE_HASH = new HashMap<>();
private static final byte[] PLINK_ALLELE_ARRAY = new byte[256];
static {
PLINK_ALLELE_HASH.put("A", A_ALLELE);
PLINK_ALLELE_HASH.put("C", C_ALLELE);
PLINK_ALLELE_HASH.put("G", G_ALLELE);
PLINK_ALLELE_HASH.put("T", T_ALLELE);
PLINK_ALLELE_HASH.put("+", INSERT_ALLELE);
PLINK_ALLELE_HASH.put("-", GAP_ALLELE);
PLINK_ALLELE_HASH.put("N", GenotypeTable.UNKNOWN_ALLELE);
PLINK_ALLELE_HASH.put("0", GenotypeTable.UNKNOWN_ALLELE);
PLINK_ALLELE_HASH.put("1", A_ALLELE);
PLINK_ALLELE_HASH.put("2", C_ALLELE);
PLINK_ALLELE_HASH.put("3", G_ALLELE);
PLINK_ALLELE_HASH.put("4", T_ALLELE);
Arrays.fill(PLINK_ALLELE_ARRAY, UNDEFINED_ALLELE);
for (Map.Entry en : PLINK_ALLELE_HASH.entrySet()) {
PLINK_ALLELE_ARRAY[en.getKey().charAt(0)] = en.getValue();
}
}
/**
* Returns haploid byte value for given PLINK value. Only right-most four
* bits used.
*
* @param value haploid allele value
*
* @return nucleotide haploid allele byte value
*/
public static byte getPLINKAlleleByte(char value) {
try {
return PLINK_ALLELE_ARRAY[value];
} catch (NullPointerException e) {
throw new IllegalArgumentException("BuilderFromPLINK: getPLINKAlleleByte: unknown allele value: " + value);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy