net.maizegenetics.dna.snp.io.BuilderFromHapMapLIX Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tassel Show documentation
Show all versions of tassel Show documentation
TASSEL is a software package to evaluate traits associations, evolutionary patterns, and linkage
disequilibrium.
The newest version!
/*
* BuilderFromHapMapLIX
*
* Created on Aug 29, 2015
*/
package net.maizegenetics.dna.snp.io;
import com.google.common.collect.SetMultimap;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.tribble.util.ParsingUtils;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.PositionList;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.genotypecall.LineIndexHapmapGenotypeCallTable;
import static net.maizegenetics.dna.snp.io.BuilderFromHapMap.processTaxa;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.util.Tuple;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
*
* @author Terry Casstevens
*/
public class BuilderFromHapMapLIX {
private static final Logger myLogger = LogManager.getLogger(BuilderFromHapMapLIX.class);
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
private static final int NUM_HAPMAP_NON_TAXA_HEADERS = 11;
private static final int SNPID_INDEX = 0;
private static final int VARIANT_INDEX = 1;
private static final int CHROMOSOME_INDEX = 2;
private static final int POSITION_INDEX = 3;
private BuilderFromHapMapLIX() {
}
public static GenotypeTable build(String hapmapFileBGZip) {
return build(hapmapFileBGZip, ParsingUtils.appendToPath(hapmapFileBGZip, LineIndexBuilder.LINE_INDEX_FILE_EXTENSION));
}
public static GenotypeTable build(String hapmapFileBGZip, String indexFilename) {
Tuple indexPositionInfo = LineIndexBuilder.readIndex(indexFilename);
Map chromosomeLookup = new HashMap<>();
PositionListBuilder positions = new PositionListBuilder();
for (String current : indexPositionInfo.y) {
String[] tokens = current.split("\t");
String chrName = tokens[CHROMOSOME_INDEX];
Chromosome currChr = chromosomeLookup.get(chrName);
if (currChr == null) {
currChr = new Chromosome(new String(chrName));
chromosomeLookup.put(chrName, currChr);
}
String variants = tokens[VARIANT_INDEX];
int physicalPos;
try {
physicalPos = Integer.parseInt(tokens[POSITION_INDEX]);
} catch (Exception ex) {
throw new IllegalArgumentException("BuilderFromHapMapLIX: Position must be an integer: " + tokens[POSITION_INDEX]);
}
GeneralPosition.Builder positionBuilder = new GeneralPosition.Builder(currChr, physicalPos)
.snpName(tokens[SNPID_INDEX])
.knownVariants(variants);
byte glbMajor = NucleotideAlignmentConstants.getNucleotideDiploidByte(variants.charAt(0));
positionBuilder.allele(WHICH_ALLELE.GlobalMajor, glbMajor);
if (variants.length() == 3) {
byte glbMinor = NucleotideAlignmentConstants.getNucleotideDiploidByte(variants.charAt(2));
positionBuilder.allele(WHICH_ALLELE.GlobalMinor, glbMinor);
}
positions.add(positionBuilder.build());
}
PositionList positionList = positions.build();
TaxaList taxaList = null;
boolean isOneLetter = false;
try (BlockCompressedInputStream reader = new BlockCompressedInputStream(new File(hapmapFileBGZip))) {
Map> sampAnnoBuild = new TreeMap<>();
String currLine = reader.readLine();
while ((currLine != null) && currLine.startsWith("##")) {
String[] cat = currLine.split("=", 2);
if (cat.length < 2) {
continue;
}
if (cat[0].startsWith("##SAMPLE")) {
SetMultimap mapOfAnno = TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1]);
String taxaID = mapOfAnno.get("ID").iterator().next();
if (taxaID != null) {
sampAnnoBuild.put(taxaID, mapOfAnno);
}
}
currLine = reader.readLine();
}
taxaList = processTaxa(currLine, sampAnnoBuild).build();
int numTaxa = taxaList.numberOfTaxa();
currLine = reader.readLine();
String[] tokens = WHITESPACE_PATTERN.split(currLine, NUM_HAPMAP_NON_TAXA_HEADERS + 1);
if (tokens.length <= NUM_HAPMAP_NON_TAXA_HEADERS) {
throw new IllegalStateException("BuilderFromHapMapLIX: Header Incorrectly Formatted: See:\nhttps://bitbucket.org/tasseladmin/tassel-5-source/wiki/UserManual/Load/Load#markdown-header-hapmap");
}
double avg = (double) (tokens[NUM_HAPMAP_NON_TAXA_HEADERS].length() + 1) / (double) numTaxa;
if ((avg > 1.99) && (avg < 2.01)) {
isOneLetter = true;
} else if ((avg > 2.99) && (avg < 3.01)) {
isOneLetter = false;
} else {
throw new IllegalStateException("BuilderFromHapMapLIX: Genotype coded wrong use 1 or 2 letters per genotype. Or first site has incorrect number of values.");
}
} catch (Exception e) {
throw new IllegalStateException("BuilderFromHapMapLIX: Problem opening file: " + hapmapFileBGZip + "\n" + e.getMessage());
}
return GenotypeTableBuilder.getInstance(LineIndexHapmapGenotypeCallTable.getInstance(taxaList.numberOfTaxa(), positionList.numberOfSites(), false, isOneLetter, indexPositionInfo.x, hapmapFileBGZip), positionList, taxaList);
}
}