net.maizegenetics.dna.factor.io.BuilderFromHapmap.kt Maven / Gradle / Ivy
package net.maizegenetics.dna.factor.io
import com.google.common.collect.SetMultimap
import net.maizegenetics.dna.factor.FeatureTable
import net.maizegenetics.dna.factor.FeatureTableBuilder
import net.maizegenetics.dna.factor.site.SNPSiteBuilder
import net.maizegenetics.dna.map.Chromosome
import net.maizegenetics.dna.map.GenomicFeature
import net.maizegenetics.dna.map.GenomicFeatureList
import net.maizegenetics.dna.snp.GenotypeTableUtils
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants
import net.maizegenetics.taxa.TaxaListBuilder
import net.maizegenetics.taxa.TaxaListIOUtils
import net.maizegenetics.taxa.Taxon
import net.maizegenetics.util.ProgressListener
import net.maizegenetics.util.SuperByteMatrix
import net.maizegenetics.util.SuperByteMatrixBuilder
import net.maizegenetics.util.Utils
import org.apache.logging.log4j.LogManager
import java.util.*
import java.util.concurrent.*
import java.util.regex.Pattern
/*
* BuilderFromHapmap
*/
/**
*
* @author Ed Buckler
* @author Terry Casstevens
*/
class BuilderFromHapMap private constructor(private val myHapmapFile: String, private val myProgressListener: ProgressListener?) {
private var mySortPositions = false
fun build(): FeatureTable {
var pool: ExecutorService? = null
try {
Utils.getBufferedReader(myHapmapFile, 1 shl 20)!!.use { reader ->
val sampAnnoBuild = TreeMap>()
var currLine: String? = reader.readLine()
while (currLine != null && currLine.startsWith("##")) {
val cat = currLine.split("=".toRegex(), 2).toTypedArray()
if (cat.size < 2) {
continue
}
if (cat[0].startsWith("##SAMPLE")) {
val mapOfAnno = TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1])
val taxaID = mapOfAnno.get("ID").iterator().next()
if (taxaID != null) {
sampAnnoBuild[taxaID] = mapOfAnno
}
}
currLine = reader.readLine()
}
val taxaList = processTaxa(currLine, sampAnnoBuild)
val taxa = taxaList.build()
val numTaxa = taxa.numberOfTaxa()
val chromosomeLookup = ConcurrentHashMap()
currLine = reader.readLine()
var isOneLetter = false
val tokens = WHITESPACE_PATTERN.split(currLine, NUM_HAPMAP_NON_TAXA_HEADERS + 1)
if (tokens.size <= NUM_HAPMAP_NON_TAXA_HEADERS) {
throw IllegalStateException("BuilderFromHapMap: Header Incorrectly Formatted: See:\nhttps://bitbucket.org/tasseladmin/tassel-5-source/wiki/UserManual/Load/Load#markdown-header-hapmap")
}
val avg = (tokens[NUM_HAPMAP_NON_TAXA_HEADERS].length + 1).toDouble() / numTaxa.toDouble()
if (avg > 1.99 && avg < 2.01) {
isOneLetter = true
} else if (avg > 2.99 && avg < 3.01) {
isOneLetter = false
} else {
throw IllegalStateException("BuilderFromHapMap: Genotype coded wrong use 1 or 2 letters per genotype. Average chars including tab: $avg Or first site has incorrect number of values. Number of taxa: $numTaxa")
}
val numThreads = Runtime.getRuntime().availableProcessors()
pool = Executors.newFixedThreadPool(numThreads)
val futures = ArrayList>()
var numSitesToProcessTogether = NUM_VALUES_PROCESSED_TOGETHER / numTaxa
numSitesToProcessTogether = Math.min(1 shl 16, numSitesToProcessTogether)
numSitesToProcessTogether = Math.max(512, numSitesToProcessTogether)
var textLines = ArrayList(numSitesToProcessTogether)
var numLines = 0
while (currLine != null) {
textLines.add(currLine)
numLines++
if (numLines % numSitesToProcessTogether == 0) {
val processBlock = ProcessHapmapBlock(textLines, numTaxa, chromosomeLookup, isOneLetter)
futures.add(pool!!.submit(processBlock))
textLines = ArrayList(numSitesToProcessTogether)
}
currLine = reader.readLine()
}
if (textLines.size > 0) {
val processBlock = ProcessHapmapBlock(textLines, numTaxa, chromosomeLookup, isOneLetter)
futures.add(pool!!.submit(processBlock))
}
var currentSite = 0
val positions = GenomicFeatureList.Builder()
val factorTableBuilder = FeatureTableBuilder(taxa)
//val genotypes = SNPSiteBuilder.instance(numTaxa, numLines)
val numFutures = futures.size
var count = 0
for (future in futures) {
val pb = future.get()
positions.addAll(pb.positions)
val bgTS = pb.genotypes
for (s in 0 until bgTS!!.numColumns) {
val snpSite = SNPSiteBuilder(pb.positions[s], taxa)
for (t in 0 until bgTS.numRows) {
snpSite.set(t, bgTS.get(t, s))
}
factorTableBuilder.add(snpSite.build())
}
// for (t in 0 until bgTS!!.numRows) {
// val snpSite = SNPSiteBuilder(pb.positions[])
// for (s in 0 until bgTS.numColumns) {
// genotypes.set(t, currentSite + s, bgTS.get(t, s))
// }
// }
currentSite += pb.numberSitesProcessed
if (myProgressListener != null) {
count++
myProgressListener.progress(count * 100 / numFutures, null)
}
}
pool!!.shutdown()
//if (mySortPositions) {
// positions.sortPositions(genotypes)
//}
//if (positions.validateOrdering() == false) {
// throw IllegalStateException("BuilderFromHapMap: Ordering incorrect. HapMap must be ordered by position. Please first use SortGenotypeFilePlugin to correctly order the file.")
//}
//genotypes.taxa = taxa
//genotypes.factors = positions.build()
return factorTableBuilder.build()
}
} catch (e: Exception) {
if (pool != null) {
pool!!.shutdown()
}
myLogger.debug(e.message, e)
throw IllegalStateException(e.message)
}
}
private inner class ProcessHapmapBlock(private var myInputLines: List?, private val myNumTaxa: Int, private val myChromosomeLookup: MutableMap, private val myIsOneLetter: Boolean) : Callable {
private val myPositionList: MutableList
val numberSitesProcessed: Int
var genotypes: SuperByteMatrix? = null
private set
val positions: List
get() = myPositionList
init {
numberSitesProcessed = myInputLines!!.size
myPositionList = ArrayList(numberSitesProcessed)
}
override fun call(): ProcessHapmapBlock {
genotypes = SuperByteMatrixBuilder.getInstance(myNumTaxa, numberSitesProcessed)
for (site in 0 until numberSitesProcessed) {
val input = myInputLines!![site]
try {
val tabPos = IntArray(NUM_HAPMAP_NON_TAXA_HEADERS)
var tabIndex = 0
val len = input.length
run {
var i = 0
while (tabIndex < NUM_HAPMAP_NON_TAXA_HEADERS && i < len) {
if (input[i] == '\t') {
tabPos[tabIndex++] = i
}
i++
}
}
val chrName = input.substring(tabPos[CHROMOSOME_INDEX - 1] + 1, tabPos[CHROMOSOME_INDEX])
val tempChr = myChromosomeLookup[chrName]
val currChr = when (tempChr) {
null -> {
Chromosome.instance(chrName).also { myChromosomeLookup[chrName] = it }
}
else -> tempChr
}
val variants = input.substring(tabPos[VARIANT_INDEX - 1] + 1, tabPos[VARIANT_INDEX])
val physicalPos: Int
try {
physicalPos = Integer.parseInt(input.substring(tabPos[POSITION_INDEX - 1] + 1, tabPos[POSITION_INDEX]))
} catch (ex: Exception) {
throw IllegalArgumentException("BuilderFromHapMap: Position must be an integer: " + input.substring(tabPos[POSITION_INDEX - 1] + 1, tabPos[POSITION_INDEX]).trim { it <= ' ' })
}
//input.substring(0, tabPos[SNPID_INDEX]),
val apb = GenomicFeature(currChr, physicalPos)
// TODO("knownVariants(variants) //TODO strand, variants,")
//val glbMajor = NucleotideAlignmentConstants.getNucleotideDiploidByte(variants[0])
//apb.allele(WHICH_ALLELE.GlobalMajor, glbMajor)
//if (variants.length == 3) {
// val glbMinor = NucleotideAlignmentConstants.getNucleotideDiploidByte(variants[2])
// apb.allele(WHICH_ALLELE.GlobalMinor, glbMinor)
//}
myPositionList.add(apb)
val offset = tabPos[NUM_HAPMAP_NON_TAXA_HEADERS - 1] + 1
var taxon = 0
if (myIsOneLetter) {
var i = offset
while (i < len) {
if (taxon >= myNumTaxa) {
throw IllegalStateException("BuilderFromHapMap: SNP at Chromosome: ${myPositionList[myPositionList.size - 1].startChr} Position: ${myPositionList[myPositionList.size - 1].startPos} has too many values.")
}
val value = NucleotideAlignmentConstants.getNucleotideDiploidByte(input[i])
if (value == NucleotideAlignmentConstants.UNDEFINED_HOMOZYGOUS) {
throw IllegalStateException("BuilderFromHapMap: SNP at Chromosome: ${myPositionList[myPositionList.size - 1].startChr} Position: ${myPositionList[myPositionList.size - 1].startPos} has illegal value: " + input[i])
}
genotypes!!.set(taxon++, site, value)
i += 2
}
} else {
var i = offset
while (i < len) {
if (taxon >= myNumTaxa) {
throw IllegalStateException("BuilderFromHapMap: SNP at Chromosome: ${myPositionList[myPositionList.size - 1].startChr} Position: ${myPositionList[myPositionList.size - 1].startPos} has too many values.")
}
// there is a phasing conflict with the existing import approach
val value = GenotypeTableUtils.getDiploidValue(NucleotideAlignmentConstants.getNucleotideDiploidByte(input[i + 1]),
NucleotideAlignmentConstants.getNucleotideDiploidByte(input[i]))
if (value == NucleotideAlignmentConstants.UNDEFINED_HOMOZYGOUS) {
throw IllegalStateException("BuilderFromHapMap: SNP at Chromosome: ${myPositionList[myPositionList.size - 1].startChr} Position: ${myPositionList[myPositionList.size - 1].startPos} has illegal value: " + input[i] + input[i + 1])
}
genotypes!!.set(taxon++, site, value)
i += 3
}
}
if (taxon != myNumTaxa) {
throw IllegalStateException("BuilderFromHapMap: SNP at Chromosome: ${myPositionList[myPositionList.size - 1].startChr} Position: ${myPositionList[myPositionList.size - 1].startPos} has too few values.")
}
swapSitesIfOutOfOrder(site)
} catch (e: Exception) {
myLogger.error("Error parsing this row $input")
myLogger.debug(e.message, e)
throw IllegalStateException("BuilderFromHapMap: Error Parsing Line: " + input.substring(0, Math.min(25, input.length)) + "...\n" + e.message)
}
}
myInputLines = null
return this
}
// Swap adjacent misordered sites, often caused by two sites at the same positions with a different name order
private fun swapSitesIfOutOfOrder(site: Int) {
if (site < 1) {
return
}
if (myPositionList[site - 1].compareTo(myPositionList[site]) > 0) {
//swap
val tempP = myPositionList[site - 1]
myLogger.warn("Swapping:" + tempP.toString() + " <-> " + myPositionList[site].toString())
myPositionList[site - 1] = myPositionList[site]
myPositionList[site] = tempP
for (t in 0 until genotypes!!.numRows) {
val tempG = genotypes!!.get(t, site - 1)
genotypes!!.set(t, site - 1, genotypes!!.get(t, site))
genotypes!!.set(t, site, tempG)
}
}
}
}
/**
* Set the builder so that when built it will sort positions.
*/
fun sortPositions(): BuilderFromHapMap {
mySortPositions = true
return this
}
companion object {
private val myLogger = LogManager.getLogger(BuilderFromHapMap::class.java)
private val WHITESPACE_PATTERN = Pattern.compile("\\t")
private val NUM_HAPMAP_NON_TAXA_HEADERS = 11
private val SNPID_INDEX = 0
private val VARIANT_INDEX = 1
private val CHROMOSOME_INDEX = 2
private val POSITION_INDEX = 3
private val NUM_VALUES_PROCESSED_TOGETHER = 7 shl 20
@JvmStatic
fun getBuilder(hapmapFile: String): BuilderFromHapMap {
return BuilderFromHapMap(hapmapFile, null)
}
@JvmStatic
fun getBuilder(hapmapFile: String, listener: ProgressListener): BuilderFromHapMap {
return BuilderFromHapMap(hapmapFile, listener)
}
internal fun processTaxa(readLn: String?, taxaAnnotation: Map>): TaxaListBuilder {
val header = WHITESPACE_PATTERN.split(readLn)
val numTaxa = header.size - NUM_HAPMAP_NON_TAXA_HEADERS
val tlb = TaxaListBuilder()
for (i in 0 until numTaxa) {
val taxonID = header[i + NUM_HAPMAP_NON_TAXA_HEADERS]
if (taxonID == null || taxonID.isEmpty()) {
throw IllegalStateException("BuilderFromHapMap: processTaxa: Taxa names should be separated by a single tab and contain no spaces.")
}
val at = Taxon.Builder(taxonID)
val taMap = taxaAnnotation[taxonID]
if (taMap != null) {
for ((key, value) in taMap.entries()) {
if (key == "ID") {
continue //skip the IDs as these became the name
}
val s = value.replace("\"", "")
at.addAnno(key, s)
}
}
tlb.add(at.build())
}
return tlb
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy