All downloads are free. Search and download functionality uses the official Maven repository.

biokotlin.kegg.KeggParsers.kt Maven / Gradle / Ivy

package biokotlin.kegg

import biokotlin.kegg.KeggDB.*
import org.jgrapht.graph.DefaultDirectedGraph
import org.jgrapht.graph.DefaultEdge
import org.w3c.dom.Document
import org.w3c.dom.Element
import org.w3c.dom.NodeList
import org.xml.sax.InputSource
import java.io.StringReader
import javax.print.DocFlavor
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory


/**
 * Parses a KEGG flat-file response into a map of KEGG labels to their text, with the
 * new lines retained in the value.  e.g. "ENTRY" -> "542318            CDS       T01088"
 *
 * The first 12 characters of a line hold the (optional) keyword; the remainder is data.
 * A line whose keyword column is blank continues the most recent keyword, and its data is
 * appended to that keyword's value after a "\n". Parsing stops at the first line starting
 * with "///".
 *
 * Lines shorter than the keyword column are accepted: a bare keyword (e.g. "REFERENCE")
 * is stored with an empty value, and blank lines are skipped.
 *
 * @param str raw text of one KEGG entry
 * @return map from keyword to its (possibly multi-line) value; `getValue` on a missing
 * keyword throws [NoSuchElementException], while `get`/`[]` returns null
 */
private fun parseKEGG(str: String): Map<String, String> {
    // Width of the fixed keyword column at the start of every line.
    val keyWidth = 12
    val keggKeyToValue = mutableMapOf<String, String>()
            //There might be other defaults that we want to include - i.e. use a when here
            .withDefault { throw NoSuchElementException("KEGG response is missing the $it field") }
    var lastKeyword = ""
    for (line in str.lines()) {
        if (line.startsWith("///")) break
        if (line.isBlank()) continue
        // take/drop never throw on short lines, unlike substring with fixed indices.
        val keyField = line.take(keyWidth)
        if (keyField.isNotBlank()) lastKeyword = keyField.trim()
        val data = line.drop(keyWidth).trim()
        keggKeyToValue.merge(lastKeyword, data) { old, new -> old + "\n" + new }
    }
    return keggKeyToValue
}

/** Matches a run of one or more whitespace characters; used to split KEGG fields into tokens. */
private val whiteSpace = Regex("\\s+")

/** Anchored at the start: lazily matches one or more characters and stops right before the next digit. */
private val prefixChars = Regex("^(.+?)(?=\\d)")

/**
 * Returns the leading characters of this string that precede its first following digit,
 * or "" when the pattern does not match (e.g. the string contains no digit).
 *
 * At least one character is always consumed, so an all-digit string yields its first digit.
 */
internal fun String.keggPrefix(): String {
    val match = prefixChars.find(this) ?: return ""
    return match.value
}

/** Starts at the first digit and grabs the uninterrupted run of digits that follows. */
private val suffixNum = Regex("(\\d+)")

/** Returns the first run of digits found in this string, or "00000" when it contains no digit. */
internal fun String.keggSuffix(): String {
    val match = suffixNum.find(this) ?: return "00000"
    return match.value
}

/**
 * Captures the text inside a "[GN:...]" tag: (?<=\[GN:) is a lookbehind for "[GN:", and
 * (?=]) is a lookahead that, combined with the greedy (.*), stops at the last "]".
 */
val orgCodeInGN: Regex = Regex("(?<=\\[GN:)(.*)(?=])")

/**
 * Captures the text inside an "[EC:...]" tag: (?<=\[EC:) is a lookbehind for "[EC:", and
 * (?=]) is a lookahead that, combined with the greedy (.*), stops at the last "]".
 */
val ecInBracket: Regex = Regex("(?<=\\[EC:)(.*)(?=])")

/**
 * Parses the KEGG response text of a gene entry into a [KeggGene].
 *
 * The ENTRY line is split on whitespace: its first token is used as the gene id and its
 * third token as the genome identifier, which is resolved to an organism code through
 * [KeggCache]. The NAME field supplies both the name and the definition of the result.
 * AASEQ / NTSEQ are optional and default to "".
 *
 * @throws IllegalStateException when ENTRY, ORTHOLOGY or POSITION is absent, when the genome
 * is unknown to [KeggCache], or when a sequence length disagrees with its declared length
 */
internal fun geneParser(keggResponseText: String): KeggGene {
    val fields = parseKEGG(keggResponseText)

    val entryTokens = checkNotNull(fields["ENTRY"]) { "Gene entry is missing" }.split(whiteSpace)
    val geneId = entryTokens[0]
    val genomeEntry = KeggEntry.of(genome.abbr, entryTokens[2])
    val organism = KeggCache.orgCode(genomeEntry) ?: error("Genome $genomeEntry not in org set")

    //TODO add pathway parsing
    //TODO KEGG changed things - with NAME, DEFINITION, and SYMBOL
    // https://www.kegg.jp/kegg/docs/dbentry.html
    val orthologyField = checkNotNull(fields["ORTHOLOGY"]) { "ORTHOLOGY missing" }
    val orthologyEntry = KeggEntry.of(orthology.abbr, orthologyField.split(whiteSpace)[0])
    val description = fields["NAME"].orEmpty() //used to be DEFINITION

    // Sequences are optional; when present they are validated against their declared length.
    val proteinSeq = fields["AASEQ"]?.let { cleanSeqWithLength(it) }.orEmpty()
    val nucleotideSeq = fields["NTSEQ"]?.let { cleanSeqWithLength(it) }.orEmpty()

    val info = KeggInfo.of(
            genes, KeggEntry.of(organism, geneId),
            name = description, org = organism, definition = description
    )
    val genomePosition = fields["POSITION"] ?: error("Position missing")
    return KeggGene(
            info, orthology = orthologyEntry, position = genomePosition,
            ntSeq = nucleotideSeq, aaSeq = proteinSeq
    )
}

/**
 * Joins a KEGG sequence field into a single string and verifies its length.
 *
 * The first line of [sizeSeq] must be the declared sequence length; all remaining lines
 * are concatenated without separators.
 *
 * @throws NumberFormatException when the first line is not an integer
 * @throws IllegalStateException when the concatenated sequence length differs from the declared one
 */
private fun cleanSeqWithLength(sizeSeq: String): String {
    val rows = sizeSeq.lines()
    val declaredLength = rows.first().toInt()
    val sequence = rows.subList(1, rows.size).joinToString(separator = "")
    check(sequence.length == declaredLength) { "Gene length does not agree with length" }
    return sequence
}

/**
 * Parses the KEGG response text of a pathway entry into a [KeggPathway].
 *
 * - The first whitespace-delimited token of ENTRY becomes the pathway id (database "path").
 * - The organism code is the lower-cased content of the "[GN:...]" tag in ORGANISM, or ""
 *   when the field or the tag is absent.
 * - NAME supplies both the name and the definition.
 * - GENE and COMPOUND contribute one entry per non-blank line, identified by the line's
 *   first token. Absent fields give empty lists.
 *
 * @throws IllegalStateException when ENTRY is absent
 */
internal fun pathwayParser(keggResponseText: String): KeggPathway {
    val attributes = parseKEGG(keggResponseText)
    val entryId = (attributes["ENTRY"] ?: error("KID not in ENTRY")).split(whiteSpace)[0]
    val kid = KeggEntry.of("path", entryId)
    val orgCode = attributes["ORGANISM"]?.let { orgCodeInGN.find(it)?.value?.lowercase() }.orEmpty()
    val nameAndDefinition = attributes["NAME"].orEmpty()

    // "".lines() is [""], so blank lines must be dropped or an absent field would
    // produce one bogus entry with an empty id.
    fun leadingTokens(field: String): List<String> = attributes[field].orEmpty().lines()
            .filter { it.isNotBlank() }
            .map { it.split(whiteSpace, 2)[0] }

    val geneEntries = leadingTokens("GENE").map { KeggEntry.of(orgCode, it) }
    val compoundEntries = leadingTokens("COMPOUND").map { KeggEntry.of(compound.abbr, it) }

    val ke = KeggInfo.of(pathway, kid, name = nameAndDefinition,
            org = orgCode, definition = nameAndDefinition)
    return KeggPathway(ke, geneEntries, compoundEntries)
}

/**
 * Parses the KEGG response text of an orthology (KO) entry into a [KeggOrtholog].
 *
 * - The first whitespace-delimited token of ENTRY becomes the id (database "ko").
 * - SYMBOL is used as the name and NAME as the definition.
 * - The EC number is the content of the "[EC:...]" tag in NAME, or "" when NAME carries
 *   no such tag.
 * - Each GENES line is expected to look like "ORG: gene1(sym) gene2 ..."; the organism
 *   code is lower-cased and any "(...)" suffix is dropped from the gene ids. Addendum
 *   lines ("AG: ...") are skipped.
 *
 * @throws IllegalStateException when ENTRY, SYMBOL or GENES is absent
 */
internal fun orthologyParser(keggResponseText: String): KeggOrtholog {
    val attributes = parseKEGG(keggResponseText)
    val kid = KeggEntry.of("ko", (attributes["ENTRY"] ?: error("ENTRY missing")).split(whiteSpace)[0])
    val name = attributes["SYMBOL"] ?: error("SYMBOL is missing")
    val definition = attributes["NAME"].orEmpty()
    // NAME need not contain an "[EC:...]" tag; fall back to "" instead of crashing.
    val ec = ecInBracket.find(definition)?.value.orEmpty()

    val genes = (attributes["GENES"] ?: error("GENES is missing")).lines()
            // AG are addendum genes with organisms. Match the full "AG:" label so that
            // organism codes that merely start with "AG" (e.g. "AGA:") are kept.
            .filterNot { it.startsWith("AG:") }
            //TODO filter by species or clade
            .associate { lineOfOrg ->
                val orgGenes = lineOfOrg.split(": ")
                val orgEntry = orgGenes[0].lowercase()
                val ke = orgGenes[1].split(" ")
                        .map { it.substringBefore("(") }
                        .map { KeggEntry.of(orgEntry, it) }
                orgEntry to ke
            }
    val ki = KeggInfoImpl(orthology, kid, name = name, definition = definition)
    return KeggOrtholog(ki, ec, genes)
}


// === KGML functions ===============================================

/**
 * Return graph from KEGG pathway using parsed KGML data.
 *
 * Fetches and parses the KGML for this pathway's database abbreviation and id via
 * [kgmlParser], then builds the graph with [kgmlGraphConstructor].
 *
 * NOTE(review): the vertex type is declared as `Any` to match the cast applied to the
 * vertex set in [kgmlGraphConstructor] — confirm against callers.
 * TODO - verify if this pathway extension is usable for deployment
 */
fun KeggPathway.kgmlGraph(): DefaultDirectedGraph<Any, DefaultEdge> {
    val entry = keggInfo.keggEntry
    return kgmlGraphConstructor(kgmlParser(entry.dbAbbrev, entry.kid))
}

/**
 * Parse KGML data into KGML data classes.
 *
 * Queries [KeggServer] for "$path:$kid/kgml", parses the XML and converts every `entry`,
 * `reaction` and `relation` element into a [KGMLEntry], [KGMLReaction] or [KGMLRelation].
 * The optional attributes `type`, `link` and `reaction` are stored as the literal string
 * "null" when absent.
 *
 * @param path database prefix used in the query
 * @param kid entry id used in the query
 * @return map with the keys "entries", "reactions" and "relationships", each holding the
 * list of parsed objects of that kind in document order
 * @throws IllegalStateException when the response is not parseable XML or an element lacks
 * a required attribute
 * @throws NumberFormatException when an id attribute is not an integer
 */
internal fun kgmlParser(path: String, kid: String): Map<String, List<Any>> {
    val rawXML = KeggServer.query(KeggOperations.get, "$path:$kid/kgml")//?: error("Not found in KEGG")
    // convertStringToXMLDocument reports parse failures by returning null.
    val doc = checkNotNull(convertStringToXMLDocument(rawXML)) {
        "KEGG response for $path:$kid could not be parsed as KGML"
    }

    // Get entries
    val parsedEntries = doc.getElementsByTagName("entry").kgmlNodes().map { node ->
        KGMLEntry().also { parsed ->
            parsed.id = node.kgmlAttribute("id").toInt()
            parsed.name = node.kgmlAttribute("name").split(" ")
            parsed.type = node.kgmlOptionalAttribute("type")
            parsed.link = node.kgmlOptionalAttribute("link")
            parsed.reaction = node.kgmlOptionalAttribute("reaction")
        }
    }

    // Get reactions - substrates and products
    val parsedReactions = doc.getElementsByTagName("reaction").kgmlNodes().map { node ->
        val element = node as Element
        KGMLReaction().also { parsed ->
            parsed.id = node.kgmlAttribute("id").toInt()
            parsed.name = node.kgmlAttribute("name")
            parsed.type = node.kgmlOptionalAttribute("type")
            parsed.substrate = element.getElementsByTagName("substrate").kgmlIdToName()
            parsed.product = element.getElementsByTagName("product").kgmlIdToName()
        }
    }

    // Get relations between entries
    val parsedRelations = doc.getElementsByTagName("relation").kgmlNodes().map { node ->
        KGMLRelation().also { parsed ->
            parsed.entry1 = node.kgmlAttribute("entry1").toInt()
            parsed.entry2 = node.kgmlAttribute("entry2").toInt()
            parsed.type = node.kgmlAttribute("type")
        }
    }

    return mapOf("entries" to parsedEntries, "reactions" to parsedReactions, "relationships" to parsedRelations)
}

/** Copies the nodes of this DOM list into a Kotlin list, preserving order. */
private fun NodeList.kgmlNodes(): List<org.w3c.dom.Node> = List(length) { item(it) }

/** Value of the required attribute [name]; fails with a descriptive error when it is absent. */
private fun org.w3c.dom.Node.kgmlAttribute(name: String): String =
        attributes.getNamedItem(name)?.nodeValue
                ?: error("KGML <$nodeName> element is missing required attribute '$name'")

/** Value of the optional attribute [name], or the literal string "null" when it is absent. */
private fun org.w3c.dom.Node.kgmlOptionalAttribute(name: String): String =
        attributes.getNamedItem(name)?.nodeValue ?: "null"

/** Maps the integer `id` attribute of every node in this list to its `name` attribute. */
private fun NodeList.kgmlIdToName(): MutableMap<Int, String> =
        kgmlNodes().associateTo(mutableMapOf()) { it.kgmlAttribute("id").toInt() to it.kgmlAttribute("name") }

/**
 * Construct graph from parsed KGML data using JGraphT libraries.
 *
 * Every entry becomes a vertex. Edges are added
 *  - from `entry1` to `entry2` for each relationship,
 *  - from each substrate to the entry sharing the reaction's id, and
 *  - from that entry to each product.
 *
 * NOTE(review): the vertex type is declared as `Any`, consistent with the cast applied to
 * the vertex set below — confirm against callers.
 *
 * @param parsedKGML map with the keys "entries", "reactions" and "relationships"
 * @throws IllegalStateException when an edge refers to an id that no entry carries
 */
@Suppress("UNCHECKED_CAST")
internal fun kgmlGraphConstructor(parsedKGML: Map<String, List<Any>>): DefaultDirectedGraph<Any, DefaultEdge> {
    val relationships = parsedKGML["relationships"] as List<KGMLRelation>
    val entries = parsedKGML["entries"] as List<KGMLEntry>
    val reactions = parsedKGML["reactions"] as List<KGMLReaction>

    val g = DefaultDirectedGraph<Any, DefaultEdge>(DefaultEdge::class.java)
    entries.forEach { g.addVertex(it) }

    // Index vertices by id once instead of scanning the vertex list for every edge.
    // Reversing first makes the earliest vertex win if two entries share an id.
    val vertexById = (g.vertexSet().toList() as List<KGMLEntry>)
            .asReversed()
            .associateBy { it.id }

    fun vertexOf(id: Int): KGMLEntry =
            vertexById[id] ?: error("KGML data references entry id $id, which is not among the parsed entries")

    // Add all edges for each entry ID from KGML relationship information
    for (relation in relationships) {
        g.addEdge(vertexOf(relation.entry1), vertexOf(relation.entry2))
    }

    // Add substrate and product edges
    //   * substrates go into gene list (sub) --> [gene1, gene2, ...]
    //   * products come out of gene list [gene1, gene2, ...] --> (product)
    // The reaction's own vertex is resolved only when an edge actually needs it.
    for (reaction in reactions) {
        for (substrateId in reaction.substrate.keys) {
            g.addEdge(vertexOf(substrateId), vertexOf(reaction.id))
        }
        for (productId in reaction.product.keys) {
            g.addEdge(vertexOf(reaction.id), vertexOf(productId))
        }
    }

    return g
}

/**
 * Parses [xmlString] into a DOM [Document].
 *
 * Any exception raised while creating the builder or parsing is printed to standard
 * error and reported to the caller as a null result.
 *
 * @return the parsed document, or null when parsing failed
 */
internal fun convertStringToXMLDocument(xmlString: String): Document? {
    // Factory for parsers that produce DOM object trees from XML content
    val domFactory = DocumentBuilderFactory.newInstance()
    return try {
        val domBuilder: DocumentBuilder = domFactory.newDocumentBuilder()
        domBuilder.parse(InputSource(StringReader(xmlString)))
    } catch (e: Exception) {
        e.printStackTrace()
        null
    }
}

/**
 * Get reaction components from graph object
 *
 * Unfinished stub: the body only reads the graph's vertex set into a local that is never
 * used, and the function returns nothing.
 *
 * NOTE(review): the receiver carries no type arguments and the gap after `fun` suggests
 * that type parameters were lost — confirm the intended signature before completing this.
 */
private fun  DefaultDirectedGraph.getReactions() {
    // Vertex set of the receiver graph; currently unused.
    val nodes = this.vertexSet()
//    nodes =
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy