org.oewntk.xml.in.XmlExtractor.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of fromxml Show documentation
Show all versions of fromxml Show documentation
Parse WordNet XML file into model.
The newest version!
/*
* Copyright (c) 2024. Bernard Bou.
*/
package org.oewntk.xml.`in`
import org.w3c.dom.Element
/**
* Extract information from attributes in XML files or retrieve it
*
* @author Bernard Bou
*/
object XmlExtractor {
/**
* Get sensekey from element
*
* @param senseElement sense element
* @return sensekey
*/
private fun getSenseKey(senseElement: Element): String {
val id = senseElement.getAttribute(XmlNames.ID_ATTR)
return toSensekey(id)
}
/**
* Get lexid from element
*
* @param senseElement sense element
* @return lexid
*/
fun getLexid(senseElement: Element): Int {
val id = senseElement.getAttribute(XmlNames.ID_ATTR)
val sk = id.substring("oewn-".length)
var b = sk.indexOf("__")
b += 2 + 5
val lexid = sk.substring(b, b + 2)
return lexid.toInt()
}
/**
* Get adj position from element
*
* @param senseElement sense element
* @return adj position
*/
fun getAdjPosition(senseElement: Element): String {
return senseElement.getAttribute(XmlNames.ADJPOSITION_ATTR)
}
/**
* Get verb frames from element
*
* @param senseElement sense element
* @return verb frame list of numeric ids
*/
fun getVerbFrames(senseElement: Element): String {
return senseElement.getAttribute(XmlNames.VERBFRAMES_ATTR)
}
/**
* Get rank of sense in synset
*
* @param senseElement sense element
* @param synsetsById synsets mapped by id, for resolution
* @return rank of this sense in synset
*/
fun getRank(senseElement: Element, synsetsById: Map): Int {
val lexElement = XmlUtils.getParentElement(senseElement)
val lexId = lexElement.getAttribute(XmlNames.ID_ATTR)
val synsetId = senseElement.getAttribute(XmlNames.SYNSET_ATTR)
val synsetElement = synsetsById[synsetId]
val membersAttr = synsetElement!!.getAttribute(XmlNames.MEMBERS_ATTR)
val members = membersAttr.split("\\s+".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()
val found = members
.withIndex()
.firstOrNull { lexId == it.value } ?: throw RuntimeException("[E] member attr not found $lexId")
return found.index
}
/**
* Get tag count of sense
*
* @param senseElement sense element
* @param tagCountsBySensekey tag counts mapped by sensekey
* @return tag count
*/
fun getTagCount(senseElement: Element, tagCountsBySensekey: Map): Int {
val sensekey = getSenseKey(senseElement)
return tagCountsBySensekey[sensekey] ?: 0
}
/**
* Get verb templates for this sense
*
* @param senseElement sense element
* @param templateIdsBySensekey template ids by sensekey
* @return verb template list string
*/
fun getVerbTemplates(senseElement: Element, templateIdsBySensekey: Map): String {
val sensekey = getSenseKey(senseElement)
return templateIdsBySensekey[sensekey]?.joinToString(" ") ?: ""
}
private const val PREFIX = "oewn-"
private const val PREFIX_LENGTH = PREFIX.length
/**
* Convert id to sensekey by unescaping some character sequences
* -ap- apostrophe in "L'Enfant"
* -sl- slash in "I/O device"
* -cm- comma in "Prince William, duke of Cumberland"
* -ex- exclamation mark in "Yahoo!"
* -cl- colon on "Capital: critique of political economy"
* -pl- plus sign in "LGBT+"
* -sp- space in "accustomed to"
* -lb- left bracket
* -rb- right bracket
*
* @param id XML id
* @return sensekey
*/
fun toSensekey(id: String): String {
val sk = if (id.startsWith(PREFIX)) id.substring(PREFIX_LENGTH) else id
val b = sk.indexOf("__")
val lemma = sk.substring(0, b)
.replace("-ap-", "'")
.replace("-lb-", "(")
.replace("-rb-", ")")
.replace("-sl-", "/")
.replace("-cm-", ",")
.replace("-ex-", "!")
.replace("-cl-", ":")
.replace("-pl-", "+")
.replace("-sp-", "_")
val tail = sk.substring(b + 2)
.replace(".", ":")
.replace("-ap-", "'")
.replace("-lb-", "(")
.replace("-rb-", ")")
.replace("-sl-", "/")
.replace("-cm-", ",")
.replace("-ex-", "!")
.replace("-cl-", ":")
.replace("-pl-", "+")
.replace("-sp-", "_")
return "$lemma%$tail"
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy