// org.clulab.reach.indexer.NxmlSearcher.scala (Maven / Gradle / Ivy)
// The newest version!
package org.clulab.reach.indexer
import java.io.{FileWriter, PrintWriter, File}
import java.nio.file.Paths
import org.clulab.processors.bionlp.BioNLPProcessor
import org.clulab.utils.StringUtils
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.core.WhitespaceAnalyzer
import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.Document
import org.apache.lucene.index.DirectoryReader
import org.apache.lucene.queryparser.classic.QueryParser
import org.apache.lucene.search.{TopScoreDocCollector, IndexSearcher}
import org.apache.lucene.store.FSDirectory
import org.slf4j.LoggerFactory
import scala.collection.mutable
import NxmlSearcher._
import scala.collection.mutable.ArrayBuffer
/**
 * Searches the NXML index created by NXML indexer
 *
 * Opens a Lucene `DirectoryReader` over the index stored at `indexDir` and offers
 * helpers to run queries, combine result sets, and dump the matching documents
 * (their stored "id" and "nxml" fields) to disk. Call [[close]] when done.
 *
 * Search results are represented as sets of (Lucene document number, score) pairs.
 *
 * User: mihais
 * Date: 10/19/15
 *
 * @param indexDir path of the directory holding the Lucene index
 */
class NxmlSearcher(val indexDir:String) {
  val reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
  val searcher = new IndexSearcher(reader)
  // NOTE(review): no method in this class uses proc; it is kept because it is a public member.
  val proc = new BioNLPProcessor(withChunks = false)

  /** Query matching documents that mention at least one biochemical interaction term. */
  private val biochemicalEventQuery = "phosphorylation phosphorylates ubiquitination ubiquitinates hydroxylation hydroxylates sumoylation sumoylates glycosylation glycosylates acetylation acetylates farnesylation farnesylates ribosylation ribosylates methylation methylates binding binds"

  /** Closes the underlying index reader. */
  def close(): Unit = reader.close()

  /** Opens a writer on `path`, runs `body` on it, and always closes the writer, even if `body` throws. */
  private def withWriter(path:String)(body: PrintWriter => Unit): Unit = {
    val writer = new PrintWriter(new FileWriter(path))
    try {
      body(writer)
    } finally {
      writer.close()
    }
  }

  /** Writes `nxml` to the file `<resultDir>/<id>.nxml`. */
  private def writeNxmlFile(resultDir:String, id:String, nxml:String): Unit = {
    withWriter(resultDir + File.separator + id + ".nxml") { os =>
      os.print(nxml)
    }
  }

  /**
   * Loads the stored documents for a set of search hits.
   *
   * @param ids (document number, score) pairs
   * @return the corresponding (document, score) pairs
   */
  def docs(ids:Set[(Int, Float)]):Set[(Document, Float)] = {
    ids.map { case (docId, score) => (searcher.doc(docId), score) }
  }

  /** Writes the "id" field of every document, one per line, to the relative path "ids.txt". */
  def saveIds(docs:Set[(Document, Float)]): Unit = {
    withWriter("ids.txt") { os =>
      docs.foreach { case (doc, _) => os.println(doc.get("id")) }
    }
  }

  /**
   * Saves documents to `resultDir`, best score first: one `<id>.nxml` file per document
   * plus a `scores.tsv` file with one "id TAB score" line per saved document.
   *
   * @param resultDir directory to write into
   * @param docs (document, score) pairs to save
   * @param howManyToSave if positive, only this many top-scoring documents are saved;
   *                      otherwise all of them are
   */
  def saveNxml(resultDir:String, docs:Set[(Document, Float)], howManyToSave:Int = 0): Unit = {
    val ranked = docs.toSeq.sortBy(-_._2)
    val docSeq = if (howManyToSave > 0) ranked.take(howManyToSave) else ranked
    withWriter(resultDir + File.separator + "scores.tsv") { sos =>
      docSeq.foreach { case (doc, score) =>
        val id = doc.get("id")
        writeNxmlFile(resultDir, id, doc.get("nxml"))
        sos.println(s"$id\t$score")
      }
    }
  }

  /**
   * Saves the documents behind a set of search hits to `resultDir`: one `<id>.nxml` file
   * per hit plus a `scores.tsv` file. Unlike [[saveNxml]], hits are written in the set's
   * iteration order and each document is loaded from the index just before it is written.
   *
   * @param resultDir directory to write into
   * @param docIds (document number, score) pairs to save
   */
  def saveDocs(resultDir:String, docIds:Set[(Int, Float)]): Unit = {
    withWriter(resultDir + File.separator + "scores.tsv") { sos =>
      docIds.foreach { case (docId, score) =>
        val doc = searcher.doc(docId)
        val id = doc.get("id")
        writeNxmlFile(resultDir, id, doc.get("nxml"))
        sos.println(s"$id\t$score")
      }
    }
    logger.info(s"Saved ${docIds.size} documents.")
  }

  /** Runs `query` against the "text" field using a `StandardAnalyzer`. */
  def search(query:String, totalHits:Int = TOTAL_HITS):Set[(Int, Float)] = {
    searchByField(query, "text", new StandardAnalyzer(), totalHits)
  }

  /** Runs `id` as a query against the "id" field using a `WhitespaceAnalyzer`. */
  def searchId(id:String, totalHits:Int = 1):Set[(Int, Float)] = {
    searchByField(id, "id", new WhitespaceAnalyzer(), totalHits)
  }

  /**
   * Parses `query` with the classic Lucene query parser and collects the top hits.
   *
   * @param query query string, in the classic query parser syntax
   * @param field default field for terms that do not name one
   * @param analyzer analyzer used to parse the query
   * @param totalHits maximum number of hits to collect
   * @param verbose if true, logs the number of results at debug level
   * @return (document number, score) pairs for the collected hits
   */
  def searchByField(query:String,
                    field:String,
                    analyzer:Analyzer,
                    totalHits:Int = TOTAL_HITS,
                    verbose:Boolean = true):Set[(Int, Float)] = {
    val q = new QueryParser(field, analyzer).parse(query)
    val collector = TopScoreDocCollector.create(totalHits)
    searcher.search(q, collector)
    val results: Set[(Int, Float)] =
      collector.topDocs().scoreDocs.map(hit => (hit.doc, hit.score)).toSet
    if(verbose) logger.debug(s"""Found ${results.size} results for query "$query"""")
    results
  }

  /**
   * Keeps the documents present in both sets; the score of a kept document is the sum
   * of its scores in `s1` and `s2`.
   */
  def intersection(s1:Set[(Int, Float)], s2:Set[(Int, Float)]):Set[(Int, Float)] = {
    // Index s2 by document number once instead of rescanning it for every element of s1.
    // If s2 holds a document number more than once, the first one encountered wins.
    val otherScores = new mutable.HashMap[Int, Float]()
    s2.foreach { case (docId, score) =>
      if(! otherScores.contains(docId)) otherScores(docId) = score
    }
    s1.collect {
      case (docId, score) if otherScores.contains(docId) => (docId, score + otherScores(docId))
    }
  }

  /** Returns every element that appears in `s1` or `s2`. */
  def union(s1:Set[Int], s2:Set[Int]):Set[Int] = s1 ++ s2

  /**
   * Counts the documents of `eventDocs` that also match the query `Ras AND "<token>"`.
   * Note that the "Ras" constraint is hard-coded into the query.
   */
  def countDocsContaining(eventDocs:Set[(Int, Float)], token:String):Int = {
    // token could be a phrase; make sure quotes are used
    val query = s"""Ras AND "$token""""
    intersection(eventDocs, search(query)).size
  }

  /**
   * Intersects the biochemical-event documents with the documents matching a Ras-related
   * topic query, saves the result (NXML files, scores.tsv, ids.txt), and writes a
   * histogram of how many result documents mention each topic to `<resultDir>/histo.txt`.
   */
  def useCase(resultDir:String): Unit = {
    val eventDocs = search(biochemicalEventQuery)
    val result = intersection(eventDocs, search("""Ras AND (ROS OR "antioxidant response element" OR Warburg OR MAPK OR "Raf/Mek/Erk" OR Akt OR NfkB OR TGFb OR TGFbeta OR TGFb1 OR TGFbeta1 OR integrins OR ADAM OR EGF OR EGFR OR RTK OR apoptosis OR autophagy OR proliferation OR "transcription factors" OR ATM OR p53 OR RB OR "tumor suppressors" OR glycolysis OR "pentose phosphate pathway" OR OXPHOS OR mitochondria OR "cell cycle" OR "energy balance" OR exosomes OR RAGE OR HMGB1)"""))
    logger.debug(s"The result contains ${result.size} documents.")
    val resultDocs = docs(result)
    saveNxml(resultDir, resultDocs, 0)
    saveIds(resultDocs)

    //
    // histogram of term distribution in docs
    //
    logger.debug("Generating topic histogram...")
    // Each topic is listed exactly once so that it yields exactly one histogram row.
    val histoPoints = Seq(
      "ROS",
      "antioxidant response element",
      "Warburg",
      "MAPK",
      "Raf/Mek/Erk",
      "Akt",
      "NfkB",
      "TGFb",
      "TGFbeta",
      "TGFb1",
      "TGFbeta1",
      "integrins",
      "ADAM",
      "EGF",
      "EGFR",
      "RTK",
      "apoptosis",
      "autophagy",
      "proliferation",
      "transcription factors",
      "ATM",
      "p53",
      "RB",
      "tumor suppressors",
      "glycolysis",
      "pentose phosphate pathway",
      "exosomes",
      "OXPHOS",
      "mitochondria",
      "cell cycle",
      "energy balance",
      "RAGE",
      "HMGB1")
    val histoValues = histoPoints.map(point => (point, countDocsContaining(result, point)))
    withWriter(resultDir + File.separator + "histo.txt") { histoFile =>
      // most frequent topic first
      histoValues.sortBy(-_._2).foreach { case (point, count) =>
        histoFile.println(s"$point\t$count")
      }
    }
    logger.debug("Done.")
  }

  /** Finds all NXML that contain at least one biochemical interaction */
  def useCase2(resultDir:String): Unit = {
    val eventDocs = search(biochemicalEventQuery)
    logger.debug(s"The result contains ${eventDocs.size} documents.")
    saveDocs(resultDir, eventDocs)
    logger.debug("Done.")
  }

  /** Saves the documents matching a query about children, nutrition and inflammation. */
  def useCase3(resultDir:String): Unit = {
    val eventDocs = search("children AND ((TNFAlpha AND nutrition) OR (inflammation AND stunting) OR (kcal AND inflammation) OR (protein AND inflammation) OR (nutrition AND inflammation))")
    logger.debug(s"The result contains ${eventDocs.size} documents.")
    saveDocs(resultDir, eventDocs)
    logger.debug("Done.")
  }

  /** Saves the documents matching a query about chronic inflammation and related processes. */
  def useCaseTB(resultDir:String): Unit = {
    val eventDocs = search(""" "chronic inflammation" AND ("tissue damage" OR "tissue repair" OR "wound healing" OR "angiogenesis" OR "fibrosis" OR "resolvin" OR "eicosanoid" OR "tumor-infiltrating lymphocyte" OR "lymphoid aggregate" OR "granuloma" OR "microbiome" OR "short-chain fatty acid") """)
    logger.info(s"The result contains ${eventDocs.size} documents.")
    saveDocs(resultDir, eventDocs)
    logger.info("Done.")
  }

  // Natasa's use case, first query
  def useCase4a(resultDir:String): Unit = {
    val eventDocs = search("""(TGFbeta1 OR "Transforming Growth Factor beta 1") AND (BMP OR "Bone Morphogenetic Protein")""")
    logger.debug(s"The result contains ${eventDocs.size} documents.")
    saveDocs(resultDir, eventDocs)
    logger.debug("Done.")
  }

  // Natasa's use case, second query
  def useCase4b(resultDir:String): Unit = {
    val query = """(TGFbeta1 OR "Transforming Growth Factor beta 1") AND pancreas"""
    val eventDocs = search(query)
    logger.info(s"The result contains ${eventDocs.size} documents for query [$query]")
    saveDocs(resultDir, eventDocs)
    logger.info("Done.")
  }

  // Natasa's use case, third query
  def useCase4c(resultDir:String): Unit = {
    val query = """(BMP OR "Bone Morphogenetic Protein") AND pancreas"""
    val eventDocs = search(query)
    logger.info(s"The result contains ${eventDocs.size} documents for query [$query]")
    saveDocs(resultDir, eventDocs)
    logger.info("Done.")
  }

  // Natasa's use case, fourth query
  def useCase4d(resultDir:String): Unit = {
    val query = """(TGFbeta1 OR "Transforming Growth Factor beta 1") AND (BMP OR "Bone Morphogenetic Protein") AND pancreas"""
    val eventDocs = search(query)
    logger.info(s"The result contains ${eventDocs.size} documents for query [$query]")
    saveDocs(resultDir, eventDocs)
    logger.info("Done.")
  }

  /**
   * Looks up each id in the "id" field and saves the documents found (NXML files and
   * scores.tsv in `resultDir`, plus ids.txt). Ids with no match, or with more than one
   * match, are logged and skipped.
   *
   * @param ids document ids to look up
   * @param resultDir directory to write the documents into
   */
  def searchByIds(ids:Array[String], resultDir:String): Unit = {
    val result = new mutable.HashSet[(Int, Float)]()
    logger.info(s"Searching for ${ids.length} ids: ${ids.mkString(", ")}")
    for(id <- ids) {
      // Ask for up to two hits: with a limit of one, a duplicated id could never be detected.
      val hits = searchId(id, 2)
      if(hits.isEmpty) {
        logger.info(s"Found 0 results for id $id!")
      } else if(hits.size > 1) {
        logger.info(s"Found ${hits.size} for id $id, which should not happen!")
      } else {
        result ++= hits
      }
    }
    logger.info(s"Found ${result.size} documents for ${ids.length} ids.")
    val resultDocs = docs(result.toSet)
    saveNxml(resultDir, resultDocs)
    saveIds(resultDocs)
  }
}
/** Shared logger and defaults for [[NxmlSearcher]], plus the command-line entry point. */
object NxmlSearcher {
  val logger = LoggerFactory.getLogger(classOf[NxmlSearcher])
  /** Default maximum number of hits collected per query. */
  val TOTAL_HITS = 500000

  /**
   * Command-line entry point. Reads the properties "index" (index directory) and
   * "output" (result directory) from the arguments. If the property "ids" is present,
   * it names a file of document ids to look up and save; otherwise `useCase2` is run.
   */
  def main(args:Array[String]): Unit = {
    val props = StringUtils.argsToProperties(args)
    val indexDir = props.getProperty("index")
    val resultDir = props.getProperty("output")
    // Fail early with a clear message instead of an NPE or files written under "null/".
    require(indexDir != null, "Missing required property: index")
    require(resultDir != null, "Missing required property: output")

    val searcher = new NxmlSearcher(indexDir)
    try {
      if(props.containsKey("ids")) {
        val ids = readIds(props.getProperty("ids"))
        searcher.searchByIds(ids, resultDir)
      } else {
        searcher.useCase2(resultDir)
      }
    } finally {
      // release the index reader even if the search or the saving fails
      searcher.close()
    }
  }

  /**
   * Reads document ids from a file, one per line. Each line is trimmed, blank lines
   * are skipped, and "PMC" is prepended to ids that do not already start with it.
   *
   * @param fn path of the file to read
   * @return the ids, in file order
   */
  def readIds(fn:String):Array[String] = {
    val source = scala.io.Source.fromFile(fn)
    try {
      source.getLines()
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(l => if(l.startsWith("PMC")) l else "PMC" + l)
        .toArray // materialize before the source is closed
    } finally {
      source.close()
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy