org.clulab.geonorm.GeoNamesIndex.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of geonorm_2.12 Show documentation
Geographical name normalization (a.k.a. toponym resolution)
The newest version!
package org.clulab.geonorm

import java.nio.file.{Files, Path, Paths}
import java.util.regex.Pattern
import java.util.zip.ZipFile

import scala.collection.JavaConverters._
import org.apache.lucene.analysis.{Analyzer, LowerCaseFilter}
import org.apache.lucene.analysis.core.{KeywordAnalyzer, KeywordTokenizer}
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
import org.apache.lucene.analysis.ngram.NGramTokenFilter
import org.apache.lucene.analysis.pattern.PatternReplaceFilter
import org.apache.lucene.analysis.standard.StandardTokenizer
import org.apache.lucene.document.{Document, Field, StoredField, StringField, TextField}
import org.apache.lucene.index.{DirectoryReader, IndexWriter, IndexWriterConfig, Term}
import org.apache.lucene.queryparser.classic.QueryParser
import org.apache.lucene.search.{IndexSearcher, Query, TermQuery}
import org.apache.lucene.search.grouping.GroupingSearch
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException


class GeoNamesEntry(document: Document) {
  lazy val id: String = document.get("id")
  lazy val name: String = document.get("canonical-name")
  lazy val featureCode: String = document.get("feature-code")
  lazy val population: Long = document.get("population").toLong
  override def toString: String = s"${this.getClass.getSimpleName}($document)"
}


object GeoNamesIndexConfig {
  val idField: String => Field = new StoredField("id", _)
  val canonicalNameField: String => Field = new StoredField("canonical-name", _)
  val nameField: String => Field = new TextField("name", _, Field.Store.NO)
  val ngramsField: String => Field = new TextField("ngrams", _, Field.Store.NO)
  val latitudeField: Float => Field = new StoredField("latitude", _)
  val longitudeField: Float => Field = new StoredField("longitude", _)
  val featureCodeField: String => Field = new StoredField("feature-code", _)
  val populationField: Long => Field = new StoredField("population", _)
  val idEndField: Field = new StringField("idEnd", "x", Field.Store.NO)

  val idEndQuery: Query = new TermQuery(new Term("idEnd", "x"))
  val nameAnalyzer: Analyzer = new Analyzer {
    override def createComponents(fieldName: String): Analyzer.TokenStreamComponents = {
      val tokenizer = new KeywordTokenizer
      val filter = new PatternReplaceFilter(new LowerCaseFilter(tokenizer), Pattern.compile("\\W+"), "", true)
      new Analyzer.TokenStreamComponents(tokenizer, filter)
    }
  }
  val ngramAnalyzer: Analyzer = new Analyzer {
    override def createComponents(fieldName: String): Analyzer.TokenStreamComponents = {
      val tokenizer = new StandardTokenizer
      val filter = new NGramTokenFilter(new LowerCaseFilter(tokenizer), 3, 3)
      new Analyzer.TokenStreamComponents(tokenizer, filter)
    }
  }
  val analyzer: Analyzer = new PerFieldAnalyzerWrapper(
    new KeywordAnalyzer, Map("name" -> nameAnalyzer, "ngrams" -> ngramAnalyzer).asJava)
}


object GeoNamesIndex {
  def main(args: Array[String]): Unit = args match {
    case Array("index", indexPath, geoNamesPath) =>
      GeoNamesIndex.fromGeoNamesTxt(Paths.get(indexPath), Paths.get(geoNamesPath))
    case Array("search", indexPath, queryStrings @ _*) =>
      val index = new GeoNamesIndex(Paths.get(indexPath))
      for (queryString <- queryStrings) {
        println(queryString)
        for ((entry, score) <- index.search(queryString, 20)) {
          println(f"$score%.3f ${entry.id} ${entry.name} ${entry.featureCode} ${entry.population}")
        }
      }
      index.close()
  }

  def fromClasspathJar(indexPath: Path, resourceName: String = "/org/clulab/geonames/index/"): GeoNamesIndex = {
    if (Files.exists(indexPath) && Files.list(indexPath).count() > 0) {
      throw new IllegalArgumentException(s"Cannot create index: $indexPath is not empty")
    } else {
      // find the .jar file containing the GeoNames index
      val url = this.getClass.getResource(resourceName)
      val jarFileURL = url.openConnection().asInstanceOf[java.net.JarURLConnection].getJarFileURL

      // open the .jar file as a .zip file
      val jarFile = new java.io.File(jarFileURL.toURI)
      val zipFile = new ZipFile(jarFile)

      // find all .zip file entries that are part of the index
      try {
        val prefix = resourceName.drop(1) // no leading '/' in zip files
        for (entry <- zipFile.entries.asScala) {
          if (entry.getName.startsWith(prefix)) {
            val path = indexPath.resolve(entry.getName.drop(prefix.length))

            // write the file or directory to the index cache directory
            if (entry.isDirectory) {
              Files.createDirectories(path)
            } else {
              Files.createDirectories(path.getParent)
              Files.copy(zipFile.getInputStream(entry), path)
            }
          }
        }
      } finally {
        zipFile.close()
      }
    }
    new GeoNamesIndex(indexPath)
  }

  def fromGeoNamesTxt(indexPath: Path, geoNamesPath: Path): GeoNamesIndex = {
    // create an index writer
    val dir = FSDirectory.open(indexPath)
    val config = new IndexWriterConfig(GeoNamesIndexConfig.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    val writer = new IndexWriter(dir, config)

    // walk through each line of the GeoNames file
    for (line <- Files.lines(geoNamesPath).iterator.asScala) {
      val Array(geoNameID, canonicalName, asciiName, alternateNames, latitude, longitude,
      _, featureCode, _, _, _, _, _, _, population, _, _, _, _) = line.split("\t")

      // generate a document for each name of this ID
      val docs = for (name <- Array(canonicalName, asciiName) ++ alternateNames.split(",")) yield {
        val doc = new Document
        doc.add(GeoNamesIndexConfig.idField(geoNameID))
        doc.add(GeoNamesIndexConfig.canonicalNameField(canonicalName))
        doc.add(GeoNamesIndexConfig.nameField(name))
        doc.add(GeoNamesIndexConfig.ngramsField(name))
        doc.add(GeoNamesIndexConfig.latitudeField(latitude.toFloat))
        doc.add(GeoNamesIndexConfig.longitudeField(longitude.toFloat))
        doc.add(GeoNamesIndexConfig.featureCodeField(featureCode))
        doc.add(GeoNamesIndexConfig.populationField(population.toLong))
        doc
      }

      // mark the last document (name) for the ID
      docs.last.add(GeoNamesIndexConfig.idEndField)

      // write all documents (names) for this ID in a block
      val docsList: java.util.List[Document] = java.util.Arrays.asList(docs: _*)
      writer.addDocuments(docsList)
    }
    writer.close()

    new GeoNamesIndex(indexPath)
  }
}

class GeoNamesIndex(indexPath: Path,
                    maxExactHits: Int = 1000,
                    maxFuzzyHits: Int = 5,
                    maxNGramHits: Int = 5) {
  private val reader = DirectoryReader.open(FSDirectory.open(indexPath))
  private val searcher = new IndexSearcher(reader)
  private val groupingSearch = new GroupingSearch(GeoNamesIndexConfig.idEndQuery)

  def search(queryString: String,
             maxExactHits: Int = this.maxExactHits,
             maxFuzzyHits: Int = this.maxFuzzyHits,
             maxNGramHits: Int = this.maxNGramHits): Array[(GeoNamesEntry, Float)] = {
    // create these locally, since they are not thread-safe
    val nameQueryParser = new QueryParser("name", GeoNamesIndexConfig.analyzer)
    val ngramsQueryParser = new QueryParser("ngrams", GeoNamesIndexConfig.analyzer)

    // escape special characters for queries to "name" field
    val luceneSpecialCharacters = """([-+&|!(){}\[\]^"~*?:\\/]|\bAND\b|\bOR\b|\bNOT\b)"""
    val escapedQueryString = queryString.replaceAll(luceneSpecialCharacters, """\\$1""")
    val whitespaceEscapedQueryString = escapedQueryString.replaceAll("""\s""", """\\ """)

    // first look for an exact match of the input phrase (the "name" field ignores spaces, punctuation, etc.)
    var results = scoredEntries(nameQueryParser.parse(whitespaceEscapedQueryString), maxExactHits)

    // if there's no exact match, search for fuzzy (1-2 edit-distance) matches
    if (results.isEmpty && maxFuzzyHits > 0) {
      try {
        results = scoredEntries(nameQueryParser.parse(whitespaceEscapedQueryString + "~"), maxFuzzyHits)
      } catch {
        case _: TooComplexToDeterminizeException =>
          // continue to n-gram search if the query is too complex for fuzzy search
      }
    }
    // if there's no fuzzy match, search for n-gram matches
    if (results.isEmpty && maxNGramHits > 0) {
      results = scoredEntries(ngramsQueryParser.parse(escapedQueryString), maxNGramHits)
    }
    // sort first by retrieval score, then by population, then by feature code (e.g., ADM1 before ADM3 and PPL)
    results.sortBy{
      case (entry, score) => (-score, -math.log10(entry.population + 1).round, entry.featureCode)
    }
  }

  def scoredEntries(query: Query, maxHits: Int): Array[(GeoNamesEntry, Float)] = {
    // perform a group-based search, where each group represents all names for a GeoNames ID
    val topGroups = groupingSearch.search[String](searcher, query, 0, maxHits)

    // for each of the hits, return an object representing the GeoNames entry, and the retrieval score
    if (topGroups == null) {
      Array.empty
    } else {
      for (group <- topGroups.groups) yield {
        val headDoc = searcher.doc(group.scoreDocs.head.doc)
        (new GeoNamesEntry(headDoc), group.maxScore)
      }
    }
  }

  def close(): Unit = {
    reader.close()
  }
}