// twofishes-indexer-geonames: Foursquare's coarse, splitting geocoder and reverse geocoder in Scala.
// Copyright 2012 Foursquare Labs Inc. All Rights Reserved.
package io.fsq.twofishes.indexer.importers.geonames
import akka.actor.{ActorSystem, Props}
import com.twitter.ostrich.admin.{AdminServiceFactory, RuntimeEnvironment}
import com.twitter.ostrich.stats.Stats
import com.vividsolutions.jts.geom.Geometry
import com.vividsolutions.jts.io.{WKBWriter, WKTReader}
import io.fsq.common.scala.Identity._
import io.fsq.common.scala.Lists.Implicits._
import io.fsq.geo.quadtree.CountryRevGeo
import io.fsq.twofishes.gen._
import io.fsq.twofishes.indexer.mongo.{
GeocodeRecordIndexes,
IndexerQueryExecutor,
NameIndex,
RevGeoIndex,
RogueMongoGeocodeStorageService
}
import io.fsq.twofishes.indexer.output._
import io.fsq.twofishes.indexer.util.{BoundingBox, DisplayName, GeocodeRecord, Point, SlugEntry}
import io.fsq.twofishes.indexer.util.FsqSimpleFeatureImplicits._
import io.fsq.twofishes.indexer.util.ShapefileIterator
import io.fsq.twofishes.model.gen.{
ThriftGeocodeRecord,
ThriftNameIndex,
ThriftPolygonIndex,
ThriftRevGeoIndex,
ThriftS2CoveringIndex,
ThriftS2InteriorIndex
}
import io.fsq.twofishes.util.{
DurationUtils,
GeoTools,
GeonamesId,
GeonamesNamespace,
Helpers,
NameNormalizer,
StoredFeatureId
}
import io.fsq.twofishes.util.Helpers._
import java.io.File
import java.util.concurrent.CountDownLatch
import org.bson.types.ObjectId
import org.json4s.NoTypeHints
import org.json4s.jackson.{JsonMethods, Serialization}
import org.opengis.feature.simple.SimpleFeature
import org.slf4s.Logging
import scala.collection.JavaConverters._
import scala.collection.mutable.{HashMap, HashSet}
import scala.io.Source
import scala.util.matching.Regex
object GeonamesParser extends DurationUtils {
var config: GeonamesImporterConfig = null
var countryLangMap = new HashMap[String, List[String]]()
var countryNameMap = new HashMap[String, String]()
var adminIdMap = new HashMap[String, String]()
val runtime = new RuntimeEnvironment(this)
val admin = AdminServiceFactory(httpPort = 7655)
.apply(runtime)
lazy val naturalEarthPopulatedPlacesMap: Map[StoredFeatureId, SimpleFeature] = {
new ShapefileIterator("src/jvm/io/fsq/twofishes/indexer/data/downloaded/ne_10m_populated_places_simple.shp")
.flatMap(f => {
f.propMap
.get("geonameid")
.map(id => {
(GeonamesId(id.toDouble.toLong) -> f)
})
})
.toMap
}
def parseCountryInfo() {
val fileSource =
scala.io.Source.fromFile(new File("src/jvm/io/fsq/twofishes/indexer/data/downloaded/countryInfo.txt"))
val lines = fileSource.getLines.filterNot(_.startsWith("#"))
lines.foreach(l => {
val parts = l.split("\t")
val cc = parts(0)
val englishName = parts(4)
val langs = parts(15).split(",").map(l => l.split("-")(0)).toList
val geonameid = parts(16)
countryLangMap += (cc -> langs)
countryNameMap += (cc -> englishName)
adminIdMap += (cc -> geonameid)
})
}
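  // Illustrative countryInfo.txt line (tab-separated, following the standard
  // GeoNames dump layout; values hypothetical but representative, with columns
  // 5-14 elided):
  //   US  USA  840  US  United States  ...  en-US,es-US,haw,fr  6252001
  // parseCountryInfo() would record:
  //   countryLangMap += ("US" -> List("en", "es", "haw", "fr"))  // lang tags trimmed at "-"
  //   countryNameMap += ("US" -> "United States")
  //   adminIdMap     += ("US" -> "6252001")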
def parseAdminInfoFile(filename: String) {
// adm1, name, name, geonameid
val fileSource = scala.io.Source.fromFile(new File(filename))
val lines = fileSource.getLines.filterNot(_.startsWith("#"))
lines.foreach(l => {
val parts = l.split("\t")
val admCode = parts(0)
val geonameid = parts(3)
adminIdMap += (admCode -> geonameid)
})
}
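  // Illustrative adminCodes line (same shape as GeoNames admin1CodesASCII.txt;
  // the geonameid shown is hypothetical):
  //   US.CA  California  California  5332921
  // which yields adminIdMap += ("US.CA" -> "5332921").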
val store = new RogueMongoGeocodeStorageService()
lazy val slugIndexer = new SlugIndexer()
def main(args: Array[String]) {
config = GeonamesImporterConfigParser.parse(args)
val parser = new GeonamesParser(store, slugIndexer)
    // sanity-check country reverse geocoding before kicking off the import
    try {
      CountryRevGeo.getNearestCountryCode(40.74, -74)
    } catch {
      case e: Exception => {
        println("caught exception in country revgeo warmup: " + e)
        System.exit(1)
      }
    }
if (config.reloadData) {
IndexerQueryExecutor.dropCollection(ThriftGeocodeRecord)
IndexerQueryExecutor.dropCollection(ThriftNameIndex)
IndexerQueryExecutor.dropCollection(ThriftPolygonIndex)
IndexerQueryExecutor.dropCollection(ThriftRevGeoIndex)
IndexerQueryExecutor.dropCollection(ThriftS2CoveringIndex)
IndexerQueryExecutor.dropCollection(ThriftS2InteriorIndex)
parser.loadIntoMongo()
writeIndexes(parser.s2CoveringLatch)
} else {
writeIndexes(None)
}
implicit val formats = Serialization.formats(NoTypeHints)
val prettyJsonStats = Serialization.writePretty(JsonMethods.parse(Stats.get().toJson))
log.info(prettyJsonStats)
log.info("all done with parse, trying to shutdown admin server and exit")
admin.shutdown()
System.exit(0)
}
def makeFinalIndexes() {
logPhase("making indexes before generating output") {
RevGeoIndex.makeIndexes(store.executor)
}
}
def writeIndex(args: Array[String]) {
config = GeonamesImporterConfigParser.parse(args)
writeIndexes(None)
}
// TODO: if we aren't redoing mongo indexing
// then add some code to see if the s2 index is 'done'
// We should also add an option to skip reloading polys
def writeIndexes(s2CoveringLatch: Option[CountDownLatch]) {
makeFinalIndexes()
val outputter = new OutputIndexes(
config.hfileBasePath,
config.outputPrefixIndex,
GeonamesParser.slugIndexer.slugEntryMap,
config.outputRevgeo,
config.outputS2Covering,
config.outputS2Interior
)
outputter.buildIndexes(s2CoveringLatch)
}
}
// a single shortening rule: regex to match, replacement, and FeatureNameFlags mask
// (keyed by country code in GeonamesParser.shortensList)
case class ShortenInfo(from: Regex, to: String, flags: Int)
import io.fsq.twofishes.indexer.importers.geonames.GeonamesParser._
class GeonamesParser(
store: RogueMongoGeocodeStorageService,
slugIndexer: SlugIndexer
) extends Logging {
lazy val polygonLoader = new PolygonLoader(this, store, config)
lazy val hierarchyTable = HierarchyParser.parseHierarchy(
List(
"src/jvm/io/fsq/twofishes/indexer/data/downloaded/hierarchy.txt",
"src/jvm/io/fsq/twofishes/indexer/data/private/hierarchy.txt",
"src/jvm/io/fsq/twofishes/indexer/data/custom/hierarchy.txt"
)
)
// token -> alt tokens
lazy val rewriteTable = new TsvHelperFileParser(
"src/jvm/io/fsq/twofishes/indexer/data/custom/rewrites.txt",
"src/jvm/io/fsq/twofishes/indexer/data/private/rewrites.txt"
).gidMap.map({
case (from, toList) => {
(from.r, toList)
}
})
lazy val shortensList: Map[String, List[ShortenInfo]] = {
scala.io.Source
.fromFile(new File("src/jvm/io/fsq/twofishes/indexer/data/custom/shortens.txt"))
.getLines
.toList
.filterNot(_.startsWith("#"))
.flatMap(l => {
val parts = l.split("[\\|\t]").toList
val countries = parts(0).split(",").toList
val shortenParts = parts.drop(1)
val toShortenFrom = shortenParts(0) + "\\b"
val toShortenTo = shortenParts.lift(1).getOrElse("")
val shortenFlags = parseFeatureNameFlags(shortenParts.lift(2))
countries.map(cc => (cc -> ShortenInfo(toShortenFrom.r, toShortenTo, shortenFlags)))
})
.groupBy(_._1)
.mappedValues(_.map(_._2))
.toList
.toMap
}
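  // Illustrative shortens.txt line (the file is custom data, so this entry is
  // hypothetical). Fields split on "|" or tab:
  //   US,CA|Street|St|SHORT_NAME
  // parses to ShortenInfo("Street\\b".r, "St", <SHORT_NAME mask>) registered under
  // both "US" and "CA"; a "*" country code applies everywhere (see doShorten).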
// geonameid -> boost value
lazy val boostTable = new GeoIdTsvHelperFileParser(
GeonamesNamespace,
"src/jvm/io/fsq/twofishes/indexer/data/custom/boosts.txt",
"src/jvm/io/fsq/twofishes/indexer/data/private/boosts.txt"
)
lazy val deletesList: List[String] = scala.io.Source
.fromFile(new File("src/jvm/io/fsq/twofishes/indexer/data/custom/deletes.txt"))
.getLines
.toList
.filterNot(_.startsWith("#"))
// geonameid --> new center
lazy val moveTable =
new GeoIdTsvHelperFileParser(GeonamesNamespace, "src/jvm/io/fsq/twofishes/indexer/data/custom/moves.txt")
// geonameid -> name to be deleted
lazy val nameDeleteTable =
new GeoIdTsvHelperFileParser(GeonamesNamespace, "src/jvm/io/fsq/twofishes/indexer/data/custom/name-deletes.txt")
// list of geoids (geonameid:XXX) to skip indexing
lazy val ignoreList: List[StoredFeatureId] = scala.io.Source
.fromFile(new File("src/jvm/io/fsq/twofishes/indexer/data/custom/ignores.txt"))
.getLines
.toList
.filterNot(_.startsWith("#"))
.map(l => GeonamesId(l.toLong))
// extra parents
lazy val extraRelationsList =
new GeoIdTsvHelperFileParser(GeonamesNamespace, "src/jvm/io/fsq/twofishes/indexer/data/custom/extra-relations.txt")
lazy val concordanceMap = new GeoIdTsvHelperFileParser(
GeonamesNamespace,
"src/jvm/io/fsq/twofishes/indexer/data/computed/concordances.txt",
"src/jvm/io/fsq/twofishes/indexer/data/private/concordances.txt"
)
val bboxDirs = List(
new File("src/jvm/io/fsq/twofishes/indexer/data/computed/bboxes/"),
new File("src/jvm/io/fsq/twofishes/indexer/data/private/bboxes/")
)
val bboxFiles = bboxDirs
.flatMap(bboxDir => {
if (bboxDir.exists) {
bboxDir.listFiles.toList
} else {
Nil
}
})
.sorted
lazy val bboxTable = BoundingBoxTsvImporter.parse(bboxFiles)
val displayBboxDirs = List(
new File("src/jvm/io/fsq/twofishes/indexer/data/computed/display_bboxes/"),
new File("src/jvm/io/fsq/twofishes/indexer/data/private/display_bboxes/")
)
val displayBboxFiles = displayBboxDirs
.flatMap(bboxDir => {
if (bboxDir.exists) {
bboxDir.listFiles.toList
} else {
Nil
}
})
.sorted
lazy val displayBboxTable = BoundingBoxTsvImporter.parse(displayBboxFiles)
val helperTables = List(boostTable)
val system = ActorSystem("S2CoveringSystem")
val (s2CoveringMaster, s2CoveringLatch) =
if (config != null && (config.outputRevgeo || config.outputS2Covering || config.outputS2Interior)) {
val latch = new CountDownLatch(1)
(Some(system.actorOf(Props(new S2CoveringMaster(latch)), name = "master")), Some(latch))
} else {
(None, None)
}
def logUnusedHelperEntries {
helperTables.flatMap(_.logUnused).foreach(line => log.error(line))
}
val wkbWriter = new WKBWriter()
val wktReader = new WKTReader()
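  // High-level import pipeline, as implemented below: country metadata ->
  // optional alternate names -> admin/postal files (per-country or global) ->
  // supplemental feature files -> name indexes -> name transforms ->
  // optional missing-slug generation -> feature indexes -> polygon loading.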
def loadIntoMongo() {
parseCountryInfo()
if (config.importAlternateNames) {
Helpers.duration("readAlternateNamesFile") {
loadAlternateNames()
}
}
if (!config.parseWorld) {
val countries = config.parseCountry.split(",")
countries.foreach(f => {
log.info("Parsing %s".format(f))
parseAdminInfoFile("src/jvm/io/fsq/twofishes/indexer/data/downloaded/adminCodes-%s.txt".format(f))
parseAdminFile("src/jvm/io/fsq/twofishes/indexer/data/downloaded/%s.txt".format(f))
if (config.importPostalCodes) {
parsePostalCodeFile("src/jvm/io/fsq/twofishes/indexer/data/downloaded/zip/%s.txt".format(f))
}
})
} else {
parseAdminInfoFile("src/jvm/io/fsq/twofishes/indexer/data/downloaded/adminCodes.txt")
logPhase("parse global features") {
parseAdminFile("src/jvm/io/fsq/twofishes/indexer/data/downloaded/allCountries.txt")
}
if (config.importPostalCodes) {
logPhase("parse global postal codes") {
parsePostalCodeFile("src/jvm/io/fsq/twofishes/indexer/data/downloaded/zip/allCountries.txt")
}
}
}
val supplementalDirs = List(
new File("src/jvm/io/fsq/twofishes/indexer/data/computed/features"),
new File("src/jvm/io/fsq/twofishes/indexer/data/private/features")
)
supplementalDirs.foreach(
supplementalDir =>
if (supplementalDir.exists) {
supplementalDir.listFiles.foreach(f => {
logPhase("parsing supplemental file: %s".format(f)) {
parseAdminFile(f.toString, allowBuildings = true)
}
})
}
)
logPhase("building name indexes pre parseNameTransforms") {
NameIndex.makeIndexes(store.executor)
}
logPhase("parseNameTransforms") {
parseNameTransforms()
}
if (config.buildMissingSlugs) {
logPhase("building missing slugs") {
slugIndexer.buildMissingSlugs()
slugIndexer.writeMissingSlugs(store)
}
}
logPhase("building feature indexes pre polygon loading") {
GeocodeRecordIndexes.makeIndexes(store.executor)
}
polygonLoader.load(GeonamesNamespace)
}
def doRewrites(names: List[String]): List[String] = {
val nameSet = new scala.collection.mutable.HashSet[String]()
rewriteTable.foreach({
case (from, toList) => {
names.foreach(name => {
toList.values.foreach(to => {
nameSet += from.replaceAllIn(name, to)
})
})
}
})
nameSet ++= names.map(_.replace("ß", "ss"))
nameSet.toList
}
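  // Sketch of the rewrite expansion (rewrites.txt is data, so the rule here is
  // hypothetical): with an entry "Mount" -> "Mt",
  //   doRewrites(List("Mount Vernon")) contains "Mt Vernon" as well as the
  //   original (a regex that matches nothing returns its input unchanged), and
  //   "Großburg" also yields "Grossburg" via the ß -> ss expansion.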
val bigDeleteRe = {
val re = deletesList
.map(_ + "\\b")
.sortBy(_.size * -1)
.mkString("|")
("(?i)%s".format(re)).r
}
def doDelete(name: String): Option[String] = {
val newName = bigDeleteRe.replaceAllIn(name, "")
if (newName != name) {
Some(fixName(newName))
} else {
None
}
}
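  // Sketch (deletes.txt is custom data; the entry below is hypothetical): with an
  // entry "City of", bigDeleteRe becomes a case-insensitive alternation with longer
  // patterns first, so doDelete("City of Boston") == Some("Boston") after fixName
  // collapses the leftover whitespace, while names containing no deletable token
  // return None.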
def createNameIndexRecords(displayNames: List[DisplayName], fid: StoredFeatureId, record: Option[GeocodeRecord]) = {
displayNames.map(name => {
createNameIndexRecord(name, fid, record)
})
}
def addDisplayNameToNameIndex(dn: DisplayName, fid: StoredFeatureId, record: Option[GeocodeRecord]) = {
store.addNameIndexes(List(createNameIndexRecord(dn, fid, record)))
}
private def isAllDigits(x: String) = x forall Character.isDigit
private def shouldExcludeFromPrefixIndex(dn: DisplayName, woeType: YahooWoeType): Boolean = {
// exclude because of flags
((dn.flags & (FeatureNameFlags.NEVER_DISPLAY.getValue | FeatureNameFlags.LOW_QUALITY.getValue)) != 0) ||
// exclude purely numeric names of non-postalcode features
(woeType !=? YahooWoeType.POSTAL_CODE && isAllDigits(dn.name))
}
private def isNameDeleted(name: String, fid: StoredFeatureId): Boolean = {
nameDeleteTable.get(fid).exists(_ =? name)
}
def createNameIndexRecord(dn: DisplayName, fid: StoredFeatureId, record: Option[GeocodeRecord]) = {
val name = NameNormalizer.normalize(dn.name).trim
val cc: String = record.map(_.cc).getOrElse("")
val pop: Int =
record.map(_.population).getOrElse(0) + record.map(_.boost).getOrElse(0)
val woeType: Int = record.flatMap(_.woeTypeOption.map(_.id)).getOrElse(0)
val excludeFromPrefixIndex = shouldExcludeFromPrefixIndex(dn, YahooWoeType.findByIdOrNull(woeType))
NameIndex(name, fid.longId, cc, pop, woeType, dn.flags, dn.lang, excludeFromPrefixIndex, dn.idOrThrow)
}
def rewriteNames(names: List[String]): (List[String], List[String]) = {
val deleteModifiedNames: List[String] = names.flatMap(doDelete)
val deaccentedNames = names.map(NameNormalizer.deaccent).filterNot(n => names.contains(n))
val rewrittenNames = doRewrites(names ++ deleteModifiedNames).filterNot(n => names.contains(n))
(deaccentedNames, (deleteModifiedNames ++ rewrittenNames).distinct)
}
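  // Contract sketch (input hypothetical): rewriteNames(List("Müllheim")) returns
  // deaccented variants ("Mullheim") in the first slot and delete/rewrite variants
  // in the second, so processFeatureName can tag the former DEACCENT and the latter
  // ALIAS; variants equal to an input name are filtered out.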
def parseFeature(feature: InputFeature): GeocodeRecord = {
val geonameId = feature.featureId
val ids: List[StoredFeatureId] = List(geonameId) ++
concordanceMap
.get(geonameId)
.flatMap(concordanceId => {
GeonamesParser.slugIndexer.slugEntryMap(concordanceId) = (SlugEntry(geonameId.humanReadableString, 0))
// this isn't great, because it means we need a mapping for the namespace of
// any concordances in StoredFeatureId, so it's harder to add ad-hoc concordances to
// external datasets
if (concordanceId.contains(":")) {
StoredFeatureId.fromHumanReadableString(concordanceId)
} else {
None
}
})
val preferredEnglishAltName =
alternateNamesMap.getOrElse(geonameId, Nil).find(altName => altName.lang == "en" && altName.isPrefName)
val hasEnglishAltName = alternateNamesMap.getOrElse(geonameId, Nil).exists(_.lang == "en")
val hasPreferredEnglishAltName = preferredEnglishAltName.isDefined
val hasNonPreferredEnglishAltNameIdenticalToFeatureName = alternateNamesMap
.getOrElse(geonameId, Nil)
.exists(altName => altName.lang == "en" && !altName.isPrefName && altName.name =? feature.name)
var displayNames: List[DisplayName] = Nil
// consider using the primary feature name from geonames as an english name:
// skip: if an identical preferred english alt name exists
// add as preferred:
// if no english alt name exists OR
// no preferred english alt name exists BUT an identical non-preferred english name exists
// add as non-preferred otherwise
if (!preferredEnglishAltName.exists(_.name =? feature.name)) {
displayNames ++= processFeatureName(
geonameId,
feature.countryCode,
"en",
feature.name,
isPrefName = !hasEnglishAltName ||
(!hasPreferredEnglishAltName && hasNonPreferredEnglishAltNameIdenticalToFeatureName),
isShortName = false,
woeType = feature.featureClass.woeType
)
}
if (feature.featureClass.woeType == YahooWoeType.COUNTRY) {
countryNameMap
.get(feature.countryCode)
.foreach(
name =>
displayNames ::=
DisplayName("en", name, FeatureNameFlags.PREFERRED.getValue() | FeatureNameFlags.COLLOQUIAL.getValue())
)
}
feature.asciiname.foreach(asciiname => {
val (deaccentedPrimary, _) = rewriteNames(List(feature.name))
if (feature.name != asciiname &&
asciiname.nonEmpty &&
// do not add if this name has been deleted
!isNameDeleted(asciiname, geonameId) &&
// or if this is the same as the primary name deaccented and the primary name has been deleted
!(deaccentedPrimary.has(asciiname) && isNameDeleted(feature.name, geonameId))) {
displayNames ::=
DisplayName("en", asciiname, FeatureNameFlags.DEACCENT.getValue)
}
})
if (feature.featureClass.woeType.getValue == YahooWoeType.COUNTRY.getValue) {
displayNames ::= DisplayName("abbr", feature.countryCode, 0)
}
// Build names
val alternateNames = alternateNamesMap.getOrElse(geonameId, Nil)
val altNames = alternateNames.flatMap(altName => {
processFeatureName(
geonameId,
feature.countryCode,
altName.lang,
altName.name,
isPrefName = altName.isPrefName,
isShortName = altName.isShortName,
isColloquial = altName.isColloquial,
isHistoric = altName.isHistoric,
woeType = feature.featureClass.woeType
)
})
val (deaccentedFeatureNames, nonDeaccentedFeatureNames) =
altNames.partition(n => (n.flags & FeatureNameFlags.DEACCENT.getValue) > 0)
val nonDeaccentedNames: Set[String] = nonDeaccentedFeatureNames.map(_.name).toSet
displayNames ++= nonDeaccentedFeatureNames
displayNames ++= deaccentedFeatureNames.filterNot(n => nonDeaccentedNames.has(n.name))
    // the admin code is geonames' internal code for this admin area, but it is very
    // often the same short abbreviation that is actually used within the country
if (feature.featureClass.isAdmin1 || feature.featureClass.isAdmin2 || feature.featureClass.isAdmin3) {
displayNames ++= feature.adminCode.toList.flatMap(code => {
if (!isAllDigits(code)) {
Some(DisplayName("abbr", code, FeatureNameFlags.ABBREVIATION.getValue))
} else {
Some(DisplayName("", code, FeatureNameFlags.NEVER_DISPLAY.getValue))
}
})
}
def fixParent(p: String): Option[String] = {
adminIdMap.get(p) orElse {
//println("missing admin lookup for %s".format(p))
None
}
}
// Build parents
val extraParents: List[StoredFeatureId] =
feature.extraColumns
.get("parents")
.toList
.flatMap(_.split(",").toList)
.flatMap(pStr => StoredFeatureId.fromHumanReadableString(pStr))
val parents: List[StoredFeatureId] =
feature.parents.flatMap(fixParent).map(p => GeonamesId(p.toLong))
val hierarchyParents: List[StoredFeatureId] =
hierarchyTable.getOrElse(geonameId, Nil).filterNot(p => parents.has(p))
val allParents: List[StoredFeatureId] = extraParents ++ parents ++ hierarchyParents
val boost: Option[Int] =
feature.extraColumns.get("boost").map(_.toInt) orElse
boostTable.get(geonameId).headOption.flatMap(boost => TryO { boost.toInt })
val bbox = feature.extraColumns
.get("bbox")
.flatMap(bboxStr => {
// west, south, east, north
val parts = bboxStr.split(",").map(_.trim)
parts.toList match {
case w :: s :: e :: n :: Nil => {
Some(BoundingBox(Point(n.toDouble, e.toDouble), Point(s.toDouble, w.toDouble)))
}
case _ => {
log.error("malformed bbox: " + bboxStr)
None
}
}
}) orElse bboxTable.get(geonameId)
var lat = feature.latitude
var lng = feature.longitude
val latlngs = moveTable.get(geonameId)
if (latlngs.size > 0) {
lat = latlngs(0).toDouble
lng = latlngs(1).toDouble
}
val canGeocode = feature.extraColumns.get("canGeocode").map(_.toInt).getOrElse(1) > 0
val slug: Option[String] = slugIndexer.getBestSlug(geonameId)
if (slug.isEmpty &&
List(YahooWoeType.TOWN, YahooWoeType.SUBURB, YahooWoeType.COUNTRY, YahooWoeType.ADMIN1, YahooWoeType.ADMIN2)
.has(feature.featureClass.woeType)) {
slugIndexer.missingSlugList.add(geonameId.humanReadableString)
}
var attributesSet = false
lazy val attributesBuilder = {
attributesSet = true
GeocodeFeatureAttributes.newBuilder
}
naturalEarthPopulatedPlacesMap
.get(geonameId)
.map(sfeature => {
sfeature.propMap.get("adm0cap").foreach(v => attributesBuilder.adm0cap(v.toDouble.toInt == 1))
sfeature.propMap.get("worldcity").foreach(v => attributesBuilder.worldcity(v.toDouble.toInt == 1))
sfeature.propMap.get("scalerank").foreach(v => attributesBuilder.scalerank(v.toInt))
sfeature.propMap.get("natscale").foreach(v => attributesBuilder.natscale(v.toInt))
sfeature.propMap.get("labelrank").foreach(v => attributesBuilder.labelrank(v.toInt))
})
if (feature.featureClass.isAdmin1Capital) {
attributesBuilder.adm1cap(true)
}
feature.population.foreach(pop => attributesBuilder.population(pop))
feature.extraColumns.get("sociallyRelevant").map(v => attributesBuilder.sociallyRelevant(v.toBoolean))
feature.extraColumns
.get("neighborhoodType")
.map(v => attributesBuilder.neighborhoodType(NeighborhoodType.findByNameOrNull(v)))
attributesBuilder.urls(displayNames.filter(_.lang =? "link").map(_.name))
val extraRelations = extraRelationsList.get(geonameId).map(_.split(",").toList.map(_.toLong)).flatten
case class PolygonRecord(geom: Geometry) {
val id = new ObjectId()
}
// Let's please deprecate this codepath
val polygonOpt = feature.extraColumns
.get("geometry")
.map(polygon => {
wktReader.read(polygon)
})
val polygonRecordOpt =
if (feature.featureClass.woeType == YahooWoeType.POI
&& config.revgeoIndexPoints) {
Some(PolygonRecord(GeoTools.pointToGeometry(lat, lng)))
} else {
polygonOpt.map(poly => PolygonRecord(poly))
}
// combine flags of duplicate names in the same language
val finalDisplayNames = displayNames
.groupBy(dn => (dn.lang, dn.name))
.toList
.map({
case ((lang, name), displayNames) =>
DisplayName(lang = lang, name = name, flags = displayNames.foldLeft(0)((f, dn) => f | dn.flags))
})
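    // e.g. two hypothetical entries DisplayName("en", "Springfield", PREFERRED) and
    // DisplayName("en", "Springfield", LOCAL_LANG) collapse into a single
    // DisplayName("en", "Springfield", PREFERRED | LOCAL_LANG)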
val record = GeocodeRecord(
id = geonameId.longId,
names = Nil,
cc = feature.countryCode,
woeType = feature.featureClass.woeType.getValue,
lat = lat,
lng = lng,
parents = allParents.map(_.longId),
population = feature.population,
displayNames = finalDisplayNames,
boost = boost,
boundingBox = bbox,
displayBounds = displayBboxTable.get(geonameId),
canGeocode = canGeocode,
slug = slug,
// hasPoly = polygonExtraEntry.isDefined,
extraRelations = extraRelations,
ids = ids.map(_.longId),
polyId = polygonRecordOpt.map(_.id).getOrElse(GeocodeRecord.dummyOid),
hasPoly = polygonRecordOpt.isDefined
).applyIf(attributesSet, _.withAttributes(Some(attributesBuilder.result)))
polygonRecordOpt.foreach(polygonRecord => {
polygonLoader.indexPolygon(
polygonRecord.id,
polygonRecord.geom,
"self_point"
)
})
record
}
def parseAdminFile(filename: String, allowBuildings: Boolean = false) {
parseFromFile(
filename,
(index: Int, line: String) => GeonamesFeature.parseFromAdminLine(index, line),
"features",
allowBuildings
)
}
def parsePostalCodeFile(filename: String) {
parseFromFile(
filename,
(index: Int, line: String) => GeonamesFeature.parseFromPostalCodeLine(index, line),
"postal codes"
)
}
private def shouldTakeFeature(f: InputFeature, allowBuildings: Boolean): Boolean = {
f.shouldIndex &&
!ignoreList.contains(f.featureId) &&
(!f.featureClass.isBuilding || config.shouldParseBuildings || allowBuildings)
}
private def parseFromFile(
filename: String,
lineProcessor: (Int, String) => Option[GeonamesFeature],
typeName: String,
allowBuildings: Boolean = false
) {
val lines = scala.io.Source.fromFile(new File(filename), "UTF-8").getLines
val groupSize = 2000
for {
(lineGroup, groupIndex) <- lines.grouped(groupSize).zipWithIndex
} {
val processed = groupIndex * groupSize
if (processed % 10000 == 0) {
log.info("imported %d %s so far".format(processed, typeName))
}
val recordsToInsert = lineGroup.zipWithIndex
.flatMap({
case (line, index) => {
val realIndex = groupIndex * groupSize + index
lineProcessor(realIndex, line)
.filter(f => shouldTakeFeature(f, allowBuildings))
.map(line => {
Stats.time("parse.line." + typeName) {
parseFeature(line)
}
})
}
})
.toList
Stats.time("parse.insert" + typeName + "." + groupSize) {
insertGeocodeRecords(recordsToInsert)
}
}
}
def insertGeocodeRecords(recordsToInsert: List[GeocodeRecord]) {
store.insert(recordsToInsert)
val displayNamesToInsert = recordsToInsert.flatMap(
r => createNameIndexRecords(r.displayNames.map(new DisplayName(_)).toList, r.featureId, Some(r))
)
store.addNameIndexes(displayNamesToInsert)
}
var alternateNamesMap = new HashMap[StoredFeatureId, List[AlternateNameEntry]]
def loadAlternateNames() {
val altDirs = List(
new File("src/jvm/io/fsq/twofishes/indexer/data/computed/alternateNames/"),
new File("src/jvm/io/fsq/twofishes/indexer/data/private/alternateNames/")
)
val files: List[String] = List("src/jvm/io/fsq/twofishes/indexer/data/downloaded/alternateNames.txt") ++ altDirs
.flatMap(altDir => {
if (altDir.exists) {
altDir.listFiles.toList.map(_.toString)
} else {
Nil
}
})
.sorted
alternateNamesMap = AlternateNamesReader.readAlternateNamesFiles(files)
}
val spaceRe = " +".r
def fixName(s: String) = spaceRe.replaceAllIn(s, " ").trim
// TODO: actually use flags
def doShorten(cc: String, name: String): List[String] = {
val shortens = shortensList.getOrElse("*", Nil) ++
shortensList.getOrElse(cc, Nil)
val candidates = shortens.flatMap(shorten => {
val newName = shorten.from.replaceAllIn(name, shorten.to)
if (newName != name) {
Some(fixName(newName))
} else {
None
}
})
candidates.sortBy(_.size).headOption.toList
}
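  // Usage sketch (outcome depends on shortens.txt; the rule shown is hypothetical):
  //   given a "*" rule "Saint" -> "St", doShorten("US", "Saint Paul") == List("St Paul");
  //   when several rules fire, only the shortest rewrite survives
  //   (candidates.sortBy(_.size).headOption). fixName collapses runs of spaces, so
  //   fixName("  New   York ") == "New York".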
def hackName(
lang: String,
name: String,
cc: String,
woeType: YahooWoeType
): List[String] = {
// TODO: move these to data files
if (woeType == YahooWoeType.ADMIN1 && cc == "JP" && (lang == "en" || lang == "")) {
List(name + " Prefecture")
} else if (woeType == YahooWoeType.TOWN && cc == "TW" && (lang == "en" || lang == "")) {
List(name + " County")
// Region Lima -> Lima Region
} else if (woeType == YahooWoeType.ADMIN1 && cc == "PE" && name.startsWith("Region")) {
List(name.replace("Region", "").trim + " Region")
} else {
Nil
}
}
def isLocalLang(lang: String, cc: String) =
countryLangMap.getOrElse(cc, Nil).contains(lang)
def processFeatureName(
fid: StoredFeatureId,
cc: String,
lang: String,
name: String,
isPrefName: Boolean = false,
isShortName: Boolean = false,
isColloquial: Boolean = false,
isHistoric: Boolean = false,
woeType: YahooWoeType
): List[DisplayName] = {
if (lang != "post" && !isNameDeleted(name, fid)) {
val originalNames = List(name)
val hackedNames = hackName(lang, name, cc, woeType)
val (deaccentedNames, allModifiedNames) = rewriteNames(originalNames)
val shortenedNames = doShorten(cc, name)
def buildDisplayName(name: String, flags: Int) = {
DisplayName(lang, name, flags)
}
def processNameList(names: List[String], flags: Int): List[DisplayName] = {
names.map(n => {
var finalFlags = flags
if (isLocalLang(lang, cc)) {
finalFlags |= FeatureNameFlags.LOCAL_LANG.getValue
}
if (isHistoric) {
finalFlags |= FeatureNameFlags.HISTORIC.getValue
}
if (isColloquial) {
finalFlags |= FeatureNameFlags.COLLOQUIAL.getValue
}
buildDisplayName(n, finalFlags)
})
}
val originalFlags = {
val prefFlag = if (isPrefName) {
FeatureNameFlags.PREFERRED.getValue
} else {
0
}
val shortFlag = if (isShortName) {
FeatureNameFlags.SHORT_NAME.getValue
} else {
0
}
shortFlag | prefFlag
}
processNameList(originalNames, originalFlags) ++
processNameList(shortenedNames, originalFlags | FeatureNameFlags.SHORT_NAME.getValue) ++
processNameList(deaccentedNames, originalFlags | FeatureNameFlags.DEACCENT.getValue) ++
processNameList(allModifiedNames, originalFlags | FeatureNameFlags.ALIAS.getValue) ++
processNameList(hackedNames, originalFlags | FeatureNameFlags.ALIAS.getValue)
} else {
Nil
}
}
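  // Flag composition, as wired above: the original name carries PREFERRED and/or
  // SHORT_NAME as requested, plus LOCAL_LANG/HISTORIC/COLLOQUIAL when applicable;
  // shortened variants additionally get SHORT_NAME, deaccented variants DEACCENT,
  // and rewrite/hack variants ALIAS. Names in the pseudo-language "post" and
  // deleted names are dropped entirely.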
def parseNameTransforms(): Unit = {
// geonameid -> lang|prefName|[optional flags]
val nameTransformsDirs = List(
new File("src/jvm/io/fsq/twofishes/indexer/data/custom/name-transforms"),
new File("src/jvm/io/fsq/twofishes/indexer/data/private/name-transforms")
)
val files = nameTransformsDirs.flatMap(dir => {
if (dir.exists) {
dir.listFiles
} else {
Nil
}
})
files.foreach(file => {
val lines = scala.io.Source.fromFile(file).getLines
parseNameTransforms(lines, file.toString)
})
}
private def parseFeatureNameFlags(
flagsString: Option[String],
default: List[FeatureNameFlags] = Nil
): Int = {
val flags: List[FeatureNameFlags] = if (flagsString.isEmpty) {
default
} else {
flagsString
.getOrElse("")
.split(",")
.map(
f =>
FeatureNameFlags
.unapply(f)
.getOrElse(
throw new Exception("couldn't parse name flag: %s".format(f))
)
)
.toList
}
var flagsMask = 0
flags.foreach(f => flagsMask = flagsMask | f.getValue())
flagsMask
}
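  // Minimal self-check sketch (illustrative addition, not in the original source).
  // It assumes FeatureNameFlags.unapply resolves flag names exactly as the parser
  // above does.
  private def exampleParseFeatureNameFlags(): Unit = {
    // comma-separated flag names OR together into one mask
    assert(
      parseFeatureNameFlags(Some("PREFERRED,SHORT_NAME")) ==
        (FeatureNameFlags.PREFERRED.getValue | FeatureNameFlags.SHORT_NAME.getValue)
    )
    // an absent flags column falls back to the supplied default
    assert(
      parseFeatureNameFlags(None, List(FeatureNameFlags.PREFERRED)) ==
        FeatureNameFlags.PREFERRED.getValue
    )
  }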
def parseNameTransforms(lines: Iterator[String], filename: String = "n/a"): Unit = {
for {
(line, lineIndex) <- lines.zipWithIndex
if (!line.startsWith("#") && line.nonEmpty)
parts = line.split("[\t ]").toList
idString <- parts.lift(0)
featureId <- StoredFeatureId.fromHumanReadableString(idString, Some(GeonamesNamespace))
rest = parts.drop(1).mkString(" ")
lang <- rest.split("\\|").lift(0)
name <- rest.split("\\|").lift(1)
originalFlags = rest.split("\\|").lift(2)
} {
val flagsMask = parseFeatureNameFlags(originalFlags, List(FeatureNameFlags.PREFERRED))
val records = store.getById(featureId).toList
records match {
case Nil => log.error("no match for id %s".format(idString))
case record :: Nil => {
val flagsMaskComputed = flagsMask | (if (isLocalLang(lang, record.cc)) {
FeatureNameFlags.LOCAL_LANG.getValue
} else {
0
})
val newName = DisplayName(lang, name, flagsMaskComputed)
// all display names have already been deduped and their flags combined
// name transform can therefore have at most one display name dupe
// combine flags with that dupe, if it exists
var merged = false
val mergedNames = record.displayNames
.map(new DisplayName(_))
.map(dn => {
if (dn.lang =? lang && dn.name =? name) {
log.info(
"merged display name %s with name transform: id %s, lang %s, name %s, flags %d"
.format(dn, idString, lang, name, flagsMaskComputed)
)
merged = true
DisplayName(dn.lang, dn.name, dn.flags | flagsMaskComputed)
} else {
dn
}
})
// repeat merge for names in name index
if (merged) {
val normalizedName = NameNormalizer.normalize(name).trim
val nameRecords = store.getNameIndexByIdLangAndName(featureId, lang, normalizedName).toList
nameRecords match {
case Nil =>
log.error(
"display names and name index out of sync for id %s, lang %s, name %s".format(idString, lang, name)
)
case nameRecord :: dupes => {
// dupes can rarely creep into the name index when display names are not exact dupes
// but their normalized forms are, e.g. "LA", "L.A." both normalize to "la"
// in this case, use the first name's flags to update all names
val newFlags = nameRecord.flags | flagsMaskComputed
store.updateFlagsOnNameIndexByIdLangAndName(featureId, lang, normalizedName, newFlags)
}
}
} else {
addDisplayNameToNameIndex(newName, featureId, Some(record))
}
// if we're trying to put in a new preferred name, kill all the other preferred names in the same language
val modifiedNames: List[DisplayName] = mergedNames
.map(dn => {
if (dn.lang =? lang &&
dn.name !=? name &&
(flagsMaskComputed & FeatureNameFlags.PREFERRED.getValue) != 0) {
DisplayName(dn.lang, dn.name, dn.flags & ~FeatureNameFlags.PREFERRED.getValue())
} else {
dn
}
})
.toList
val newNames = modifiedNames ++
(if (merged) {
Nil
} else {
List(newName)
})
store.setRecordNames(featureId, newNames)
}
case list => log.error("multiple matches for id %s -- %s".format(idString, list))
}
}
}
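  // Illustrative name-transforms line (id, name, and flags hypothetical):
  //   "5368361<TAB>en|City of Angels|PREFERRED,COLLOQUIAL"
  // splits on whitespace into id + rest; rest splits on "|" into lang, name, and
  // optional flags; omitted flags default to PREFERRED. Spaces inside the name
  // survive because everything after the id is re-joined before the "|" split.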
}