
vectorpipe.functions.osm.package.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vectorpipe_2.11 Show documentation
Show all versions of vectorpipe_2.11 Show documentation
Import OSM data and output to VectorTiles with GeoTrellis.
The newest version!
package vectorpipe.functions
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, DataFrame, Row}
import vectorpipe.model.Member
import vectorpipe.util._
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}
package object osm {
// Using tag listings from [id-area-keys](https://github.com/osmlab/id-area-keys) @ v2.13.0.
//
// Outer keys are the tag keys that `_isArea` looks for; each inner map lists the values of that
// key that `_isArea` treats as lines rather than areas. Only the inner key sets are read in this
// file (every Boolean here is `true`); an empty inner map excludes no value of that key.
// NOTE(review): `_isArea` matches keys by exact set intersection, so "addr:*" only matches a tag
// literally named "addr:*" -- confirm whether prefix matching was intended.
private val AreaKeys: Map[String, Map[String, Boolean]] = Map(
"addr:*" -> Map(),
"advertising" -> Map(
"billboard" -> true
),
"aerialway" -> Map(
"cable_car" -> true,
"chair_lift" -> true,
"drag_lift" -> true,
"gondola" -> true,
"goods" -> true,
"magic_carpet" -> true,
"mixed_lift" -> true,
"platter" -> true,
"rope_tow" -> true,
"t-bar" -> true
),
"aeroway" -> Map(
"runway" -> true,
"taxiway" -> true
),
"allotments" -> Map(),
"amenity" -> Map(
"bench" -> true
),
"area:highway" -> Map(),
"attraction" -> Map(
"dark_ride" -> true,
"river_rafting" -> true,
"summer_toboggan" -> true,
"train" -> true,
"water_slide" -> true
),
"bridge:support" -> Map(),
"building" -> Map(),
"camp_site" -> Map(),
"club" -> Map(),
"craft" -> Map(),
"emergency" -> Map(
"designated" -> true,
"destination" -> true,
"no" -> true,
"official" -> true,
"private" -> true,
"yes" -> true
),
"golf" -> Map(
"hole" -> true,
"lateral_water_hazard" -> true,
"water_hazard" -> true
),
"healthcare" -> Map(),
"historic" -> Map(),
"industrial" -> Map(),
"internet_access" -> Map(),
"junction" -> Map(
"circular" -> true,
"roundabout" -> true
),
"landuse" -> Map(),
"leisure" -> Map(
"slipway" -> true,
"track" -> true
),
"man_made" -> Map(
"breakwater" -> true,
"crane" -> true,
"cutline" -> true,
"embankment" -> true,
"groyne" -> true,
"pier" -> true,
"pipeline" -> true
),
"military" -> Map(),
"natural" -> Map(
"cliff" -> true,
"coastline" -> true,
"ridge" -> true,
"tree_row" -> true
),
"office" -> Map(),
"piste:type" -> Map(
"downhill" -> true,
"hike" -> true,
"ice_skate" -> true,
"nordic" -> true,
"skitour" -> true,
"sled" -> true,
"sleigh" -> true
),
"place" -> Map(),
"playground" -> Map(
"balancebeam" -> true,
"slide" -> true,
"zipwire" -> true
),
"power" -> Map(
"cable" -> true,
"line" -> true,
"minor_line" -> true
),
"public_transport" -> Map(
"platform" -> true
),
"residential" -> Map(),
"seamark:type" -> Map(),
"shop" -> Map(),
"tourism" -> Map(
"artwork" -> true
),
"traffic_calming" -> Map(
"bump" -> true,
"cushion" -> true,
"dip" -> true,
"hump" -> true,
"rumble_strip" -> true
),
"waterway" -> Map(
"canal" -> true,
"dam" -> true,
"ditch" -> true,
"drain" -> true,
"river" -> true,
"stream" -> true,
"weir" -> true
)
)
/** Values of the `type` tag that `isMultiPolygon` accepts. */
private val MultiPolygonTypes: Seq[String] =
  Seq("multipolygon", "boundary")

/** Lower-case tag values read as "true". */
private val TruthyValues: Seq[String] =
  Seq("yes", "true", "1")

/** Lower-case tag values read as "false". */
private val FalsyValues: Seq[String] =
  Seq("no", "false", "0")

/** Every recognised boolean spelling: truthy values first, then falsy ones. */
private val BooleanValues: Seq[String] =
  TruthyValues ++ FalsyValues

/** Values of the `waterway` tag that `isWaterway` accepts. */
private val WaterwayValues: Seq[String] =
  Seq(
    "river",
    "riverbank",
    "canal",
    "stream",
    "stream_end",
    "brook",
    "drain",
    "ditch",
    "dam",
    "weir",
    "waterfall",
    "pressurised"
  )

/** Tag keys whose presence makes `isPOI` return true. */
private val POITags: Set[String] =
  Set("amenity", "shop", "craft", "office", "leisure", "aeroway")

/** Captures, in group 1, the run of permitted characters that follows a `#`. */
private val HashtagMatcher: Regex =
  """#([^\u2000-\u206F\u2E00-\u2E7F\s\\'!\"#$%()*,.\/;<=>?@\[\]^{|}~]+)""".r
/**
  * Normalises a semicolon-delimited column: trims the whole value, then strips any
  * whitespace on either side of each `;`.
  */
private def cleanDelimitedValues(values: Column): Column = {
  val trimmed = trim(values)
  regexp_replace(trimmed, "\\s*;\\s*", ";")
}
/**
  * Splits a semicolon-delimited column into an array of lower-cased values.
  *
  * @param values  column holding the delimited string
  * @param default column substituted when the cleaned value is null (an empty string literal
  *                unless overridden)
  * @return an array column produced by splitting on `;`
  */
def splitDelimitedValues(values: Column, default: Column = lit("")): Column = {
  val cleaned = cleanDelimitedValues(values)
  val normalized = lower(coalesce(cleaned, default))
  split(normalized, ";")
}
/**
  * Splits a semicolon-delimited string into the set of its lower-cased values.
  *
  * The input is trimmed first so that leading/trailing whitespace does not leak into the first
  * or last value, matching what the Column-based overload does through `cleanDelimitedValues`.
  * Lower-casing uses `Locale.ROOT` so the result does not depend on the JVM's default locale.
  *
  * @param values the raw delimited string; must not be null
  * @return the distinct values found between `;` delimiters
  */
def splitDelimitedValues(values: String): Set[String] =
  values
    .trim
    .replaceAll("\\s*;\\s*", ";")
    .toLowerCase(java.util.Locale.ROOT)
    .split(";")
    .toSet
/**
  * Decides whether a tag map describes an area.
  *
  * An `area` tag carrying at least one recognised boolean value wins outright: the result is
  * whether any of its values is truthy. Otherwise a tag map is an area when, for some key listed
  * in `AreaKeys`, at least one of that tag's values is not among the key's line values.
  */
private val _isArea = (tags: Map[String, String]) => {
  // an explicit area=... tag only counts when it contains a recognised boolean value
  val explicitArea = tags
    .get("area")
    .map(raw => splitDelimitedValues(raw))
    .filter(flags => flags.exists(BooleanValues.contains))

  explicitArea match {
    case Some(flags) =>
      flags.exists(TruthyValues.contains)
    case None =>
      // see https://github.com/osmlab/id-area-keys (values are inverted)
      tags.keySet.intersect(AreaKeys.keySet).exists { key =>
        // break out semicolon-delimited values
        val tagValues = splitDelimitedValues(tags(key))
        // values that should be considered as lines
        val lineValues = AreaKeys(key).keySet
        // at least one value passes the area test
        tagValues.exists(value => !lineValues.contains(value))
      }
  }
}
/** `_isArea` exposed as a Spark UDF over a tag map. */
val isAreaUDF: UserDefinedFunction = udf(_isArea)

/** Applies `isAreaUDF` to `tags` and names the resulting column `isArea`. */
def isArea(tags: Column): Column = {
  val result = isAreaUDF(tags)
  result.as("isArea")
}
/**
  * True when any value of the `type` tag is one of `MultiPolygonTypes`.
  * The resulting column is named `isMultiPolygon`.
  */
def isMultiPolygon(tags: Column): Column = {
  val typeValues = splitDelimitedValues(tags.getItem("type"))
  val accepted = lit(MultiPolygonTypes.toArray)
  array_intersects(typeValues, accepted).as("isMultiPolygon")
}
/**
  * True when `version` is 1 and `minorVersion` is 0, both compared null-safely.
  * The resulting column is named `isNew`.
  */
def isNew(version: Column, minorVersion: Column): Column = {
  val isFirstVersion = version <=> 1
  val hasNoMinorVersion = minorVersion <=> 0
  (isFirstVersion && hasNoMinorVersion).as("isNew")
}
/**
  * True when the values of the `type` tag include `route`.
  * The resulting column is named `isRoute`.
  */
def isRoute(tags: Column): Column = {
  val typeValues = splitDelimitedValues(tags.getItem("type"))
  array_contains(typeValues, "route").as("isRoute")
}
/**
  * Schema of a members array whose `type` is byte-encoded: a non-null array of
  * (`type`: byte, `ref`: long, `role`: string) structs with no nullable fields.
  */
private lazy val MemberSchema = {
  val memberFields = Seq(
    StructField("type", ByteType, nullable = false),
    StructField("ref", LongType, nullable = false),
    StructField("role", StringType, nullable = false)
  )
  ArrayType(StructType(memberFields), containsNull = false)
}
/**
  * Rewrites each member row so that its string `type` is replaced by the value returned from
  * `Member.typeFromString`; `ref` and `role` are copied through unchanged.
  *
  * A null `members` value yields null instead of throwing, mirroring the null handling in
  * `elaborateMembers`; `ensureCompressedMembers` applies this function to the whole `members`
  * column, so a row without members must not abort the job with a NullPointerException.
  */
private val _compressMemberTypes = (members: Seq[Row]) =>
  if (members == null) {
    null
  } else {
    members.map { row =>
      val t = Member.typeFromString(row.getAs[String]("type"))
      val ref = row.getAs[Long]("ref")
      val role = row.getAs[String]("role")
      Row(t, ref, role)
    }
  }

/** `_compressMemberTypes` as a Spark UDF whose declared result type is `MemberSchema`. */
@transient lazy val compressMemberTypes: UserDefinedFunction = udf(_compressMemberTypes, MemberSchema)
/**
  * Checks if members have byte-encoded types.
  *
  * Looks up the `members` column in the schema of `input`, expects an array of structs, and
  * reports whether the struct's `type` field is a `ByteType`. Any failure while walking the
  * schema (missing column or field, unexpected data type) results in `false`.
  */
def hasCompressedMemberTypes(input: DataFrame): Boolean =
  Try {
    input.schema("members").dataType match {
      case ArrayType(element: StructType, _) => element("type").dataType
    }
  }.toOption.contains(ByteType)
/**
  * Returns `input` unchanged when its member types are already byte-encoded; otherwise returns
  * a copy whose `members` column has been passed through `compressMemberTypes`.
  */
def ensureCompressedMembers(input: DataFrame): DataFrame =
  hasCompressedMemberTypes(input) match {
    case true => input
    case false => input.withColumn("members", compressMemberTypes(col("members")))
  }
/** A member whose `type` is held as a string. */
case class StrMember(`type`: String, ref: Long, role: String)

/**
  * UDF turning byte-typed member rows into `StrMember`s via `Member.stringFromByte`.
  * A null input array is passed through as null.
  */
private val elaborateMembers = udf { members: Seq[Row] =>
  members match {
    case null => null
    case rows =>
      rows.map { row =>
        val memberType = Member.stringFromByte(row.getAs[Byte]("type"))
        StrMember(memberType, row.getAs[Long]("ref"), row.getAs[String]("role"))
      }
  }
}
// matches letters or emoji (no numbers or punctuation)
// NOTE(review): the emoji part is written as ranges of raw UTF-16 surrogate code units --
// confirm the ranges are interpreted as intended by the regex engine.
private val ContentMatcher: Regex =
  new Regex("""[\p{L}\uD83C-\uDBFF\uDC00-\uDFFF]""")

// matches a single colon at the very end of the input
private val TrailingPunctuationMatcher: Regex =
  new Regex("""[:]$""")
/**
  * UDF returning the distinct, lower-cased hashtags found in a comment, in order of first
  * appearance. Candidates without at least one `ContentMatcher` character are dropped, and a
  * trailing colon is stripped from the ones that remain.
  */
@transient lazy val extractHashtags: UserDefinedFunction = udf { comment: String =>
  val cleanedTags = for {
    hit <- HashtagMatcher.findAllMatchIn(comment)
    // the first group is the text after '#'
    tag = hit.group(1).toLowerCase
    // keep only tags with at least one substantive character
    if ContentMatcher.findFirstIn(tag).nonEmpty
  } yield TrailingPunctuationMatcher.replaceAllIn(tag, "") // strip trailing punctuation

  // materialise the iterator before de-duplicating
  cleanedTags.toList.distinct
}
/**
  * Hashtags contained in `comment` as an array column named `hashtags`; null or empty comments
  * produce an empty array without invoking the UDF.
  */
def hashtags(comment: Column): Column = {
  val hasText = comment.isNotNull && length(comment) > 0
  // only call the UDF when necessary
  when(hasText, extractHashtags(comment))
    .otherwise(typedLit(Seq.empty[String]))
    .as("hashtags")
}
/** True when the tag map has at least one key. The resulting column is named `isTagged`. */
def isTagged(tags: Column): Column = {
  val keyCount = size(map_keys(tags))
  (keyCount > 0).as("isTagged")
}
/**
  * True unless the lower-cased `building` tag is one of `FalsyValues`; a missing tag is treated
  * as `no`. The resulting column is named `isBuilding`.
  */
def isBuilding(tags: Column): Column = {
  val building = lower(coalesce(tags.getItem("building"), lit("no")))
  (!building.isin(FalsyValues: _*)).as("isBuilding")
}
/** UDF that is true when the tag map contains at least one of the `POITags` keys. */
@transient lazy val isPOI: UserDefinedFunction = udf { tags: Map[String, String] =>
  tags.keysIterator.exists(POITags.contains)
}
/** True when a `highway` tag is present. The resulting column is named `isRoad`. */
def isRoad(tags: Column): Column = {
  val highway = tags.getItem("highway")
  highway.isNotNull.as("isRoad")
}
/**
  * True when the values of the `natural` tag include `coastline`.
  * The resulting column is named `isCoastline`.
  */
def isCoastline(tags: Column): Column = {
  val naturalValues = splitDelimitedValues(tags.getItem("natural"))
  array_contains(naturalValues, "coastline").as("isCoastline")
}
/**
  * True when any value of the `waterway` tag is one of `WaterwayValues`.
  * The resulting column is named `isWaterway`.
  */
def isWaterway(tags: Column): Column = {
  val waterwayValues = splitDelimitedValues(tags.getItem("waterway"))
  val accepted = lit(WaterwayValues.toArray)
  array_intersects(waterwayValues, accepted).as("isWaterway")
}
/**
  * UDF combining two tag maps with `mergeMaps`. Each value is first wrapped in a singleton set,
  * sets are unioned where `mergeMaps` applies the combiner, and every resulting set is rendered
  * as a `;`-joined string.
  */
def mergeTags: UserDefinedFunction = udf { (a: Map[String, String], b: Map[String, String]) =>
  val left = a.mapValues(value => Set(value))
  val right = b.mapValues(value => Set(value))
  val combined = mergeMaps(left, right)(_ ++ _)
  combined.mapValues(values => values.mkString(";"))
}
/**
  * UDF folding a collection of tag maps into one map with `mergeMaps`. Each value is wrapped in
  * a singleton set, sets are unioned where `mergeMaps` applies the combiner, and every resulting
  * set is rendered as a `;`-joined string.
  *
  * An empty collection yields an empty map; a plain `reduce` would throw
  * `UnsupportedOperationException` for that input.
  */
val reduceTags: UserDefinedFunction = udf { tags: Iterable[Map[String, String]] =>
  tags
    .map(x => x.mapValues(Set(_)))
    // reduceOption returns None for empty input instead of throwing
    .reduceOption((a, b) => mergeMaps(a, b)(_ ++ _))
    .map(merged => merged.mapValues(_.mkString(";")))
    .getOrElse(Map.empty[String, String])
}
/** UDF that is true when the two arrays share at least one element. */
val array_intersects: UserDefinedFunction = udf { (a: Seq[_], b: Seq[_]) =>
  a.exists(element => b.contains(element))
}
// Tag keys that `_removeUninterestingTags` drops outright. Entries are lower-cased when the set
// is built, and `_removeUninterestingTags` lower-cases each key before looking it up.
//
// from the top 200 single-use tags in 20190610's history dump
// select k, count(*) from history
// cross join unnest(map_keys(tags)) as t (k)
// where cardinality(tags) = 1
// group by k
// order by count(*) desc
val UninterestingTags: Set[String] = Set(
"created_by",
"source",
"comment",
"_ID",
"CLC",
"odbl",
"origen",
"converted_by",
"todo",
"import_tools",
"ID",
"importuuid",
"attribution",
"curve_geometry",
"memphis_fixup",
"importance",
"description=ru-mos-325",
"stamväg",
"_FID_",
"1",
"_description_",
"ccpr",
"dfg",
// https://github.com/gravitystorm/openstreetmap-carto/blob/master/openstreetmap-carto.lua#L51
"source_ref",
"ref:UrbIS",
"accuracy:meters",
"sub_sea:type",
"waterway:type",
"statscan:rbuid",
"ref:ruian:addr",
"ref:ruian",
"building:ruian:type",
"dibavod:id",
"uir_adr:ADRESA_KOD",
"gst:feat_id",
"maaamet:ETAK",
"ref:FR:FANTOIR",
"3dshapes:ggmodelk",
"AND_nosr_r",
"OPPDATERIN",
"addr:city:simc",
"addr:street:sym_ul",
"building:usage:pl",
"building:use:pl",
"teryt:simc",
"raba:id",
"dcgis:gis_id",
"nycdoitt:bin",
"chicago:building_id",
"lojic:bgnum",
"massgis:way_id",
"OBJTYPE",
"SK53_bulk:load"
).map(_.toLowerCase())
/** Lower-cased tag keys that `_removeSemiInterestingTags` drops. */
val SemiInterestingTags: Set[String] =
  Set("source").map(tag => tag.toLowerCase())
// Key prefixes that `_removeUninterestingTags` drops: a lower-cased key is removed when it starts
// with one of these entries followed by ":". Entries are lower-cased when the set is built.
val UninterestingPrefixes: Set[String] = Set(
"CLC",
"tiger",
"sby",
"navibot",
"nps",
"hoot",
"error",
"project",
// https://github.com/gravitystorm/openstreetmap-carto/blob/master/openstreetmap-carto.lua#L51
"geobase",
"canvec",
"osak",
"kms",
"ngbe",
"it:fvg",
"KSJ2",
"yh",
"LINZ2OSM",
"LINZ",
"WroclawGIS",
"naptap",
"gnis",
"NHD",
"mvdgis"
).map(_.toLowerCase())
/** Lower-cased key prefixes (matched with a trailing `:`) dropped by `_removeSemiInterestingTags`. */
val SemiInterestingPrefixes: Set[String] =
  Set("source").map(prefix => prefix.toLowerCase())

/** Lower-cased keys that `_removeUninterestingTags` drops only when they are the sole tag. */
val UninterestingSingleTags: Set[String] =
  Set("colour").map(tag => tag.toLowerCase())
/**
  * Filters a tag map down to its "interesting" keys. A key (compared lower-cased) is dropped
  * when it is listed in `UninterestingTags`, when it is the only tag and is listed in
  * `UninterestingSingleTags`, when it starts with an `UninterestingPrefixes` entry followed by
  * `:`, or when it contains `=` or a space.
  */
private val _removeUninterestingTags = (tags: Map[String, String]) => {
  val isOnlyTag = tags.size == 1
  tags.filterKeys { rawKey =>
    val k = rawKey.toLowerCase
    val uninteresting =
      UninterestingTags.contains(k) ||
        (isOnlyTag && UninterestingSingleTags.contains(k)) ||
        UninterestingPrefixes.exists(prefix => k.startsWith(s"$prefix:")) ||
        k.contains("=") ||
        k.contains(" ")
    !uninteresting
  }
}

/** `_removeUninterestingTags` as a Spark UDF over a tag map. */
lazy val removeUninterestingTags: UserDefinedFunction = udf(_removeUninterestingTags)
/**
  * Filters out keys (compared lower-cased) that are listed in `SemiInterestingTags` or that
  * start with a `SemiInterestingPrefixes` entry followed by `:`.
  */
private val _removeSemiInterestingTags = (tags: Map[String, String]) =>
  tags.filterKeys { rawKey =>
    val k = rawKey.toLowerCase
    val semiInteresting =
      SemiInterestingTags.contains(k) ||
        SemiInterestingPrefixes.exists(prefix => k.startsWith(s"$prefix:"))
    !semiInteresting
  }

/** `_removeSemiInterestingTags` as a Spark UDF over a tag map. */
lazy val removeSemiInterestingTags: UserDefinedFunction = udf(_removeSemiInterestingTags)
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy