
// com.johnsnowlabs.nlp.annotators.DateMatcher.scala (Maven / Gradle / Ivy)
package com.johnsnowlabs.nlp.annotators
import java.text.SimpleDateFormat
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType}
import scala.util.matching.Regex
import java.util.Calendar
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
/**
  * Matches standard date formats into a provided format.
  *
  * The `dateFormat` param defines the expected output format and follows the
  * [[java.text.SimpleDateFormat]] standard (default `yyyy/MM/dd`).
  *
  * Extraction is attempted from the most formal to the most relaxed notation:
  * formal numeric dates, relaxed dates ("March 2nd"), relative periods ("next week"),
  * relative days ("tomorrow") and relative week days ("next mon"). A time of day found in
  * the text is merged into the detected date; when no date is found, a stand-alone time is
  * returned on the current day.
  *
  * @param uid internal uid required to generate writable annotators
  */
class DateMatcher(override val uid: String) extends AnnotatorModel[DateMatcher] {

  import com.johnsnowlabs.nlp.AnnotatorType._

  /**
    * Container of a parsed date with identified bounds
    * @param calendar [[Calendar]] holding parsed date
    * @param start start bound of detected match
    * @param end end bound of detected match
    */
  private[annotators] case class MatchedDateTime(calendar: Calendar, start: Int, end: Int)

  /**
    * Standard formal dates, e.g. 05/17/2014 or 17/05/2014 or 2014/05/17.
    * Month is restricted to 1-12 and day to 1-31 so that a day-first date such as 17/05/2014
    * is not swallowed by the month-first rule (a lenient calendar would silently roll
    * "month 17" over into the following year).
    */
  private val formalDate = new Regex(
    "\\b(0?[1-9]|1[0-2])[-/](0?[1-9]|[12][0-9]|3[01])[-/](\\d{2,4})\\b", "month", "day", "year")
  private val formalDateAlt = new Regex(
    "\\b(0?[1-9]|[12][0-9]|3[01])[-/](0?[1-9]|1[0-2])[-/](\\d{2,4})\\b", "day", "month", "year")
  private val formalDateAlt2 = new Regex(
    "\\b(\\d{2,4})[-/](0?[1-9]|1[0-2])[-/](0?[1-9]|[12][0-9]|3[01])\\b", "year", "month", "day")

  private val months = Seq(
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december")
  private val shortMonths = Seq(
    "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec")

  /** Relaxed dates, e.g. March 2nd */
  private val relaxedDayNumbered = "\\b(\\d{1,2})(?:st|rd|nd|th)*\\b".r
  private val relaxedMonths =
    "(?i)" + months.zip(shortMonths).map { case (full, short) => full + "|" + short }.mkString("|")
  private val relaxedYear = "\\d{4}\\b|\\B'\\d{2}\\b".r

  /** Relative dates, e.g. tomorrow */
  private val relativeDate = "(?i)(next|last)\\s(week|month|year)".r

  /**
    * Longer phrases are listed first: regex alternation takes the first alternative that
    * matches, so "day after" placed before "day after tomorrow" would shadow it.
    */
  private val relativeDay =
    "(?i)(day before yesterday|day after tomorrow|past tomorrow|day before|day after|today|tomorrow|yesterday)".r

  /** Only working days (mon-fri) are recognized. */
  private val relativeExactDay = "(?i)(next|last|past)\\s(mon|tue|wed|thu|fri)".r

  /** standard time representations e.g. 05:42:16 or 5am */
  private val clockTime = new Regex("(?i)([0-2][0-9]):([0-5][0-9])(?::([0-5][0-9]))?", "hour", "minutes", "seconds")
  private val altTime = new Regex("([0-2]?[0-9])\\.([0-5][0-9])\\.?([0-5][0-9])?", "hour", "minutes", "seconds")
  private val coordTime = new Regex("([0-2]?[0-9])([0-5][0-9])?\\.?([0-5][0-9])?\\s*(?:h|a\\.?m|p\\.?m)", "hour", "minutes", "seconds")
  private val refTime = new Regex("at\\s+([0-9])\\s*([0-5][0-9])*\\s*([0-5][0-9])*")
  private val amDefinition = "(?i)(a\\.?m)".r

  /** Day offsets, relative to the processing day, of every phrase accepted by [[relativeDay]] */
  private val relativeDayOffsets: Map[String, Int] = Map(
    "today" -> 0,
    "tomorrow" -> 1,
    "past tomorrow" -> 2,
    "yesterday" -> -1,
    "day after" -> 1,
    "day before" -> -1,
    "day after tomorrow" -> 2,
    "day before yesterday" -> -2
  )

  /** [[Calendar]] day-of-week constants of every abbreviation accepted by [[relativeExactDay]] */
  private val weekDays: Map[String, Int] = Map(
    "mon" -> Calendar.MONDAY,
    "tue" -> Calendar.TUESDAY,
    "wed" -> Calendar.WEDNESDAY,
    "thu" -> Calendar.THURSDAY,
    "fri" -> Calendar.FRIDAY
  )

  /** Annotator param containing expected output format of parsed date */
  val dateFormat: Param[String] = new Param(this, "dateFormat", "SimpleDateFormat standard criteria")

  override val outputAnnotatorType: AnnotatorType = DATE

  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)

  setDefault(
    inputCols -> Array(DOCUMENT),
    dateFormat -> "yyyy/MM/dd"
  )

  /** Internal constructor to submit a random UID */
  def this() = this(Identifiable.randomUID("DATE"))

  def getFormat: String = $(dateFormat)

  def setFormat(value: String): this.type = set(dateFormat, value)

  /**
    * Locale-independent lower-casing. The matchers are case-insensitive, so captured text
    * has to be normalized before it is compared with lower-case keywords.
    */
  private def lower(value: String): String = value.toLowerCase(java.util.Locale.ROOT)

  /**
    * Finds dates in a specific order, from formal to more relaxed. Adds the time of day if
    * any is found, or falls back to a stand-alone time.
    *
    * When both a date and a time are found, the returned bounds remain those of the date,
    * so annotation offsets do not depend on whether a time was present.
    *
    * @param text input text coming from target document
    * @return a possible date-time match
    */
  private[annotators] def extractDate(text: String): Option[MatchedDateTime] = {
    val possibleDate = extractFormalDate(text)
      .orElse(extractRelaxedDate(text))
      .orElse(extractRelativeDate(text))
      .orElse(extractTomorrowYesterday(text))
      .orElse(extractRelativeExactDay(text))

    possibleDate match {
      case Some(date) =>
        // Merge the time into the date; keep the plain date when the text holds no time.
        setTimeIfAny(possibleDate, text)
          .map(_.copy(start = date.start, end = date.end))
          .orElse(possibleDate)
      case None =>
        setTimeIfAny(None, text)
    }
  }

  /**
    * Searches formal date by ordered rules
    * Matching strategy is to find first match only, ignore additional matches from then
    * Any 4 digit year will be assumed a year, any 2 digit year will be as part of XX Century e.g. 1954
    */
  private val formalFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
    .addRule(formalDate, "formal date matcher with year at first")
    .addRule(formalDateAlt, "formal date with year at end")
    .addRule(formalDateAlt2, "formal date with day at beginning")

  private def extractFormalDate(text: String): Option[MatchedDateTime] =
    formalFactory.findMatchFirstOnly(text).map { possibleDate =>
      val found = possibleDate.content
      val writtenYear = found.group("year").toInt
      val year =
        if (writtenYear > 999) writtenYear
        // A short year more than 10 years ahead of the current one is assumed to be 20th century
        else if (writtenYear > Calendar.getInstance.get(Calendar.YEAR) % 100 + 10) writtenYear + 1900
        else writtenYear + 2000
      val calendar = new Calendar.Builder()
        // Calendar months are zero based
        .setDate(year, found.group("month").toInt - 1, found.group("day").toInt)
        .build()
      MatchedDateTime(calendar, found.start, found.end)
    }

  /**
    * Searches relaxed dates by ordered rules by more exhaustive to less
    * Strategy used is to match first only. any other matches discarded
    * Auto completes short versions of months. Any two digit year is considered to be XX century
    */
  private val relaxedFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
    .addRule(relaxedDayNumbered, "relaxed days")
    .addRule(relaxedMonths.r, "relaxed months exclusive")
    .addRule(relaxedYear, "relaxed year")

  private def extractRelaxedDate(text: String): Option[MatchedDateTime] = {
    val possibleDates = relaxedFactory.findMatch(text)
    if (possibleDates.length > 1) {
      // Matches are read positionally: day first, month second, year (if any) last.
      val dayMatch = possibleDates.head.content
      val monthMatch = possibleDates(1).content
      val yearMatch = possibleDates.last.content
      val dayDigits = dayMatch.matched.filter(_.isDigit)
      val month = shortMonths.indexOf(lower(monthMatch.matched).take(3))
      // Reject combinations lacking a numeric day or a known month: parsing them would either
      // throw on an empty number or feed month -1 to a lenient calendar.
      if (dayDigits.isEmpty || month < 0) None
      else {
        val year =
          if (possibleDates.length > 2) {
            val number = yearMatch.matched.filter(_.isDigit).toInt
            if (number > 999) number else number + 1900
          } else Calendar.getInstance.get(Calendar.YEAR)
        val calendar = new Calendar.Builder().setDate(year, month, dayDigits.toInt).build()
        val matches = Seq(yearMatch, monthMatch, dayMatch)
        Some(MatchedDateTime(calendar, matches.map(_.start).min, matches.map(_.end).max))
      }
    } else None
  }

  /**
    * extracts relative dates. Strategy is to get only first match.
    * Will always assume relative day from current time at processing
    * ToDo: Support relative dates from input date
    */
  private val relativeFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
    .addRule(relativeDate, "relative dates")

  private def extractRelativeDate(text: String): Option[MatchedDateTime] =
    relativeFactory.findMatchFirstOnly(text).map { possibleDate =>
      val found = possibleDate.content
      // Lower-case the groups: "Next WEEK" is accepted by the case-insensitive pattern
      val amount = if (lower(found.group(1)) == "next") 1 else -1
      val field = lower(found.group(2)) match {
        case "week" => Calendar.WEEK_OF_MONTH
        case "month" => Calendar.MONTH
        case "year" => Calendar.YEAR
      }
      val calendar = Calendar.getInstance()
      calendar.add(field, amount)
      MatchedDateTime(calendar, found.start, found.end)
    }

  /** Searches for relative informal dates such as today or the day after tomorrow */
  private val tyFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
    .addRule(relativeDay, "relative days")

  private def extractTomorrowYesterday(text: String): Option[MatchedDateTime] =
    tyFactory.findMatchFirstOnly(text).map { possibleDate =>
      val found = possibleDate.content
      val calendar = Calendar.getInstance()
      calendar.add(Calendar.DAY_OF_MONTH, relativeDayOffsets(lower(found.matched)))
      MatchedDateTime(calendar, found.start, found.end)
    }

  /** Searches for exactly provided days of the week. Always relative from current time at processing */
  private val relativeExactFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
    .addRule(relativeExactDay, "relative precise dates")

  private def extractRelativeExactDay(text: String): Option[MatchedDateTime] =
    relativeExactFactory.findMatchFirstOnly(text).map { possibleDate =>
      val found = possibleDate.content
      // "next" walks forward; "last" and "past" walk backwards
      val step = if (lower(found.group(1)) == "next") 1 else -1
      val targetDay = weekDays(lower(found.group(2)))
      val calendar = Calendar.getInstance()
      // Always move at least one day so that the current day itself is never returned
      calendar.add(Calendar.DAY_OF_MONTH, step)
      while (calendar.get(Calendar.DAY_OF_WEEK) != targetDay) {
        calendar.add(Calendar.DAY_OF_MONTH, step)
      }
      MatchedDateTime(calendar, found.start, found.end)
    }

  /**
    * Rules used to search for times of the day, tried in order
    */
  private val timeFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
    .addRule(clockTime, "standard time extraction")
    .addRule(altTime, "alternative time format")
    .addRule(coordTime, "coordinate like time")
    .addRule(refTime, "referred time")

  /**
    * Searches for times of the day.
    *
    * An hour below 12 is assumed to be PM unless an AM marker appears anywhere in the text
    * (e.g. "meet you at 5" is 17:00, "5am" is 05:00). Minutes and seconds default to 0 when
    * they are missing or out of range.
    *
    * @param dateTime if any date was found previously, its year, month and day are kept;
    *                 otherwise the current day is used
    * @param text target document
    * @return the date with the found time applied, bounded by the time match; None if the
    *         text holds no time
    */
  private def setTimeIfAny(dateTime: Option[MatchedDateTime], text: String): Option[MatchedDateTime] =
    timeFactory.findMatchFirstOnly(text).map { possibleTime =>
      val timeMatch = possibleTime.content
      val baseDate = dateTime.map(_.calendar).getOrElse(Calendar.getInstance)
      // Optional groups that did not take part in the match are null
      val times = timeMatch.subgroups
      val explicitAm = amDefinition.findFirstIn(text).isDefined

      val hour = Option(times.head).map(_.toInt) match {
        case Some(h) if h < 12 && !explicitAm => h + 12
        case Some(h) if h < 25 => h
        case _ => 0
      }

      def sexagesimal(index: Int): Int =
        Option(times(index)).map(_.toInt).filter(_ < 60).getOrElse(0)

      val calendar = new Calendar.Builder()
        .setDate(
          baseDate.get(Calendar.YEAR),
          baseDate.get(Calendar.MONTH),
          baseDate.get(Calendar.DAY_OF_MONTH)
        )
        .setTimeOfDay(hour, sexagesimal(1), sexagesimal(2))
        .build()
      MatchedDateTime(calendar, timeMatch.start, timeMatch.end)
    }

  /**
    * One to one relationship between content document and output annotation
    * @return Any found date, empty if not. Final format is [[dateFormat]] or default yyyy/MM/dd
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    // SimpleDateFormat is not thread-safe, hence one instance per call
    val simpleDateFormat = new SimpleDateFormat(getFormat)
    annotations.flatMap { annotation =>
      extractDate(annotation.result).map { matchedDate =>
        Annotation(
          outputAnnotatorType,
          matchedDate.start,
          // match end is exclusive, annotation end is inclusive
          matchedDate.end - 1,
          simpleDateFormat.format(matchedDate.calendar.getTime),
          Map.empty[String, String]
        )
      }
    }
  }
}
/**
  * Companion object of [[DateMatcher]], mixing in [[DefaultParamsReadable]] for that type.
  * NOTE(review): presumably this is what lets a persisted DateMatcher be read back through
  * Spark ML's default params reader — confirm against the persistence tests.
  */
object DateMatcher extends DefaultParamsReadable[DateMatcher]
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy