// com.johnsnowlabs.nlp.annotators.MultiDateMatcher.scala Maven / Gradle / Ivy
// The newest version!
/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
import org.apache.commons.lang3.time.DateUtils
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import java.text.SimpleDateFormat
import java.util.Calendar
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex
/** Matches standard date formats into a provided format.
*
* Reads the following kind of dates:
* {{{
* "1978-01-28", "1984/04/02", "1/02/1980", "2/28/79", "The 31st of April in the year 2008",
* "Fri, 21 Nov 1997", "Jan 21, ‘97", "Sun", "Nov 21", "jan 1st", "next thursday",
* "last wednesday", "today", "tomorrow", "yesterday", "next week", "next month",
* "next year", "day after", "the day before", "0600h", "06:00 hours", "6pm", "5:30 a.m.",
* "at 5", "12:59", "23:59", "1988/11/23 6pm", "next week at 7.30", "5 am tomorrow"
* }}}
*
* For example `"The 31st of April in the year 2008"` will be converted into `2008/04/31`.
*
* For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/MultiDateMatcherMultiLanguage_en.ipynb Examples]]
* and the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala MultiDateMatcherTestSpec]].
*
* ==Example==
* {{{
* import spark.implicits._
* import com.johnsnowlabs.nlp.base.DocumentAssembler
* import com.johnsnowlabs.nlp.annotators.MultiDateMatcher
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val date = new MultiDateMatcher()
* .setInputCols("document")
* .setOutputCol("date")
* .setAnchorDateYear(2020)
* .setAnchorDateMonth(1)
* .setAnchorDateDay(11)
*
* val pipeline = new Pipeline().setStages(Array(
* documentAssembler,
* date
* ))
*
* val data = Seq("I saw him yesterday and he told me that he will visit us next week")
* .toDF("text")
* val result = pipeline.fit(data).transform(data)
*
* result.selectExpr("explode(date) as dates").show(false)
* +-----------------------------------------------+
* |dates |
* +-----------------------------------------------+
* |[date, 57, 65, 2020/01/18, [sentence -> 0], []]|
* |[date, 10, 18, 2020/01/10, [sentence -> 0], []]|
* +-----------------------------------------------+
* }}}
*
* @param uid
* internal uid required to generate writable annotators
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupname Ungrouped Members
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class MultiDateMatcher(override val uid: String)
extends AnnotatorModel[MultiDateMatcher]
with HasSimpleAnnotate[MultiDateMatcher]
with DateMatcherUtils {
import com.johnsnowlabs.nlp.AnnotatorType._
/** Output annotator type: DATE. Every annotation built by `annotate` carries this type.
*
* @group anno
*/
override val outputAnnotatorType: AnnotatorType = DATE
/** Input annotator types: a single DOCUMENT column is required.
*
* @group anno
*/
override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
/** Auxiliary constructor: creates the annotator with a random UID generated from the
* "MULTI_DATE" prefix.
*/
def this() = this(Identifiable.randomUID("MULTI_DATE"))
/** Translates the text through `DateMatcherTranslator` when the configured source language is a
 * two-character code other than "en"; otherwise the text is returned untouched.
 *
 * @param text
 *   raw document text
 * @return
 *   the translated text, or `text` itself when no translation applies
 */
private def runTranslation(text: String) = {
  val language = getSourceLanguage
  val needsTranslation = language.length == 2 && language != "en"
  if (needsTranslation)
    new DateMatcherTranslator(MultiDatePolicy).translate(text, language)
  else
    text
}
/** Applies the rules of `factory` to the text, parses every rule match as a formal date and
 * keeps a single match per distinct calendar value.
 *
 * @param text
 *   text to search
 * @param factory
 *   rule factory holding the rules to apply
 * @return
 *   one parsed match per distinct calendar; the order follows the iteration order of the
 *   intermediate grouping
 */
private def findByInputFormatsRules(text: String, factory: RuleFactory): Seq[MatchedDateTime] = {
  val parsedMatches = factory.findMatch(text).map(ruleMatch => formalDateContentParse(ruleMatch))
  // Matches sharing the same calendar collapse to the first one found for that calendar.
  parsedMatches.groupBy(_.calendar).values.map(_.head).toSeq
}
/** Searches the text using only the user-configured input formats.
 *
 * Each configured format that is known to `formalInputFormats` is resolved to its regex,
 * registered on `formalFactoryInputFormats`, and the text is then searched with that factory.
 * Formats unknown to `formalInputFormats` are silently ignored.
 *
 * NOTE(review): the rules are added to the shared `formalFactoryInputFormats` on every call.
 * If `addRule` appends to the factory's rule list, rules pile up across calls — confirm
 * against `RuleFactory` and consider registering them once.
 *
 * @param text
 *   text to search
 * @return
 *   the parsed matches, one per distinct calendar value
 */
def runInputFormatsSearch(text: String): Seq[MatchedDateTime] = {
  val regexes: Array[Regex] =
    for (format <- getInputFormats if formalInputFormats.contains(format))
      yield formalInputFormats(format)

  regexes.foreach { regex =>
    formalFactoryInputFormats.addRule(regex, "formal rule from input formats")
  }

  findByInputFormatsRules(text, formalFactoryInputFormats)
}
/** Runs every extraction strategy over the text, in the fixed order listed below, and merges
 * their results.
 *
 * A match is dropped when an already accepted match (from an earlier strategy, or earlier in
 * the same strategy) begins at the same start index; only the start index is compared.
 *
 * @param _text
 *   text to search
 * @return
 *   the accepted matches, in the order in which they were accepted
 */
def runDateExtractorChain(_text: String): Seq[MatchedDateTime] = {
  val extractors = Seq[String => Seq[MatchedDateTime]](
    extractFormalDate(_),
    extractRelativeDatePast(_),
    extractRelativeDateFuture(_),
    extractRelaxedDate(_),
    extractRelativeDate(_),
    extractTomorrowYesterday(_),
    extractRelativeExactDay(_))

  extractors.foldLeft(Seq.empty[MatchedDateTime]) { (accepted, extractor) =>
    // Each strategy is only invoked when the fold reaches it.
    extractor(_text).foldLeft(accepted) { (kept, candidate) =>
      // Earlier results win: a candidate starting where a kept match starts is ignored.
      val startAlreadyTaken = kept.exists(_.start == candidate.start)
      if (startAlreadyTaken) kept else kept :+ candidate
    }
  }
}
/** Finds every date in the text.
 *
 * The text is first passed through `runTranslation`. When input formats are configured, only
 * those formats are searched; if that search finds nothing and aggressive matching is enabled,
 * the full extractor chain is used as a fallback. When no input formats are configured, the
 * full extractor chain is run directly (exactly once).
 *
 * @param text
 *   input text coming from target document
 * @return
 *   all date matches found, empty if there are none
 */
private[annotators] def extractDate(text: String): Seq[MatchedDateTime] = {
  val _text: String = runTranslation(text)
  val inputFormatsAreDefined = !getInputFormats.sameElements(EMPTY_INIT_ARRAY)

  if (inputFormatsAreDefined) {
    val inputFormatMatches = runInputFormatsSearch(_text)
    // Fall back to the generic chain only when the restricted search came back empty.
    if (getAggressiveMatching && inputFormatMatches.isEmpty) runDateExtractorChain(_text)
    else inputFormatMatches
  } else {
    // The chain is the primary search here; re-running it on an empty result would only
    // repeat the same work, so no aggressive fallback is attempted on this path.
    runDateExtractorChain(_text)
  }
}
/** Runs the relative-future rules (`relativeFutureFactory`) over the lower-cased text.
 *
 * As a cheap pre-filter, the rules are only applied when the lower-cased text contains `in`
 * followed, optionally after whitespace, by a digit. The pre-filter is evaluated on the same
 * lower-cased text the rules see, so capitalised input such as "In 3 days" is not skipped.
 *
 * @param text
 *   text to search
 * @return
 *   the parsed matches, empty when the pre-filter or the rules find nothing
 */
private def extractRelativeDateFuture(text: String): Seq[MatchedDateTime] = {
  val lowered = text.toLowerCase()
  // Same condition as "(.*)\\s*in\\s*[0-9](.*)" for an unanchored search: the surrounding
  // groups can match the empty string, so they only added backtracking work.
  if ("in\\s*[0-9]".r.findFirstIn(lowered).isDefined)
    relativeFutureFactory
      .findMatch(lowered)
      .map(possibleDate => relativeDateFutureContentParse(possibleDate))
  else
    Seq.empty
}
/** Runs the relative-past rules (`relativePastFactory`) over the lower-cased text.
 *
 * As a cheap pre-filter, the rules are only applied when the lower-cased text contains a digit
 * that is later followed by `ago`. The pre-filter is evaluated on the same lower-cased text the
 * rules see, so input such as "3 days AGO" is not skipped.
 *
 * @param text
 *   text to search
 * @return
 *   the parsed matches, empty when the pre-filter or the rules find nothing
 */
private def extractRelativeDatePast(text: String): Seq[MatchedDateTime] = {
  val lowered = text.toLowerCase()
  // Same condition as "(.*)\\s*[0-9]\\s*(.*)\\s*(ago)(.*)" for an unanchored search: the
  // leading and trailing groups can match the empty string, so they are dropped.
  if ("[0-9]\\s*.*\\s*ago".r.findFirstIn(lowered).isDefined)
    relativePastFactory
      .findMatch(lowered)
      .map(possibleDate => relativeDatePastContentParse(possibleDate))
  else
    Seq.empty
}
/** Applies the formal date rules (`formalFactory`) to the text, parses each rule match and
 * passes the parsed matches through `regularizeFormalDateMatches`.
 *
 * @param text
 *   text to search
 * @return
 *   the regularized formal date matches
 */
private def extractFormalDate(text: String): Seq[MatchedDateTime] = {
  val parsedMatches =
    for (ruleMatch <- formalFactory.findMatch(text)) yield formalDateContentParse(ruleMatch)
  regularizeFormalDateMatches(parsedMatches)
}
/** Returns a function that removes month-truncated duplicates from formal date matches.
 *
 * A match is removed when its calendar equals the month-level truncation
 * (`DateUtils.truncate(_, Calendar.MONTH)`) of another match that carries more than month
 * precision. Matches whose calendar already equals its own month truncation never cause a
 * removal; otherwise two matches with the same first-of-month date would each look like the
 * truncated form of the other and both would be dropped.
 *
 * @return
 *   a function from all formal matches to the matches that survive, in their original order
 */
private def regularizeFormalDateMatches: Seq[MatchedDateTime] => Seq[MatchedDateTime] =
  allFormalDateMatches => {
    // Month truncations contributed only by matches that are more specific than a bare month.
    val monthTruncations: Seq[Calendar] =
      for {
        detailed <- allFormalDateMatches
        truncated = DateUtils.truncate(detailed.calendar, Calendar.MONTH)
        if !truncated.equals(detailed.calendar)
      } yield truncated

    allFormalDateMatches.filterNot { candidate =>
      // If true, the candidate is the truncated form of a more specific match.
      monthTruncations.exists(_.equals(candidate.calendar))
    }
  }
/** Applies the relaxed rules (`relaxedFactory`) to the text, groups the rule matches by their
 * `indexMatch` and turns each group into at most one date through `computePossibleDates`.
 *
 * @param text
 *   text to search
 * @return
 *   the dates computed from all groups; the order follows the iteration order of the grouping
 */
private def extractRelaxedDate(text: String): Seq[MatchedDateTime] = {
  val matchesByIndex = relaxedFactory.findMatch(text).groupBy(_.indexMatch)
  matchesByIndex.values.flatMap(group => computePossibleDates(group)).toSeq
}
/** Combines the relaxed rule matches of one group into at most one date.
 *
 * Day, month and year start from the configured defaults and are overridden by matches whose
 * identifier is "relaxed days", "relaxed months exclusive" or "relaxed year" respectively (a
 * later match overrides an earlier one). A date is produced only when more than one such
 * component was recognised.
 *
 * @param possibleDates
 *   rule matches belonging to the same group
 * @return
 *   a single-element sequence spanning from the smallest start to the largest end of the
 *   matches, or an empty sequence
 */
private def computePossibleDates(
    possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = {
  // State threaded through the fold: (day, month, year, number of recognised components).
  val defaults = ($(defaultDayWhenMissing), defaultMonthWhenMissing, defaultYearWhenMissing, 0)

  val (day, month, year, recognised) = possibleDates.foldLeft(defaults) {
    case (state @ (d, m, y, count), ruleMatch) =>
      def matchedText = ruleMatch.content.matched
      ruleMatch.identifier match {
        case "relaxed days" if matchedText.exists(_.isDigit) =>
          (matchedText.filter(_.isDigit).toInt, m, y, count + 1)
        case "relaxed months exclusive" if matchedText.length > 2 =>
          val abbreviation = matchedText.toLowerCase().take(3)
          // The component is counted even when the abbreviation is unknown; the month itself
          // only changes when the abbreviation is listed in shortMonths.
          val resolvedMonth =
            if (shortMonths.contains(abbreviation)) shortMonths.indexOf(abbreviation) else m
          (d, resolvedMonth, y, count + 1)
        case "relaxed year" if matchedText.exists(_.isDigit) && matchedText.length > 2 =>
          val digits = matchedText.filter(_.isDigit).toInt
          // Values up to 999 are shifted by 1900.
          (d, m, if (digits > 999) digits else digits + 1900, count + 1)
        case _ => state
      }
  }

  if (possibleDates.isEmpty || recognised <= 1) Seq.empty
  else
    Seq(
      MatchedDateTime(
        new Calendar.Builder().setDate(year, month, day).build(),
        possibleDates.map(_.content.start).min,
        possibleDates.map(_.content.end).max))
}
/** Applies the relative rules (`relativeFactory`) to the text and parses every rule match with
 * `relativeDateContentParse`.
 */
private def extractRelativeDate(text: String): Seq[MatchedDateTime] =
  for (ruleMatch <- relativeFactory.findMatch(text)) yield relativeDateContentParse(ruleMatch)
/** Applies the `tyFactory` rules to the text and parses every rule match with
 * `tomorrowYesterdayContentParse`.
 */
private def extractTomorrowYesterday(text: String): Seq[MatchedDateTime] =
  for (ruleMatch <- tyFactory.findMatch(text)) yield tomorrowYesterdayContentParse(ruleMatch)
/** Applies the `relativeExactFactory` rules to the lower-cased text and parses every rule match
 * with `relativeExactContentParse`.
 */
private def extractRelativeExactDay(text: String): Seq[MatchedDateTime] = {
  val lowered = text.toLowerCase
  for (ruleMatch <- relativeExactFactory.findMatch(lowered))
    yield relativeExactContentParse(ruleMatch)
}
/** Produces one DATE annotation for every date found in every input annotation.
 *
 * @param annotations
 *   input annotations whose `result` holds the text to search
 * @return
 *   the found dates, formatted with the pattern returned by `getOutputFormat`; empty if no
 *   date is found. Each output annotation reuses the metadata of the annotation it came from.
 */
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
  // A fresh formatter per call: SimpleDateFormat instances are not safe to share across threads.
  val formatter = new SimpleDateFormat(getOutputFormat)
  for {
    document <- annotations
    found <- extractDate(document.result)
  } yield Annotation(
    outputAnnotatorType,
    found.start,
    // The annotation end is reported as one less than the match end.
    found.end - 1,
    formatter.format(found.calendar.getTime),
    document.metadata)
}
}
/** This is the companion object of [[MultiDateMatcher]]. Please refer to that class for the
* documentation. It mixes in Spark ML's `DefaultParamsReadable` for [[MultiDateMatcher]].
*/
object MultiDateMatcher extends DefaultParamsReadable[MultiDateMatcher]