All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.xiaomi.duckling.engine.Engine.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2020, Xiaomi and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.xiaomi.duckling.engine

import com.typesafe.scalalogging.LazyLogging

import com.xiaomi.duckling.Document
import com.xiaomi.duckling.Types._
import com.xiaomi.duckling.dimension.implicits._
import com.xiaomi.duckling.dimension.time.Prods.limitedSequenceByRange
import com.xiaomi.duckling.engine.LexiconLookup.{lookupLexicon, lookupLexiconAnywhere}
import com.xiaomi.duckling.engine.MultiCharLookup.{lookupMultiChar, lookupMultiCharAnywhere}
import com.xiaomi.duckling.engine.PhraseLookup._
import com.xiaomi.duckling.engine.RegexLookup._
import com.xiaomi.duckling.engine.VarcharLookup._
import com.xiaomi.duckling.types.{LanguageInfo, Node}

object Engine extends LazyLogging {

  // Verbose-logging switches, read once from the `engine.verbose` section of `conf`.
  private val verbose = conf.getConfig("engine.verbose")
  private val verboseParse = verbose.getBoolean("parse")
  private val verboseMatch = verbose.getBoolean("match")
  private val verboseProduce = verbose.getBoolean("produce")
  private val verboseLookup = verbose.getBoolean("lookup")

  /**
    * A (possibly partial) application of a rule: `(rule, endPosition, reversedRoute)`.
    *
    *  - `rule` carries the pattern items that still have to be matched;
    *    a match is full once that pattern is empty.
    *  - `endPosition` is the end of the range of the most recently matched node.
    *  - `reversedRoute` holds the nodes matched so far, most recent first.
    */
  type Match = (Rule, Int, List[Node])

  /**
    * Parses the sentence carried by `lang` and returns the distinct nodes found,
    * in the order given by `Stash.toPosOrderedList`.
    *
    * @param rules   rules to apply
    * @param lang    analysed input; its `sentence` is the text being parsed
    * @param options parse options; when `options.full` is set only nodes whose range
    *                satisfies `rangeEq(0, sentence.length)` are kept
    * @return the distinct nodes produced by the rules
    */
  def parse(rules: List[Rule],
            lang: LanguageInfo,
            options: Options): List[Node] = {
    val input = lang.sentence
    val doc = Document.fromLang(lang)
    val stash = parseString(rules, doc, options)
    val orderedList = stash.toPosOrderedList()
    val full =
      if (options.full) orderedList.filter(r => r.range.rangeEq(0, input.length)) else orderedList
    full.distinct
  }

  /**
    * Parses `doc` and resolves every node that can be resolved in `context`.
    *
    * @param rules   rules to apply
    * @param doc     document to parse
    * @param context resolution context handed to each token's `resolve`
    * @param options parse options; when `options.full` is set only tokens whose range
    *                satisfies `rangeEq(0, doc.rawInput.length)` are kept
    * @return the distinct resolved tokens
    */
  def parseAndResolve(rules: List[Rule],
                      doc: Document,
                      context: Context,
                      options: Options): List[ResolvedToken] = {
    val input = doc.rawInput
    val stash = parseString(rules, doc, options)
    val orderedList = stash.toPosOrderedList()
    val rs = orderedList.flatMap(resolveNode(doc, context, options))
    val full =
      if (options.full) rs.filter(r => r.range.rangeEq(0, input.length)) else rs
    full.distinct
  }

  /**
    * Runs the rules over `doc` until no new node is produced.
    *
    * @param rules   rules to apply
    * @param doc     document to parse
    * @param options parse options
    * @return the stash of all produced nodes; empty when the first pass yields nothing
    */
  def parseString(rules: List[Rule], doc: Document, options: Options): Stash = {
    // On the first pass we try all the rules.
    val (new_, partialMatches) =
      parseString1(rules, doc, Stash.empty(), Stash.empty(), Nil, options)
    // For subsequent passes, we only try rules starting with a predicate.
    if (new_.isEmpty) Stash.empty()
    else {
      val headPredicateRules = rules.filter {
        case Rule(_, ItemPredicate(_) :: _, _, _) => true
        case _                                    => false
      }
      saturateParseString(headPredicateRules, doc, new_, new_, partialMatches, options)
    }
  }

  /** Logs `ms` under `label`, one entry per line (used by the verbose parse dump). */
  private def logMatches(label: String, ms: List[Match]): Unit =
    if (ms.isEmpty) logger.info(s"$label: []")
    else {
      logger.info(s"$label: [")
      ms.foreach(m => logger.info(s" - $m"))
      logger.info(s"$label: ]")
    }

  /**
    * Finds new matches resulting from newly added tokens.
    * Produces new tokens from full matches.
    *
    * @param rules   rules that may start a new match on this pass
    * @param doc     document being parsed
    * @param stash   nodes accumulated so far, used to advance matches in `matchAll`
    * @param new_    nodes added by the previous pass
    * @param matches partial matches carried over from previous passes
    * @param options parse options (`rankOptions.nodesLimit`, `rankOptions.sequence1EndsPrune`)
    * @return the stash of nodes produced on this pass, and the partial matches to carry
    *         into the next pass (new partial matches followed by `matches`)
    */
  def parseString1(rules: List[Rule],
                   doc: Document,
                   stash: Stash,
                   new_ : Stash,
                   matches: List[Match],
                   options: Options): (Stash, List[Match]) = {
    // Recursively match patterns.
    // Find which `matches` can advance because of `new`.
    val newPartial = matches.flatMap(matchFirst(doc, new_))

    // Find new matches resulting from newly added tokens (`new`)
    val newMatches = rules.flatMap(matchFirstAnywhere(doc, new_))

    val (full, partial) =
      matchAll(doc, stash, newPartial ++ newMatches, options.rankOptions.nodesLimit).partition {
        case (Rule(_, pattern, _, _), _, _) => pattern.isEmpty
      }
    if (verboseParse) {
      logMatches("full", full)
      // The empty case used to be logged under the "full" label by mistake.
      logMatches("partial", partial)
    }
    val _matches =
      if (options.rankOptions.sequence1EndsPrune) limitedSequenceByRange(full, doc.validSequenceHeads, options)
      else full.flatMap(produce(options)).distinct
    (Stash.fromList(_matches), partial ++ matches)
  }

  /**
    * Produces all tokens recursively: repeats `parseString1` until a pass adds no new node.
    *
    * @param rules    rules tried on every pass
    * @param sentence document being parsed
    * @param stash    nodes accumulated so far
    * @param new_     nodes added by the previous pass
    * @param matches  partial matches carried over from previous passes
    * @param options  parse options
    * @return `stash` extended with every node produced by the successive passes
    */
  @scala.annotation.tailrec
  def saturateParseString(rules: List[Rule],
                          sentence: Document,
                          stash: Stash,
                          new_ : Stash,
                          matches: List[Match],
                          options: Options): Stash = {
    val (new__, matches_) = parseString1(rules, sentence, stash, new_, matches, options)
    if (new__.isEmpty) stash
    else saturateParseString(rules, sentence, stash.union(new__), new__, matches_, options)
  }

  /**
    * Resolves a single node into a [[ResolvedToken]].
    *
    * When `options.varcharExpand` is set the node first goes through `endsVarcharExpansion`.
    *
    * @param doc     document the node belongs to
    * @param context resolution context
    * @param options parse options
    * @param node    node to resolve
    * @return the resolved token, or `None` when the node is not valid for `doc`
    *         or its data does not resolve
    */
  def resolveNode(doc: Document, context: Context, options: Options)(
    node: Node
  ): Option[ResolvedToken] = {
    val unode @ Node(r, Token(_, data), _, _, _, _) =
      if (options.varcharExpand) endsVarcharExpansion(doc, node, options) else node
    if (unode.isValid(doc)) {
      data.resolve(context, options).map {
        case (value, latent) =>
          ResolvedToken(range = r, node = unode, value = value, isLatent = latent)
      }
    } else None
  }

  /**
    * Returns all matches matching the first pattern item of `match`, resuming from a Match position
    *
    * @param sentence document being parsed
    * @param stash    nodes available to predicate items
    * @param `match`  match to advance
    * @return one advanced match per node found for the first remaining pattern item;
    *         `Nil` when the match is already full
    */
  def matchFirst(sentence: Document, stash: Stash)(`match`: Match): List[Match] = {
    val (rule, position, route) = `match`
    rule.pattern match {
      case Nil => Nil
      case p :: ps =>
        val newRule = rule.copy(pattern = ps)
        if (verboseMatch) {
          logger.info(s"match first - apply rule of: ${rule.name}")
        }
        lookupItem(sentence, p, stash, position).map(mkMatch(route, newRule))
    }
  }

  /**
    * Returns all matches matching the first pattern item of `match`,
    * starting anywhere
    *
    * @param sentence document being parsed
    * @param stash    nodes available to predicate items
    * @param rule     rule whose first pattern item is looked up
    * @return one match per node found for the first pattern item, each with a
    *         single-node route; `Nil` when the rule has an empty pattern
    */
  def matchFirstAnywhere(sentence: Document, stash: Stash)(rule: Rule): List[Match] =
    rule.pattern match {
      case Nil => Nil
      case p :: ps =>
        lookupItemAnywhere(sentence, p, stash).map(mkMatch(Nil, rule.copy(pattern = ps)))
    }

  /**
    * Looks up every node matching `patternItem`, regardless of position.
    *
    * @param doc         document being parsed
    * @param patternItem pattern item to look up
    * @param stash       nodes tested against predicate items
    * @return the matching nodes
    */
  def lookupItemAnywhere(doc: Document, patternItem: PatternItem, stash: Stash): List[Node] = {
    patternItem match {
      case ItemRegex(re) => lookupRegexAnywhere(doc, re)
      case ItemPredicate(p) =>
        // Build the total predicate once instead of once per node.
        val accepts = p orElse emptyPredicate
        stash.toPosOrderedList().filter(node => accepts(node.token))
      case ItemVarchar(lower, upper, excludes) => lookupVar(doc, lower, upper, 0, excludes)
      case ItemPhrase(fn, min, max)            => lookupPhraseAnywhere(doc, 0, fn, min, max)
      case ItemMultiChar                       => lookupMultiCharAnywhere(doc, 0)
      case ItemLexicon(dict)                   => lookupLexiconAnywhere(doc, 0, dict)
    }
  }

  /**
    * Builds the node of a full match by applying the rule's production to the route's tokens.
    *
    * @param options parse options, passed to the production
    * @param `match` match to turn into a node
    * @return the produced node, spanning from the first route node's range start to the
    *         last route node's range end; `None` when the route is empty or the
    *         production yields no token
    */
  def produce(options: Options)(`match`: Match): Option[Node] = `match` match {
    case (Rule(name, _, _, _), _, Nil) =>
      if (verboseProduce) logger.info(s"rule: $name, reverse route: []")
      None
    case (
        Rule(name, _, production, extraction),
        _,
        etuor @ (Node(Range(_, e), _, _, _, _, _) :: _)
        ) =>
      // The route is stored most-recent-first; restore matching order.
      val route = etuor.reverse
      val maybeToken = production.orElse(emptyProduction).apply((options, route.map(_.token)))

      if (verboseProduce) {
        logger.info(s"rule: $name, nodes: \n${route.map(n => s"  -- $n").mkString("\n")}")
        logger.info(s"prod: ${maybeToken.fold("nothing")(_.toString)}")
      }

      (route, maybeToken) match {
        case (Node(Range(p, _), _, _, _, _, _) :: _, Some(token)) =>
          Some(
            Node(
              range = Range(p, e),
              token = token,
              children = route,
              rule = Some(name),
              production = production,
              features = extraction
            )
          )
        case _ => None
      }
  }

  /**
    * Extends a route with `node` and pairs it with the rule that remains to be matched.
    *
    * @param route   nodes matched so far, most recent first
    * @param newRule rule with the just-matched pattern item removed
    * @param node    node that matched the pattern item
    * @return the advanced match, positioned at the end of `node`'s range
    */
  def mkMatch(route: List[Node], newRule: Rule)(node: Node): Match = {
    val newRoute = node :: route
    (newRule, node.range.end, newRoute)
  }

  /**
    * Recursively augments `matches`.
    * Discards partial matches that cannot advance and whose next pattern item is not a
    * predicate (e.g. stuck on a regex); matches waiting on a predicate are kept.
    *
    * @param doc     document being parsed
    * @param stash   nodes available to predicate items
    * @param matches matches to advance
    * @param limit   maximum number of nodes allowed in `stash`; above it a warning is
    *                logged and no match is returned
    * @return the augmented matches, each route reduced to its distinct nodes that are
    *         valid for `doc`; matches left with no valid node are dropped
    */
  def matchAll(doc: Document, stash: Stash, matches: List[Match], limit: Int): List[Match] = {
    def mkNextMatches(`match`: Match): List[Match] = {
      `match` match {
        case (Rule(_, Nil, _, _), _, _) => List(`match`)
        case (Rule(_, p :: _, _, _), _, _) =>
          val firstMatches = matchFirst(doc, stash)(`match`)
          val nextMatches = matchAll(doc, stash, firstMatches, limit)
          p match {
            case _: ItemPredicate => `match` :: nextMatches
            case _                => nextMatches
          }
      }
    }

    val nodeCount = stash.getSet.map(_._2.size).sum
    if (nodeCount > limit) {
      logger.warn(s"${doc.rawInput} parsed node size exceed $limit($nodeCount)")
      Nil
    } else {
      matches.flatMap(mkNextMatches).flatMap {
        case (rule, end, route) =>
          val validNodes = route.filter(_.isValid(doc))
          if (validNodes.isEmpty) None
          else Some((rule, end, validNodes.distinct))
      }
    }
  }

  // lookupItem :: Document -> PatternItem -> Stash -> Int -> Duckling [Node]
  /**
    * Looks up the nodes matching `patternItem` when resuming from `position`.
    *
    * @param doc         document being parsed
    * @param patternItem pattern item to look up
    * @param stash       nodes tested against predicate items
    * @param position    position to resume from
    * @return the matching nodes
    */
  def lookupItem(doc: Document,
                 patternItem: PatternItem,
                 stash: Stash,
                 position: Int): List[Node] = {
    patternItem match {
      case ItemRegex(re) =>
        lookupRegex(doc, re, position).filter(doc.isPositionValid(position))
      case ItemPredicate(p) =>
        val after = stash.toPosOrderedListFrom(position)
        val valid = after.takeWhile(doc.isPositionValid(position))
        // Build the total predicate once instead of once per node.
        val accepts = p orElse emptyPredicate
        val left = valid.filter(n => accepts(n.token))
        if (verboseLookup) {
          logger.info(s"lookup: after $position => \n${after.map(n => s" -- $n").mkString("\n")}")
          logger.info(s"lookup: position valid => \n${valid.map(n => s" -- $n").mkString("\n")}")
          logger.info(s"lookup: predicate valid => \n${left.map(n => s" -- $n").mkString("\n")}")
        }
        left
      case ItemVarchar(lower, upper, excludes) =>
        lookupVarLength(doc, lower, upper, position, excludes).filter(doc.isPositionValid(position))
      case ItemPhrase(fn, min, max) => lookupPhrase(doc, position, fn, min, max)
      case ItemMultiChar            => lookupMultiChar(doc, position)
      case ItemLexicon(dict)        => lookupLexicon(doc, position, dict)
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy