All Downloads are FREE. Search and download functionalities are using the official Maven repository.

scaps.nucleus.querying.QueryExpression.scala Maven / Gradle / Ivy

The newest version!
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

package scaps.nucleus.querying

import scaps.nucleus.indexing.FingerprintTerm

/**
 * A tree-shaped query expression that can be scored against fingerprints.
 *
 * The tree is built from `Sum` and `Max` inner nodes, `Leaf` nodes that match
 * a single fingerprint term by its key, and `DeadLeaf`, which stands for a
 * leaf that has already been matched and must not match again.
 */
private[nucleus] sealed trait QueryExpression {
  import QueryExpression._

  /**
   * Scores a complete fingerprint against this expression.
   *
   * @param fingerprint the fingerprint terms to score
   * @return a triple of
   *         - the sum of the scores of all matched terms,
   *         - a penalty made up of the unmatched fingerprint terms (0.1 for an
   *           optional term, 1 otherwise) plus the `unevaluatedBranches` of the
   *           expression that remains after matching,
   *         - the matched term keys with their individual scores, in the order
   *           in which they were matched.
   */
  def score(fingerprint: List[FingerprintTerm]): (Float, Float, List[(String, Float)]) = {
    /*
     * This is just a heuristic that generally yields accurate results but
     * may not return the maximum score for a fingerprint (see ignored test cases).
     *
     * Scoring a fingerprint against a query is a harder problem than one would
     * intuitively think. An additional value in the fingerprint may require
     * reassignment of all previously matched values. Thus, probably the only
     * approach that yields an optimal result is to check all permutations of
     * the fingerprint.
     *
     * The following heuristic first orders the fingerprint by the maximum
     * achievable score of each individual term and uses this order to score
     * the fingerprint as a whole.
     */
    val termsWithIsOpt = fingerprint.map(t => (t.key, t.isOptional))

    // Pair every fingerprint term with the best leaf score that is still
    // available for its key. The scores of a key are consumed from highest to
    // lowest, so a repeated term is ranked with successively lower scores.
    val (rankedTerms, _) =
      termsWithIsOpt.foldLeft((List.empty[(String, Boolean, Float)], termScores)) {
        case ((acc, remaining), (key, isOpt)) =>
          val available = remaining.getOrElse(key, Nil)
          // A term that no (remaining) leaf can match is ranked with a score of 0.
          val best = available.headOption.getOrElse(0f)
          val rest = available.drop(1)
          val updated = if (rest.isEmpty) remaining - key else remaining + (key -> rest)
          ((key, isOpt, best) :: acc, updated)
      }

    val termsByMaxPotentialScore: List[(String, Boolean)] =
      rankedTerms
        .sortBy { case (_, _, maxScore) => -maxScore }
        .map { case (key, isOpt, _) => (key, isOpt) }

    // Match the terms one by one; every successful match replaces the
    // expression with a reduced one in which the matched leaf is consumed.
    // Matches are prepended and reversed once at the end to avoid repeated
    // O(n) appends to a List.
    val (totalScore, unmatchedTerms, reducedExpr, matchedReversed) =
      termsByMaxPotentialScore
        .foldLeft((0f, 0f, this: QueryExpression, List.empty[(String, Float)])) {
          case ((total, unmatched, expr, matched), (fpt, isOpt)) =>
            expr.score(fpt) match {
              case Some((termScore, newExpr)) =>
                (total + termScore, unmatched, newExpr, (fpt -> termScore) :: matched)
              case None =>
                (total, unmatched + (if (isOpt) 0.1f else 1f), expr, matched)
            }
        }

    val penalty = unmatchedTerms + reducedExpr.unevaluatedBranches

    (totalScore, penalty, matchedReversed.reverse)
  }

  /**
   * Weight of the parts of this expression that have not been matched.
   *
   * A `Leaf` counts as 1 and a `DeadLeaf` as 0. A `Sum` adds up the values of
   * its children, a `Max` takes their arithmetic mean.
   */
  def unevaluatedBranches: Float = this match {
    case Sum(children) => children.map(_.unevaluatedBranches).sum
    // An empty Max has nothing left to evaluate; dividing by its length
    // would yield NaN and poison the penalty.
    case Max(Nil)      => 0f
    case Max(children) => children.map(_.unevaluatedBranches).sum / children.length
    case _: Leaf       => 1
    case DeadLeaf      => 0
  }

  /**
   * All leaf scores of this expression grouped by leaf term, each group
   * sorted from the highest to the lowest score.
   */
  lazy val termScores: Map[String, List[Float]] = {
    def rec(scorer: QueryExpression): Map[String, List[Float]] = scorer match {
      case Leaf(tpe, s, _) => Map(tpe -> List(s))
      case DeadLeaf        => Map()
      case InnerNode(children) =>
        children.flatMap(rec).foldLeft(Map[String, List[Float]]()) {
          case (acc, (tpe, scores)) =>
            acc + (tpe -> (scores ++ acc.getOrElse(tpe, Nil)))
        }
    }

    // A strict `map` sorts every list exactly once; `mapValues` would return
    // a lazy view that re-sorts the list on each lookup.
    rec(this).map { case (tpe, scores) => tpe -> scores.sortBy(-_) }
  }

  /**
   * Calculates the score for an individual fingerprint type.
   *
   * If this node or one of its subnodes matches `fpt`, it returns some score
   * together with a new scorer that won't match that particular leaf again.
   *
   * @param fpt the key of the fingerprint term to match
   * @return the score and the reduced expression, or `None` if nothing matches
   */
  def score(fpt: String): Option[(Float, QueryExpression)] = this match {
    case Sum(children) =>
      (for {
        (child, idx) <- children.zipWithIndex
        (s, replacement) <- child.score(fpt)
      } yield (idx, s, replacement)) match {
        case Nil => None
        case matches =>
          // Only the best-scoring child consumes the term.
          val (idx, score, newChild) = matches.maxBy(_._2)

          // Drop consumed children and collapse a Sum that has become trivial.
          children.updated(idx, newChild).filter(_ != DeadLeaf) match {
            case Nil      => Some((score, DeadLeaf))
            case c :: Nil => Some((score, c))
            case cs       => Some((score, Sum(cs)))
          }
      }
    case Max(children) =>
      children.flatMap(_.score(fpt)) match {
        case Nil => None
        case matches =>
          // The whole Max is replaced by the reduced form of its best child.
          Some(matches.maxBy(_._1))
      }
    case Leaf(tpe, boost, _) =>
      if (tpe == fpt)
        Some((boost, DeadLeaf))
      else
        None
    case DeadLeaf =>
      None
  }

  /**
   * All `Leaf` nodes below (or at) this node; a `DeadLeaf` contributes none.
   */
  def leaves: List[QueryExpression.Leaf] = this match {
    case InnerNode(cs) => cs.flatMap(_.leaves)
    case l: Leaf       => l :: Nil
    // Without this case a consumed leaf would raise a MatchError.
    case DeadLeaf      => Nil
  }

  /**
   * Selects leaf terms, visited from the highest to the lowest leaf score,
   * while their accumulated frequency stays below `frequencyCutoff`.
   *
   * A term whose frequency would reach or exceed the cutoff is skipped, but
   * later terms are still considered. Because accepted terms are prepended,
   * the result lists them in reverse visiting order.
   *
   * @param frequencyCutoff exclusive upper bound for the accumulated frequency
   * @return the accepted terms
   */
  def termsBelowCutoff(frequencyCutoff: Double): List[String] = {
    val rankedTermsWithFreq = leaves
      .sortBy(-_.score)
      .map(t => (t.term, t.frequency))
      .distinct

    val (terms, _) = rankedTermsWithFreq.foldLeft((List[String](), 0d)) {
      case (acc @ (accTerms, accFreq), (value, freq)) =>
        if (accFreq + freq < frequencyCutoff)
          (value :: accTerms, accFreq + freq)
        else
          acc
    }

    terms
  }

  /** Renders the expression tree in a compact, human-readable form. */
  override def toString: String = this match {
    case Sum(cs)                    => cs.mkString("sum(", ", ", ")")
    case Max(cs)                    => cs.mkString("max(", ", ", ")")
    case Leaf(fp, boost, frequency) => s"$fp^($boost, $frequency)"
    case DeadLeaf                   => "∅"
  }

}

/**
 * Node types of the [[QueryExpression]] tree and an extractor for its inner nodes.
 */
private[nucleus] object QueryExpression {

  /** Inner node whose children are combined additively. */
  case class Sum(children: List[QueryExpression]) extends QueryExpression

  /** Inner node of which only the best-scoring child contributes to a match. */
  case class Max(children: List[QueryExpression]) extends QueryExpression

  /** Terminal node holding a term together with its score and its frequency. */
  case class Leaf(term: String, score: Float, frequency: Float) extends QueryExpression

  /** Marker for a leaf that has been consumed by a match; it never matches. */
  case object DeadLeaf extends QueryExpression

  /**
   * Extractor that matches both kinds of inner nodes (`Sum` and `Max`) and
   * exposes their children.
   */
  object InnerNode {
    /**
     * @param expr the expression to inspect
     * @return the children if `expr` is a `Sum` or a `Max`, `None` otherwise
     */
    def unapply(expr: QueryExpression): Option[List[QueryExpression]] =
      PartialFunction.condOpt(expr) {
        case Sum(nodes) => nodes
        case Max(nodes) => nodes
      }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy