// org.yupana.externallinks.items.ItemsInvertedIndexImpl.scala (Maven / Gradle / Ivy)
/*
* Copyright 2019 Rusexpertiza LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.yupana.externallinks.items
import com.typesafe.scalalogging.StrictLogging
import org.yupana.api.query._
import org.yupana.api.schema.Schema
import org.yupana.api.utils.SortedSetIterator
import org.yupana.core.ExternalLinkService
import org.yupana.core.dao.InvertedIndexDao
import org.yupana.core.model.InternalRow
import org.yupana.core.utils.FlatAndCondition
import org.yupana.externallinks.ExternalLinkUtils
import org.yupana.schema.externallinks.ItemsInvertedIndex
import org.yupana.schema.{ Dimensions, ItemDimension }
/**
  * Companion helpers for the items inverted index: a table name constant and the routine
  * that turns item names into a word-to-item-ids mapping.
  */
object ItemsInvertedIndexImpl {

  // Not referenced elsewhere in this file; presumably the storage table backing the
  // index -- TODO confirm against the DAO wiring.
  val TABLE_NAME: String = "ts_items_reverse_index"

  /**
    * Builds a mapping from word to the ids of the items whose names produced that word.
    *
    * Every item name is first passed through `schema.itemFixer.fix` and the result is split
    * into words by `schema.tokenizer.transliteratedTokens`. No de-duplication happens here:
    * an id is listed once for every (word, item) pair that was emitted.
    *
    * @param schema schema supplying the item fixer and the tokenizer
    * @param items  pairs of (item id, item name)
    * @return word -> ids of the items containing that word
    */
  def indexItems(schema: Schema)(items: Seq[(ItemDimension.KeyType, String)]): Map[String, Seq[ItemDimension.KeyType]] = {
    // Flatten the input into (word, item id) pairs, one per word of every item name.
    val wordToId = for {
      (itemId, itemName) <- items
      word <- schema.tokenizer.transliteratedTokens(schema.itemFixer.fix(itemName))
    } yield word -> itemId

    // Group the pairs by word and keep only the ids within each group.
    val pairsByWord = wordToId.groupBy(_._1)
    for ((word, pairs) <- pairsByWord) yield word -> pairs.map { case (_, itemId) => itemId }
  }
}
/**
  * External link service for [[ItemsInvertedIndex]] that turns phrase conditions into
  * conditions on item dimension ids, using an inverted index DAO for the lookups.
  *
  * The link is read only: `setLinkedValues` does nothing. The index itself is filled through
  * `put` / `putItemNames`.
  *
  * @param schema           schema supplying the tokenizer, item fixer and transliterator used here
  * @param invertedIndexDao DAO used to store item ids per word and to look them up by word or prefix
  * @param putEnabled       when false, `put` does nothing (`putItemNames` is not guarded by it)
  * @param externalLink     link definition; its members (such as `PHRASE_FIELD`) are imported below
  */
class ItemsInvertedIndexImpl(
    override val schema: Schema,
    invertedIndexDao: InvertedIndexDao[String, ItemDimension.KeyType],
    override val putEnabled: Boolean,
    override val externalLink: ItemsInvertedIndex
) extends ExternalLinkService[ItemsInvertedIndex]
    with StrictLogging {

  import ItemsInvertedIndexImpl._
  import externalLink._

  /**
    * Indexes the item names found in the given data points.
    *
    * Names are de-duplicated, and names that are empty after trimming are dropped.
    * Nothing happens when `putEnabled` is false.
    */
  override def put(dataPoints: Seq[DataPoint]): Unit = {
    if (putEnabled) {
      val distinctNames: Set[String] = dataPoints.flatMap(_.dimensionValue(Dimensions.ITEM)).toSet
      putItemNames(distinctNames.filter(name => name.trim.nonEmpty))
    }
  }

  /**
    * Indexes the given item names: each name is paired with its id computed by
    * `Dimensions.ITEM.hashFunction`, split into words by `indexItems`, and the resulting
    * word -> ids mapping is written with a single `batchPut`.
    */
  def putItemNames(names: Set[String]): Unit = {
    val idsWithNames = names.map(name => (Dimensions.ITEM.hashFunction(name), name)).toSeq
    val idsByWord = indexItems(schema)(idsWithNames)
    val batch: Map[String, Set[ItemDimension.KeyType]] =
      idsByWord.map { entry => entry._1 -> entry._2.toSet }
    invertedIndexDao.batchPut(batch)
  }

  /** Item ids stored in the index under exactly the given word. */
  def dimIdsForStemmedWord(word: String): SortedSetIterator[ItemDimension.KeyType] =
    invertedIndexDao.values(word)

  /** Item ids returned by the DAO's prefix lookup for the given prefix. */
  def dimIdsForPrefix(prefix: String): SortedSetIterator[ItemDimension.KeyType] =
    invertedIndexDao.valuesByPrefix(prefix)

  /**
    * Replaces the given conditions with a single "dimension id in" expression built from
    * the intersection of the id sets of all conditions.
    */
  private def includeTransform(values: Seq[(SimpleCondition, String, Set[String])]): Seq[ConditionTransformation] = {
    val idsPerCondition = getPhraseIds(values)
    val commonIds = SortedSetIterator.intersectAll(idsPerCondition)
    ConditionTransformation.replace(
      values.map { case (condition, _, _) => condition },
      DimIdInExpr(externalLink.dimension, commonIds)
    )
  }

  /**
    * Replaces the given conditions with a single "dimension id not in" expression built from
    * the union of the id sets of all conditions.
    */
  private def excludeTransform(values: Seq[(SimpleCondition, String, Set[String])]): Seq[ConditionTransformation] = {
    val idsPerCondition = getPhraseIds(values)
    val allIds = SortedSetIterator.unionAll(idsPerCondition)
    ConditionTransformation.replace(
      values.map { case (condition, _, _) => condition },
      DimIdNotInExpr(externalLink.dimension, allIds)
    )
  }

  // Read only external link: there are no linked values to fill in.
  override def setLinkedValues(
      exprIndex: collection.Map[Expression[_], Int],
      rows: Seq[InternalRow],
      exprs: Set[LinkExpr[_]]
  ): Unit = ()

  /**
    * Delegates to `ExternalLinkUtils.transformConditionT` for this link's name, passing
    * `includeTransform` and `excludeTransform` as the two callbacks.
    * NOTE(review): presumably positive conditions go to the first callback and negated ones
    * to the second -- confirm in ExternalLinkUtils.
    */
  override def transformCondition(condition: FlatAndCondition): Seq[ConditionTransformation] =
    ExternalLinkUtils.transformConditionT[String](
      externalLink.linkName,
      condition,
      includeTransform,
      excludeTransform
    )

  /**
    * Resolves every condition to the union of the ids matching any of its phrases.
    *
    * @throws IllegalArgumentException if a condition refers to a field other than `PHRASE_FIELD`
    */
  private def getPhraseIds(
      fieldsValues: Seq[(SimpleCondition, String, Set[String])]
  ): Seq[SortedSetIterator[ItemDimension.KeyType]] = {
    fieldsValues.map {
      case (_, field, phrases) =>
        if (field == PHRASE_FIELD) {
          val idsPerPhrase = phrases.toSeq.map(dimIdsForPhrase)
          SortedSetIterator.unionAll(idsPerPhrase)
        } else {
          throw new IllegalArgumentException(s"Unknown field $field")
        }
    }
  }

  /**
    * Resolves a phrase to the ids matching all of its parts.
    *
    * The phrase is split on single spaces. Parts ending with '%' are treated as prefixes:
    * the trailing '%' is removed, the rest is trimmed, lower-cased and transliterated, and
    * parts that are empty after trimming are skipped. All other parts are tokenized with
    * `schema.tokenizer.transliteratedTokens` and looked up as whole words. The result is
    * the intersection of all the lookups.
    */
  private def dimIdsForPhrase(phrase: String): SortedSetIterator[ItemDimension.KeyType] = {
    val (prefixParts, wordParts) = phrase.split(' ').partition(_.endsWith("%"))

    // Whole-word lookups are issued first, then the prefix lookups.
    val wordIds = wordParts
      .flatMap(schema.tokenizer.transliteratedTokens)
      .map(dimIdsForStemmedWord)

    val cleanedPrefixes = prefixParts
      .map(_.stripSuffix("%").trim.toLowerCase)
      .collect { case prefix if prefix.nonEmpty => schema.transliterator.transliterate(prefix) }
    val prefixIds = cleanedPrefixes.map(dimIdsForPrefix)

    SortedSetIterator.intersectAll(wordIds.toSeq ++ prefixIds)
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy