All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.yupana.schema.ItemDimension.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2019 Rusexpertiza LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.yupana.schema

import java.nio.charset.StandardCharsets
import java.util.UUID

import org.yupana.api.schema.HashDimension
import org.yupana.api.utils.{ ItemFixer, Transliterator }

import scala.collection.mutable.ListBuffer

object ItemDimension {

  type KeyType = (Int, Long)

  def apply(
      fixer: ItemFixer,
      transliterator: Transliterator,
      name: String
  ): HashDimension[String, KeyType] = {
    HashDimension(
      name,
      (s: String) => {
        val sl = s.toLowerCase
        (
          hash(fixer, transliterator, sl),
          UUID.nameUUIDFromBytes(sl.getBytes(StandardCharsets.UTF_8)).getMostSignificantBits
        )
      }
    )
  }

  val stopWords: Set[String] = Set("kg", "ml", "litrov", "litra", "litr", "gr", "sht")
  val numOfChars: Int = 4

  private val bs: List[String] = List("skdgnl", "pmfzrc", "tboi", "vaei", "jq", "uw", "x", "y")
  private val charIdx8: Map[Char, Int] = bs.zipWithIndex.flatMap { case (b, i) => b.map(c => (c, i)) }.toMap

  def hash(fixer: ItemFixer, transliterator: Transliterator, item: String): Int = {

    val tokens = split(fixer.fix(item)).map(transliterator.transliterate).toArray

    val filteredTokens = tokens.filterNot(stopWords.contains).filter(i => i.length > 1)

    (0 until numOfChars).foldLeft(0) { (h, pos) =>
      val code = if (pos == 0) {
        encode(chars(filteredTokens, pos), charIdx8, 8)
      } else {
        math.abs(new String(chars(filteredTokens, pos - 1).sorted).hashCode) % 255
      }
      (h << 8) | code
    }
  }

  def split(s: String): Seq[String] = {
    val res = ListBuffer.empty[String]
    var start = 0
    var end = 0
    var skip = false
    while (end < s.length) {
      val ch = s(end)
      if (ch.isLetter) {
        end += 1
      } else if (ch.isDigit) {
        end += 1
        skip = true
      } else {
        if (end != start && !skip) {
          res += s.substring(start, end)
        }
        end += 1
        start = end
        skip = false
      }
    }
    if (end != start && !skip) {
      res += s.substring(start, end)
    }
    res.toList
  }

  private def encode(chars: Array[Char], charIdx: Map[Char, Int], nBits: Int) = {
    val bits = chars.map(c => charIdx(c) % nBits)
    bits.foldLeft(0) { (a, i) =>
      (1 << i) | a
    }
  }

  private def chars(words: Array[String], pos: Int) = {
    words
      .map { w =>
        if (w.length > pos) w(pos) else ' '
      }
      .filter(charIdx8.contains)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy