All downloads are free. The search and download features use the official Maven repository.

com.johnsnowlabs.nlp.annotators.er.EntityRulerUtil.scala Maven / Gradle / Ivy

/*
 * Copyright 2017-2022 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.johnsnowlabs.nlp.annotators.er

import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}

import scala.collection.mutable.ListBuffer
import scala.util.{Failure, Success, Try}

/** Stateless helpers for the entity ruler annotators: interval merging, lenient parsing of
  * delimited lines and boolean flags, and alphabet loading.
  */
object EntityRulerUtil {

  /** Merges overlapping or touching intervals.
    *
    * Each interval is a list whose first element is the begin and whose second element is the
    * end. Intervals are sorted by begin, and an interval is merged into the previous one when
    * the previous end is greater than or equal to its begin (so `[1, 3]` and `[3, 5]` merge).
    *
    * @param intervals
    *   intervals to merge, in any order; every interval must be non-empty, and any interval
    *   that takes part in a comparison must have at least two elements
    * @return
    *   the merged intervals sorted by begin; an empty list when `intervals` is empty
    */
  def mergeIntervals(intervals: List[List[Int]]): List[List[Int]] = {
    intervals.sortBy(_.head) match {
      // Nothing to merge: return an empty result instead of a list holding one empty interval.
      case Nil => Nil
      case first :: rest =>
        val merged = ListBuffer[List[Int]]()
        // The fold carries the interval that is still open for merging; closed intervals are
        // flushed to the buffer as soon as a gap is found.
        val lastOpen = rest.foldLeft(first) { (current, interval) =>
          val currentEnd = current(1)
          if (currentEnd >= interval.head) {
            List(current.head, math.max(currentEnd, interval(1)))
          } else {
            merged.append(current)
            interval
          }
        }
        merged.append(lastOpen)
        merged.toList
    }
  }

  /** Parses a string as a boolean, reporting bad input as an invalid "regex" column.
    *
    * NOTE(review): the error message suggests this is used for the regex flag column of a
    * patterns file; confirm against callers.
    *
    * @param string
    *   text to parse with Scala's `String.toBoolean`
    * @return
    *   the parsed value
    * @throws IllegalArgumentException
    *   when the text cannot be parsed; the original parse failure is attached as the cause
    */
  def toBoolean(string: String): Boolean = {
    castStringToBoolean(string) match {
      case Success(value) => value
      case Failure(cause) =>
        throw new IllegalArgumentException(
          "Column regex has a wrong format. It should be false or true",
          cause)
    }
  }

  /** Wraps `String.toBoolean` so that a parse failure is returned as a `Failure`. */
  private def castStringToBoolean(string: String): Try[Boolean] = Try(string.toBoolean)

  /** Splits a line on a delimiter that may or may not already be regex-escaped.
    *
    * The delimiter is first used as a regular expression. When that yields more than three
    * parts, the split is redone with a backslash prepended to the delimiter; this recovers
    * from unescaped metacharacters such as `|`, which as a regex splits between every
    * character. When the delimiter is not a valid regex at all (for example `*` or `+`), it is
    * treated as a literal instead of failing.
    *
    * NOTE(review): the "more than three parts" heuristic is kept for compatibility. It cannot
    * detect a misinterpreted delimiter when the regex split yields three parts or fewer (for
    * example `.` yields no parts at all); callers should prefer passing an escaped delimiter.
    *
    * @param string
    *   line to split
    * @param delimiter
    *   separator, either a regex or a single unescaped metacharacter
    * @return
    *   the parts of the line
    */
  def splitString(string: String, delimiter: String): Array[String] = {
    Try(string.split(delimiter)) match {
      case Success(parts) if parts.length > 3 => string.split(s"\\$delimiter")
      case Success(parts) => parts
      // Invalid regex such as "*": the caller evidently meant the literal character(s).
      case Failure(_: java.util.regex.PatternSyntaxException) =>
        string.split(java.util.regex.Pattern.quote(delimiter))
      case Failure(other) => throw other
    }
  }

  // Character sets appended to every built-in alphabet.
  private val symbols = """:$&(){}[]?/\\!><@=#-;,%_“.|'`"*#^+~€"""
  private val numbers = "0123456789"
  // Lowercase letters per language; uppercase variants are derived in builtInAlphabet.
  private val englishAlphabet = "abcdefghijklmnopqrstuvwxyz"
  private val spanishAlphabet = "abcdefghijklmnñopqrstuvwxyz" + "áéíóú"
  private val frenchAlphabet = "abcdefghijklmnopqrstuvwxyz" + "éàèùâêîôûëïüç"
  private val germanAlphabet = "abcdefghijklmnopqrstuvwxyz" + "äöüß"

  /** Returns the alphabet identified by a built-in language name or stored in a text file.
    *
    * An argument containing `/` or `\` is treated as a file path: the file is read through
    * `ResourceHelper.parseLines` and its lines are concatenated without separators. Any other
    * argument is matched case-insensitively against the built-in names `english`, `spanish`,
    * `french` and `german`.
    *
    * @param path
    *   a built-in language name or a path to a plain text file
    * @return
    *   all characters of the alphabet as a single string
    * @throws IllegalArgumentException
    *   when `path` is neither a file path nor a built-in language name
    */
  def loadAlphabet(path: String): String = {
    if (path.contains("/") || path.contains("\\")) {
      val externalResource = ExternalResource(path, ReadAs.TEXT, Map())
      ResourceHelper.parseLines(externalResource).mkString("")
    } else {
      // Locale.ROOT keeps the lookup independent of the JVM default locale (e.g. Turkish
      // dotless i would otherwise break matching "ENGLISH").
      path.toLowerCase(java.util.Locale.ROOT) match {
        case "english" => builtInAlphabet(englishAlphabet)
        case "spanish" => builtInAlphabet(spanishAlphabet)
        case "french" => builtInAlphabet(frenchAlphabet)
        case "german" => builtInAlphabet(germanAlphabet)
        case _ =>
          throw new IllegalArgumentException(
            s"Alphabet $path not available." +
              s" Please load it using a path to a plain text file")
      }
    }
  }

  /** Builds a full alphabet: lowercase letters, their uppercase forms, symbols and digits. */
  private def builtInAlphabet(lowercaseLetters: String): String =
    lowercaseLetters + lowercaseLetters.toUpperCase(java.util.Locale.ROOT) + symbols + numbers

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy