/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.collections.SearchTrie
import com.johnsnowlabs.nlp.AnnotatorType._
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.serialization.StructFeature
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.{Annotation, AnnotatorApproach, ParamsAndFeaturesWritable}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{BooleanParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
/** Annotator to match exact phrases (by token) provided in a file against a Document.
*
* A text file of predefined phrases must be provided with `setEntities`. The text file can also
* be set directly as an [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]].
*
* For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb Spark NLP Workshop]]
* and the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/TextMatcherTestSpec.scala TextMatcherTestSpec]].
*
* ==Example==
* In this example, the entities file is of the form
* {{{
* ...
* dolore magna aliqua
* lorem ipsum dolor. sit
* laborum
* ...
* }}}
* where each line represents an entity phrase to be extracted.
* {{{
* import spark.implicits._
* import com.johnsnowlabs.nlp.DocumentAssembler
* import com.johnsnowlabs.nlp.annotator.Tokenizer
* import com.johnsnowlabs.nlp.annotator.TextMatcher
* import com.johnsnowlabs.nlp.util.io.ReadAs
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val tokenizer = new Tokenizer()
* .setInputCols("document")
* .setOutputCol("token")
*
* val data = Seq("Hello dolore magna aliqua. Lorem ipsum dolor. sit in laborum").toDF("text")
* val entityExtractor = new TextMatcher()
* .setInputCols("document", "token")
* .setEntities("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT)
* .setOutputCol("entity")
* .setCaseSensitive(false)
* .setTokenizer(tokenizer.fit(data))
*
* val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, entityExtractor))
* val results = pipeline.fit(data).transform(data)
*
* results.selectExpr("explode(entity) as result").show(false)
* +------------------------------------------------------------------------------------------+
* |result |
* +------------------------------------------------------------------------------------------+
* |[chunk, 6, 24, dolore magna aliqua, [entity -> entity, sentence -> 0, chunk -> 0], []] |
* |[chunk, 27, 48, Lorem ipsum dolor. sit, [entity -> entity, sentence -> 0, chunk -> 1], []]|
* |[chunk, 53, 59, laborum, [entity -> entity, sentence -> 0, chunk -> 2], []] |
* +------------------------------------------------------------------------------------------+
* }}}
*
* @see
* [[com.johnsnowlabs.nlp.annotators.btm.BigTextMatcher BigTextMatcher]] to match large amounts
* of text
* @param uid
* internal uid required to generate writable annotators
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio anno 1
* @groupprio param 2
* @groupprio setParam 3
* @groupprio getParam 4
* @groupprio Ungrouped 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class TextMatcher(override val uid: String)
extends AnnotatorApproach[TextMatcherModel]
with ParamsAndFeaturesWritable {
def this() = this(Identifiable.randomUID("ENTITY_EXTRACTOR"))
/** Input annotator types: DOCUMENT, TOKEN
*
* @group anno
*/
override val inputAnnotatorTypes = Array(DOCUMENT, TOKEN)
/** Output annotator type: CHUNK
*
* @group anno
*/
override val outputAnnotatorType: AnnotatorType = CHUNK
/** Extracts entities from target dataset given in a text file */
override val description: String = "Extracts entities from target dataset given in a text file"
/** External resource for the entities, e.g. a text file where each line is the string of an
* entity
*
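* For example (the sample phrases from the class example above):
* {{{
* dolore magna aliqua
* lorem ipsum dolor. sit
* laborum
* }}}
*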
* @group param
*/
val entities = new ExternalResourceParam(this, "entities", "External resource for the entities")
/** Whether the match is case sensitive (Default: `true`)
*
* @group param
*/
val caseSensitive =
new BooleanParam(this, "caseSensitive", "Whether the match is case sensitive. Defaults to true")
/** Whether to merge overlapping matched chunks (Default: `false`)
*
* @group param
*/
val mergeOverlapping = new BooleanParam(
this,
"mergeOverlapping",
"Whether to merge overlapping matched chunks. Defaults false")
/** Value for the entity metadata field (Default: `"entity"`)
*
* @group param
*/
val entityValue = new Param[String](this, "entityValue", "Value for the entity metadata field")
/** Whether the TextMatcher should build the CHUNK from matched TOKEN annotations (Default: `false`)
*
* @group param
*/
val buildFromTokens = new BooleanParam(
this,
"buildFromTokens",
"Whether the TextMatcher should take the CHUNK from TOKEN or not")
/** The [[Tokenizer]] to perform tokenization with
*
* @group param
*/
val tokenizer = new StructFeature[TokenizerModel](this, "tokenizer")
setDefault(inputCols, Array(TOKEN))
setDefault(caseSensitive, true)
setDefault(mergeOverlapping, false)
setDefault(entityValue, "entity")
setDefault(buildFromTokens, false)
/** Provides a file with phrases to match (Default: Looks up path in configuration)
*
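* A minimal sketch (the path is the one used in the class example):
* {{{
* import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs}
*
* new TextMatcher()
*   .setEntities(ExternalResource(
*     "src/test/resources/entity-extractor/test-phrases.txt",
*     ReadAs.TEXT,
*     Map("format" -> "text")))
* }}}
*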
* @group setParam
*/
def setEntities(value: ExternalResource): this.type =
set(entities, value)
/** Provides a file with phrases to match. Default: Looks up path in configuration.
*
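* A minimal sketch (the path is the one used in the class example):
* {{{
* new TextMatcher()
*   .setEntities("src/test/resources/entity-extractor/test-phrases.txt", ReadAs.TEXT)
* }}}
*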
* @param path
* a path to a file that contains the entities in the specified format.
* @param readAs
* the format of the file, can be one of {ReadAs.TEXT, ReadAs.SPARK}. Defaults to
* ReadAs.TEXT.
* @param options
* a map of additional parameters. Defaults to `Map("format" -> "text")`.
* @return
* this
* @group setParam
*/
def setEntities(
path: String,
readAs: ReadAs.Format,
options: Map[String, String] = Map("format" -> "text")): this.type =
set(entities, ExternalResource(path, readAs, options))
/** The [[Tokenizer]] to perform tokenization with
*
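* A minimal sketch, assuming `data` is a DataFrame the tokenizer can be fit on (as in the
* class example above):
* {{{
* val tokenizerModel = new Tokenizer()
*   .setInputCols("document")
*   .setOutputCol("token")
*   .fit(data)
*
* new TextMatcher().setTokenizer(tokenizerModel)
* }}}
*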
* @group setParam
*/
def setTokenizer(tokenizer: TokenizerModel): this.type = set(this.tokenizer, tokenizer)
/** The [[Tokenizer]] to perform tokenization with
*
* @group getParam
*/
def getTokenizer: TokenizerModel = $$(tokenizer)
/** Whether the match is case sensitive (Default: `true`)
*
* @group setParam
*/
def setCaseSensitive(v: Boolean): this.type = set(caseSensitive, v)
/** Whether the match is case sensitive (Default: `true`)
*
* @group getParam
*/
def getCaseSensitive: Boolean = $(caseSensitive)
/** Whether to merge overlapping matched chunks (Default: `false`)
*
* @group setParam
*/
def setMergeOverlapping(v: Boolean): this.type = set(mergeOverlapping, v)
/** Whether to merge overlapping matched chunks (Default: `false`)
*
* @group getParam
*/
def getMergeOverlapping: Boolean = $(mergeOverlapping)
/** Sets the value for the entity metadata field (Default: `"entity"`)
*
* @group setParam
*/
def setEntityValue(v: String): this.type = set(entityValue, v)
/** Gets the value for the entity metadata field
*
* @group getParam
*/
def getEntityValue: String = $(entityValue)
/** Setter for buildFromTokens param
*
* @group setParam
*/
def setBuildFromTokens(v: Boolean): this.type = set(buildFromTokens, v)
/** Getter for buildFromTokens param
*
* @group getParam
*/
def getBuildFromTokens: Boolean = $(buildFromTokens)
/** Loads the entity phrases from the configured external resource. Each phrase is tokenized
* with the provided [[TokenizerModel]] if one was set, otherwise split on single spaces.
*/
private def loadEntities(dataset: Dataset[_]): Array[Array[String]] = {
val phrases: Array[String] = ResourceHelper.parseLines($(entities))
val parsedEntities: Array[Array[String]] = {
get(tokenizer) match {
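// A TokenizerModel was set: tokenize each phrase with it, so the trie entries
// line up with the tokens produced at transform time.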
case Some(tokenizerModel: TokenizerModel) =>
phrases.map { line =>
val annotation = Seq(Annotation(line))
val tokens = tokenizerModel.annotate(annotation)
tokens.map(_.result).toArray
}
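// No tokenizer set: fall back to splitting each phrase on single spaces.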
case _ =>
phrases.map { line =>
line.split(" ")
}
}
}
parsedEntities
}
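/** Builds a [[TextMatcherModel]]: the parsed entity phrases are compiled into a
* [[com.johnsnowlabs.collections.SearchTrie SearchTrie]] and the relevant parameters are
* copied to the model.
*/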
override def train(
dataset: Dataset[_],
recursivePipeline: Option[PipelineModel]): TextMatcherModel = {
new TextMatcherModel()
.setSearchTrie(SearchTrie.apply(loadEntities(dataset), $(caseSensitive)))
.setMergeOverlapping($(mergeOverlapping))
.setBuildFromTokens($(buildFromTokens))
}
}
/** This is the companion object of [[TextMatcher]]. Please refer to that class for the
* documentation.
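*
* Since it extends `DefaultParamsReadable`, a saved, unfitted `TextMatcher` can be read back
* with `TextMatcher.load`, e.g. (the path is illustrative):
* {{{
* val textMatcher = TextMatcher.load("/tmp/text_matcher")
* }}}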
*/
object TextMatcher extends DefaultParamsReadable[TextMatcher]