// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it how often you want.
/*
* Copyright 2017-2023 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.functions.ExplodeAnnotations
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate}
import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.DataFrame
import scala.util.matching.Regex
/** Annotator which splits large documents into chunks of roughly given size.
*
* DocumentCharacterTextSplitter takes a list of separators. It takes the separators in order and
* splits subtexts if they are over the chunk length, considering optional overlap of the chunks.
*
* For example, given chunk size 20 and overlap 5:
* {{{
* He was, I take it, the most perfect reasoning and observing machine that the world has seen.
*
* ["He was, I take it,", "it, the most", "most perfect", "reasoning and", "and observing", "machine that the", "the world has seen."]
* }}}
*
* Additionally, you can set
*
* - custom patterns with [[setSplitPatterns]]
* - whether patterns should be interpreted as regex with [[setPatternsAreRegex]]
* - whether to keep the separators with [[setKeepSeparators]]
* - whether to trim whitespaces with [[setTrimWhitespace]]
* - whether to explode the splits to individual rows with [[setExplodeSplits]]
*
* For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala DocumentCharacterTextSplitterTest]].
*
* ==Example==
* {{{
* import com.johnsnowlabs.nlp.annotator._
* import com.johnsnowlabs.nlp.DocumentAssembler
* import org.apache.spark.ml.Pipeline
*
* val textDF =
* spark.read
* .option("wholetext", "true")
* .text("src/test/resources/spell/sherlockholmes.txt")
* .toDF("text")
*
* val documentAssembler = new DocumentAssembler().setInputCol("text")
* val textSplitter = new DocumentCharacterTextSplitter()
* .setInputCols("document")
* .setOutputCol("splits")
* .setChunkSize(20000)
* .setChunkOverlap(200)
* .setExplodeSplits(true)
*
* val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter))
* val result = pipeline.fit(textDF).transform(textDF)
*
* result
* .selectExpr(
* "splits.result",
* "splits[0].begin",
* "splits[0].end",
* "splits[0].end - splits[0].begin as length")
* .show(8, truncate = 80)
* +--------------------------------------------------------------------------------+---------------+-------------+------+
* | result|splits[0].begin|splits[0].end|length|
* +--------------------------------------------------------------------------------+---------------+-------------+------+
* |[ Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyl...| 0| 19994| 19994|
* |["And Mademoiselle's address?" he asked.\n\n"Is Briony Lodge, Serpentine Aven...| 19798| 39395| 19597|
* |["How did that help you?"\n\n"It was all-important. When a woman thinks that ...| 39371| 59242| 19871|
* |["'But,' said I, 'there would be millions of red-headed men who\nwould apply....| 59166| 77833| 18667|
* |[My friend was an enthusiastic musician, being himself not only a\nvery capab...| 77835| 97769| 19934|
* |["And yet I am not convinced of it," I answered. "The cases which\ncome to li...| 97771| 117248| 19477|
* |["Well, she had a slate-coloured, broad-brimmed straw hat, with a\nfeather of...| 117250| 137242| 19992|
* |["That sounds a little paradoxical."\n\n"But it is profoundly true. Singulari...| 137244| 157171| 19927|
* +--------------------------------------------------------------------------------+---------------+-------------+------+
* }}}
*
* @param uid
* required uid for storing annotator to disk
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class DocumentCharacterTextSplitter(override val uid: String)
    extends AnnotatorModel[DocumentCharacterTextSplitter]
    with HasSimpleAnnotate[DocumentCharacterTextSplitter] {

  /** Creates the annotator with a randomly generated uid. */
  def this() = this(Identifiable.randomUID("DocumentCharacterTextSplitter"))

  /** Input annotator type: DOCUMENT
    *
    * @group anno
    */
  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.DOCUMENT)

  /** Output annotator type: DOCUMENT
    *
    * @group anno
    */
  override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT

  /** Size of each chunk of text. This parameter has no default and must be set before the
    * annotator is used.
    *
    * @group param
    */
  val chunkSize: IntParam =
    new IntParam(this, "chunkSize", "Size of each chunk of text")

  /** Sets the size of each chunk of text; must be larger than 0.
    *
    * @group setParam
    */
  def setChunkSize(value: Int): this.type = {
    require(value > 0, "Chunk size should be larger than 0.")
    set(chunkSize, value)
  }

  /** @group getParam */
  def getChunkSize: Int = $(chunkSize)

  /** Length of the overlap between text chunks (Default: `0`)
    *
    * @group param
    */
  val chunkOverlap: IntParam =
    new IntParam(this, "chunkOverlap", "Length of the overlap between text chunks")

  /** Sets the length of the overlap between text chunks.
    *
    * The value must not be negative. It is compared against the chunk size only when a chunk
    * size is already defined, so the two setters can be called in either order.
    *
    * @group setParam
    */
  def setChunkOverlap(value: Int): this.type = {
    require(value >= 0, "Chunk overlap should not be negative.")
    // chunkSize has no default: reading it while unset would throw a param-lookup exception
    // instead of a meaningful validation error.
    require(
      !isDefined(chunkSize) || value <= getChunkSize,
      "Chunk overlap can't be larger than chunk size.")
    set(chunkOverlap, value)
  }

  /** @group getParam */
  def getChunkOverlap: Int = $(chunkOverlap)

  /** Patterns to separate the text by in decreasing priority (Default: `Array("\n\n", "\n", " ",
    * "")`)
    *
    * Can be interpreted as regular expressions, if `patternsAreRegex` is set to true.
    *
    * @group param
    */
  val splitPatterns: StringArrayParam =
    new StringArrayParam(
      this,
      "splitPatterns",
      "Patterns to separate the text by in decreasing priority")

  /** Sets the split patterns; the array must not be empty.
    *
    * @group setParam
    */
  def setSplitPatterns(value: Array[String]): this.type = {
    require(value.nonEmpty, "Patterns are empty")
    set(splitPatterns, value)
  }

  /** @group getParam */
  def getSplitPatterns: Array[String] = $(splitPatterns)

  /** Whether to interpret the split patterns as regular expressions (Default: `false`)
    *
    * @group param
    */
  val patternsAreRegex: BooleanParam =
    new BooleanParam(
      this,
      "patternsAreRegex",
      "Whether to interpret the split patterns as regular expressions")

  /** @group setParam */
  def setPatternsAreRegex(value: Boolean): this.type = set(patternsAreRegex, value)

  /** @group getParam */
  def getPatternsAreRegex: Boolean = $(patternsAreRegex)

  /** Whether to keep the separators in the final result (Default: `true`)
    *
    * @group param
    */
  val keepSeparators: BooleanParam =
    new BooleanParam(this, "keepSeparators", "Whether to keep the separators in the final result")

  /** @group setParam */
  def setKeepSeparators(value: Boolean): this.type = set(keepSeparators, value)

  /** @group getParam */
  def getKeepSeparators: Boolean = $(keepSeparators)

  /** Whether to explode split chunks to separate rows (Default: `false`)
    *
    * @group param
    */
  val explodeSplits: BooleanParam =
    new BooleanParam(this, "explodeSplits", "Whether to explode split chunks to separate rows")

  /** @group setParam */
  def setExplodeSplits(value: Boolean): this.type = set(explodeSplits, value)

  /** @group getParam */
  def getExplodeSplits: Boolean = $(explodeSplits)

  /** Whether to trim whitespaces of extracted chunks (Default: `true`)
    *
    * @group param
    */
  val trimWhitespace: BooleanParam =
    new BooleanParam(this, "trimWhitespace", "Whether to trim whitespaces of extracted chunks")

  /** @group setParam */
  def setTrimWhitespace(value: Boolean): this.type = set(trimWhitespace, value)

  /** @group getParam */
  def getTrimWhitespace: Boolean = $(trimWhitespace)

  // chunkSize intentionally has no default value.
  setDefault(
    chunkOverlap -> 0,
    explodeSplits -> false,
    keepSeparators -> true,
    patternsAreRegex -> false,
    splitPatterns -> Array("\n\n", "\n", " ", ""),
    trimWhitespace -> true)

  /** Finds the begin offset of every chunk inside `text`.
    *
    * Each chunk is searched for literally, starting right after the begin offset of the
    * previously located chunk, so that repeated chunk texts resolve to successive occurrences
    * instead of all pointing at the first one. If the forward search finds nothing, the whole
    * text is searched as a fallback.
    *
    * @param text
    *   the text the chunks were produced from
    * @param chunks
    *   the chunks, in the order they were produced
    * @return
    *   one begin offset per chunk, `-1` where the chunk does not occur in `text`
    */
  private def locateChunks(text: String, chunks: Seq[String]): Seq[Int] = {
    val (_, begins) =
      chunks.foldLeft((-1, Vector.empty[Int])) { case ((lastBegin, located), chunk) =>
        if (chunk.isEmpty) {
          // An empty chunk carries no position information; do not move the search cursor.
          (lastBegin, located :+ text.indexOf(chunk))
        } else {
          val forward = text.indexOf(chunk, lastBegin + 1)
          val begin = if (forward >= 0) forward else text.indexOf(chunk)
          // The cursor never moves backwards, even when only the fallback search matched.
          (math.max(lastBegin, begin), located :+ begin)
        }
      }
    begins
  }

  /** Splits the text of every input annotation into chunks and wraps each chunk in a new
    * DOCUMENT annotation.
    *
    * For every chunk, `begin` is the offset at which the chunk was located in the source text
    * and `end` is `begin` plus the chunk length; both are `-1` when the chunk could not be
    * located. The source annotation's metadata is carried over, with the `"document"` entry set
    * to the index of the chunk within its source annotation, and its embeddings are reused.
    *
    * @param annotations
    *   Annotations that correspond to inputAnnotationCols generated by previous annotators if any
    * @return
    *   any number of annotations processed for every input annotation. Not necessarily a
    *   one-to-one relationship
    */
  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val textSplitter =
      new TextSplitter(
        getChunkSize,
        getChunkOverlap,
        getKeepSeparators,
        getPatternsAreRegex,
        getTrimWhitespace)
    val patterns = getSplitPatterns

    // flatMap keeps the input order, so no re-sorting by annotation index is required.
    annotations.flatMap { annotation =>
      val text = annotation.result
      val textChunks = textSplitter.splitText(text, patterns)
      val chunkBegins = locateChunks(text, textChunks)

      textChunks.zip(chunkBegins).zipWithIndex.map { case ((textChunk, textChunkBegin), index) =>
        val textChunkEnd = if (textChunkBegin >= 0) textChunkBegin + textChunk.length else -1
        new Annotation(
          AnnotatorType.DOCUMENT,
          textChunkBegin,
          textChunkEnd,
          textChunk,
          annotation.metadata ++ Map("document" -> index.toString),
          annotation.embeddings)
      }
    }
  }

  /** Explodes the output column into one row per split when `explodeSplits` is enabled;
    * otherwise returns the dataset unchanged.
    */
  override protected def afterAnnotate(dataset: DataFrame): DataFrame = {
    if (getExplodeSplits) dataset.explodeAnnotationsCol(getOutputCol, getOutputCol) else dataset
  }
}
/** Companion object of [[DocumentCharacterTextSplitter]]. Please refer to that class for the
* documentation of the annotator itself.
*
* The object mixes in `DefaultParamsReadable` for the annotator class; presumably this is what
* lets persisted instances be read back through Spark ML's default params reader (confirm
* against the Spark ML documentation).
*/
object DocumentCharacterTextSplitter extends DefaultParamsReadable[DocumentCharacterTextSplitter]