/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators.ws
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN}
import com.johnsnowlabs.nlp.annotators.pos.perceptron.{
PerceptronTrainingUtils,
TrainingPerceptronLegacy
}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{BooleanParam, DoubleParam, IntParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
import scala.collection.mutable.{Map => MMap}
/** Trains a WordSegmenter which tokenizes non-English or non-whitespace separated texts.
 *
 * Many languages, such as Chinese, Japanese or Korean, are not whitespace separated: their
 * sentences consist of an unbroken sequence of symbols. Without knowledge of the language,
 * splitting such text into its corresponding tokens is impossible. The WordSegmenter is
 * trained to understand these languages and split them into semantically correct parts.
*
* For instantiated/pretrained models, see [[WordSegmenterModel]].
*
* To train your own model, a training dataset consisting of
* [[https://en.wikipedia.org/wiki/Part-of-speech_tagging Part-Of-Speech tags]] is required. The
* data has to be loaded into a dataframe, where the column is an
* [[com.johnsnowlabs.nlp.Annotation Annotation]] of type `"POS"`. This can be set with
* `setPosColumn`.
*
* '''Tip''': The helper class [[com.johnsnowlabs.nlp.training.POS POS]] might be useful to read
* training data into data frames.
*
* For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp-workshop/tree/master/jupyter/annotation/chinese/word_segmentation Spark NLP Workshop]]
* and the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/WordSegmenterTest.scala WordSegmenterTest]].
*
* ==Example==
* In this example, `"chinese_train.utf8"` is in the form of
* {{{
* 十|LL 四|RR 不|LL 是|RR 四|LL 十|RR
* }}}
* and is loaded with the `POS` class to create a dataframe of `"POS"` type Annotations.
 * {{{
 * import com.johnsnowlabs.nlp.base.DocumentAssembler
 * import com.johnsnowlabs.nlp.annotators.ws.WordSegmenterApproach
 * import com.johnsnowlabs.nlp.training.POS
 * import org.apache.spark.ml.Pipeline
 *
 * val documentAssembler = new DocumentAssembler()
 *   .setInputCol("text")
 *   .setOutputCol("document")
 *
 * val wordSegmenter = new WordSegmenterApproach()
 *   .setInputCols("document")
 *   .setOutputCol("token")
 *   .setPosColumn("tags")
 *   .setNIterations(5)
 *
 * val pipeline = new Pipeline().setStages(Array(
 *   documentAssembler,
 *   wordSegmenter
 * ))
 *
 * val trainingDataSet = POS().readDataset(
 *   spark,
 *   "src/test/resources/word-segmenter/chinese_train.utf8"
 * )
 *
 * val pipelineModel = pipeline.fit(trainingDataSet)
 * }}}
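 *
 * The fitted pipeline can then segment raw text, for example the sentence from the training
 * sample above (a minimal sketch, reusing the same `spark` session):
 * {{{
 * import spark.implicits._
 *
 * val data = Seq("十四不是四十").toDF("text")
 * val result = pipelineModel.transform(data)
 * result.selectExpr("token.result").show(truncate = false)
 * }}}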
*
* @param uid
* required uid for storing annotator to disk
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class WordSegmenterApproach(override val uid: String)
extends AnnotatorApproach[WordSegmenterModel]
with PerceptronTrainingUtils {
/** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
* type
*/
def this() = this(Identifiable.randomUID("WORD_SEGMENTER"))
override val description: String = "Word segmentation"
/** Column of Array of POS tags that match tokens
*
* @group param
*/
val posCol = new Param[String](this, "posCol", "column of Array of POS tags that match tokens")
  /** Number of training iterations; more iterations generally converge to better accuracy
   * (Default: `5`)
   *
   * @group param
   */
  val nIterations = new IntParam(
    this,
    "nIterations",
    "Number of training iterations; more iterations generally converge to better accuracy")
  /** Minimum number of times a tag must appear on a word for the word to be marked as
   * frequent (Default: `20`)
   *
   * @group param
   */
  val frequencyThreshold = new IntParam(
    this,
    "frequencyThreshold",
    "Minimum number of times a tag must appear on a word for the word to be marked as frequent")
  /** Percentage of a word's occurrences that its most frequent tag must cover for the word
   * to be marked as unambiguous (Default: `0.97`)
   *
   * @group param
   */
  val ambiguityThreshold = new DoubleParam(
    this,
    "ambiguityThreshold",
    "Percentage of a word's occurrences that its most frequent tag must cover for the word to be marked as unambiguous")
  /** Whether to use a RegexTokenizer before segmentation, which can be useful for
   * multilingual text (Default: `false`)
   *
   * @group param
   */
  val enableRegexTokenizer: BooleanParam = new BooleanParam(
    this,
    "enableRegexTokenizer",
    "Whether to use RegexTokenizer before segmentation. Useful for multilingual text")
  /** Indicates whether to convert all characters to lowercase before tokenizing. Only used
   * when `enableRegexTokenizer` is true (Default: `false`).
   *
   * @group param
   */
  val toLowercase: BooleanParam = new BooleanParam(
    this,
    "toLowercase",
    "Indicates whether to convert all characters to lowercase before tokenizing. Used only when enableRegexTokenizer is true")
  /** Regex pattern used to match delimiters. Only used when `enableRegexTokenizer` is true
   * (Default: `"\\s+"`)
   *
   * @group param
   */
  val pattern: Param[String] = new Param(
    this,
    "pattern",
    "regex pattern used for tokenizing. Used only when enableRegexTokenizer is true")
/** @group setParam */
def setPosColumn(value: String): this.type = set(posCol, value)
/** @group setParam */
def setNIterations(value: Int): this.type = set(nIterations, value)
/** @group setParam */
def setFrequencyThreshold(value: Int): this.type = set(frequencyThreshold, value)
/** @group setParam */
def setAmbiguityThreshold(value: Double): this.type = set(ambiguityThreshold, value)
/** @group setParam */
def setEnableRegexTokenizer(value: Boolean): this.type = set(enableRegexTokenizer, value)
/** @group setParam */
def setToLowercase(value: Boolean): this.type = set(toLowercase, value)
/** @group setParam */
def setPattern(value: String): this.type = set(pattern, value)
setDefault(
nIterations -> 5,
frequencyThreshold -> 20,
ambiguityThreshold -> 0.97,
enableRegexTokenizer -> false,
toLowercase -> false,
pattern -> "\\s+")
/** @group getParam */
def getNIterations: Int = $(nIterations)
override def train(
dataset: Dataset[_],
recursivePipeline: Option[PipelineModel]): WordSegmenterModel = {
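    // Collect the tagged sentences from the dataset's POS column and build the lookup of
    // frequent, unambiguous words using the configured thresholds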
val taggedSentences = generatesTagBook(dataset)
val taggedWordBook =
buildTagBook(taggedSentences, $(frequencyThreshold), $(ambiguityThreshold))
    // Find all distinct tags and store them
val classes = taggedSentences.flatMap(_.tags).distinct
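    // Train an averaged perceptron tagger over the tagged sentences for nIterations epochs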
val initialModel = new TrainingPerceptronLegacy(classes, taggedWordBook, MMap())
val finalModel =
trainPerceptron($(nIterations), initialModel, taggedSentences, taggedWordBook)
new WordSegmenterModel()
.setModel(finalModel)
.setEnableRegexTokenizer($(enableRegexTokenizer))
.setToLowercase($(toLowercase))
.setPattern($(pattern))
}
/** Output Annotator Types: TOKEN
*
* @group anno
*/
override val outputAnnotatorType: AnnotatorType = TOKEN
/** Input Annotator Types: DOCUMENT
*
* @group anno
*/
override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT)
}
/** This is the companion object of [[WordSegmenterApproach]]. Please refer to that class for the
* documentation.
*/
object WordSegmenterApproach extends DefaultParamsReadable[WordSegmenterApproach]