/*
* Copyright 2017-2022 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.nlp.annotators.ws
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN}
import com.johnsnowlabs.nlp.annotators.pos.perceptron.{
PerceptronTrainingUtils,
TrainingPerceptronLegacy
}
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{BooleanParam, DoubleParam, IntParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.Dataset
import scala.collection.mutable.{Map => MMap}
/** Trains a WordSegmenter which tokenizes non-English or non-whitespace-separated texts.
*
 * Many languages, such as Chinese, Japanese or Korean, are not whitespace separated: their
 * sentences are written as an unbroken concatenation of symbols. Without understanding the
 * language, splitting the text into its component words is impossible. The WordSegmenter is
 * trained to understand these languages and split them into semantically correct parts.
*
 * This annotator is based on the paper Chinese Word Segmentation as Character Tagging [1]. Word
 * segmentation is treated as a tagging problem. Each character is tagged as one of four labels:
 * LL (left boundary), RR (right boundary), MM (middle) and LR (word by itself). The label
 * depends on the character's position within a word. Characters tagged LL attach to the
 * character on their right. Likewise, characters tagged RR attach to the character on their
 * left. Characters tagged MM form the middle of a word and attach to both sides, while
 * characters tagged LR form words by themselves.
*
* Example (from [1], Example 3(a) (raw), 3(b) (tagged), 3(c) (translation)):
* - 上海 计划 到 本 世纪 末 实现 人均 国内 生产 总值 五千 美元
* - 上/LL 海/RR 计/LL 划/RR 到/LR 本/LR 世/LL 纪/RR 末/LR 实/LL 现/RR 人/LL 均/RR 国/LL 内/RR 生/LL 产/RR 总/LL
* 值/RR 五/LL 千/RR 美/LL 元/RR
* - Shanghai plans to reach the goal of 5,000 dollars in per capita GDP by the end of the
* century.
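 *
 * To illustrate the tagging scheme only (this is not the annotator's internal decoder), a
 * tagged character sequence can be merged back into words by starting a new word at every
 * `LL` or `LR` tag:
 * {{{
 * // Hypothetical helper, shown only to clarify how the four tags delimit words.
 * def decode(tagged: Seq[(String, String)]): Seq[String] =
 *   tagged.foldLeft(Vector.empty[String]) { case (words, (char, tag)) =>
 *     if (tag == "LL" || tag == "LR" || words.isEmpty) words :+ char
 *     else words.init :+ (words.last + char)
 *   }
 *
 * decode(Seq("上" -> "LL", "海" -> "RR", "到" -> "LR")) // Vector(上海, 到)
 * }}}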
*
* For instantiated/pretrained models, see [[WordSegmenterModel]].
*
 * To train your own model, a training dataset consisting of
 * [[https://en.wikipedia.org/wiki/Part-of-speech_tagging Part-Of-Speech tags]] is required. The
 * data has to be loaded into a dataframe, where the tag column contains
 * [[com.johnsnowlabs.nlp.Annotation Annotation]]s of type `"POS"`. The name of this column is
 * set with `setPosColumn`.
*
 * '''Tip''': The helper class [[com.johnsnowlabs.nlp.training.POS POS]] might be useful to read
 * training data into data frames.
 *
 * For extended examples of usage, see the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/chinese/word_segmentation Examples]]
* and the
* [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/WordSegmenterTest.scala WordSegmenterTest]].
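 *
 * When the training data is read with the `POS` helper, the resulting dataframe contains,
 * roughly, the following columns (a sketch, assuming the reader's default column names):
 * {{{
 * // text:     the raw sentence as a String
 * // document: annotations of type "document", covering the whole sentence
 * // tags:     annotations of type "pos", one per token; referenced via setPosColumn("tags")
 * }}}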
*
* '''References:'''
* - [[https://aclanthology.org/O03-4002.pdf [1]]] Xue, Nianwen. “Chinese Word Segmentation as
* Character Tagging.” International Journal of Computational Linguistics & Chinese Language
* Processing, Volume 8, Number 1, February 2003: Special Issue on Word Formation and Chinese
* Language Processing, 2003, pp. 29-48. ACLWeb, https://aclanthology.org/O03-4002.
*
* ==Example==
* In this example, `"chinese_train.utf8"` is in the form of
* {{{
* 十|LL 四|RR 不|LL 是|RR 四|LL 十|RR
* }}}
* and is loaded with the `POS` class to create a dataframe of `"POS"` type Annotations.
* {{{
* import com.johnsnowlabs.nlp.base.DocumentAssembler
* import com.johnsnowlabs.nlp.annotators.ws.WordSegmenterApproach
* import com.johnsnowlabs.nlp.training.POS
* import org.apache.spark.ml.Pipeline
*
* val documentAssembler = new DocumentAssembler()
* .setInputCol("text")
* .setOutputCol("document")
*
* val wordSegmenter = new WordSegmenterApproach()
* .setInputCols("document")
* .setOutputCol("token")
* .setPosColumn("tags")
* .setNIterations(5)
*
* val pipeline = new Pipeline().setStages(Array(
* documentAssembler,
* wordSegmenter
* ))
*
* val trainingDataSet = POS().readDataset(
* spark,
* "src/test/resources/word-segmenter/chinese_train.utf8"
* )
*
* val pipelineModel = pipeline.fit(trainingDataSet)
* }}}
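 *
 * The fitted `pipelineModel` can then segment new text, for example:
 * {{{
 * // A minimal usage sketch, assuming a SparkSession named `spark` is in scope.
 * import spark.implicits._
 *
 * val result = pipelineModel.transform(Seq("十四不是四十").toDF("text"))
 * result.selectExpr("token.result").show(false)
 * }}}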
*
* @param uid
* required uid for storing annotator to disk
* @groupname anno Annotator types
* @groupdesc anno
* Required input and expected output annotator types
* @groupname Ungrouped Members
* @groupname param Parameters
* @groupname setParam Parameter setters
* @groupname getParam Parameter getters
* @groupprio param 1
* @groupprio anno 2
* @groupprio Ungrouped 3
* @groupprio setParam 4
* @groupprio getParam 5
* @groupdesc param
* A list of (hyper-)parameter keys this annotator can take. Users can set and get the
* parameter values through setters and getters, respectively.
*/
class WordSegmenterApproach(override val uid: String)
extends AnnotatorApproach[WordSegmenterModel]
with PerceptronTrainingUtils {
/** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
* type
*/
def this() = this(Identifiable.randomUID("WORD_SEGMENTER"))
override val description: String = "Word segmentation"
/** Column of Array of POS tags that match tokens
*
* @group param
*/
val posCol = new Param[String](this, "posCol", "column of Array of POS tags that match tokens")
/** Number of training iterations; more iterations generally yield better accuracy (Default:
 * `5`)
*
* @group param
*/
val nIterations = new IntParam(
this,
"nIterations",
"Number of iterations in training, converges to better accuracy")
/** Minimum number of times a tag must appear on a word for the word to be marked as frequent
 * (Default: `20`)
*
* @group param
*/
val frequencyThreshold = new IntParam(
this,
"frequencyThreshold",
"How many times at least a tag on a word to be marked as frequent")
/** Fraction of a word's occurrences that its most frequent tag must cover for the word to be
 * tagged unambiguously (Default: `0.97`)
*
* @group param
*/
val ambiguityThreshold = new DoubleParam(
this,
"ambiguityThreshold",
"How much percentage of total amount of words are covered to be marked as frequent")
/** Whether to use a RegexTokenizer before segmentation, useful for multilingual text (Default:
 * `false`).
 *
 * @group param
 */
val enableRegexTokenizer: BooleanParam = new BooleanParam(
this,
"enableRegexTokenizer",
"Whether to use RegexTokenizer before segmentation. Useful for multilingual text")
/** Indicates whether to convert all characters to lowercase before tokenizing (Default:
* `false`).
*
* @group param
*/
val toLowercase: BooleanParam = new BooleanParam(
this,
"toLowercase",
"Indicates whether to convert all characters to lowercase before tokenizing. Used only when enableRegexTokenizer is true")
/** Regex pattern used to match delimiters (Default: `"\\s+"`)
*
* @group param
*/
val pattern: Param[String] = new Param(
this,
"pattern",
"regex pattern used for tokenizing. Used only when enableRegexTokenizer is true")
/** @group setParam */
def setPosColumn(value: String): this.type = set(posCol, value)
/** @group setParam */
def setNIterations(value: Int): this.type = set(nIterations, value)
/** @group setParam */
def setFrequencyThreshold(value: Int): this.type = set(frequencyThreshold, value)
/** @group setParam */
def setAmbiguityThreshold(value: Double): this.type = set(ambiguityThreshold, value)
/** @group setParam */
def setEnableRegexTokenizer(value: Boolean): this.type = set(enableRegexTokenizer, value)
/** @group setParam */
def setToLowercase(value: Boolean): this.type = set(toLowercase, value)
/** @group setParam */
def setPattern(value: String): this.type = set(pattern, value)
setDefault(
nIterations -> 5,
frequencyThreshold -> 20,
ambiguityThreshold -> 0.97,
enableRegexTokenizer -> false,
toLowercase -> false,
pattern -> "\\s+")
/** @group getParam */
def getNIterations: Int = $(nIterations)
override def train(
dataset: Dataset[_],
recursivePipeline: Option[PipelineModel]): WordSegmenterModel = {
// Extract the tagged sentences from the dataset's POS column
val taggedSentences = generatesTagBook(dataset)
val taggedWordBook =
buildTagBook(taggedSentences, $(frequencyThreshold), $(ambiguityThreshold))
// Find all distinct tags; these become the classes of the perceptron
val classes = taggedSentences.flatMap(_.tags).distinct
val initialModel = new TrainingPerceptronLegacy(classes, taggedWordBook, MMap())
val finalModel =
trainPerceptron($(nIterations), initialModel, taggedSentences, taggedWordBook)
new WordSegmenterModel()
.setModel(finalModel)
.setEnableRegexTokenizer($(enableRegexTokenizer))
.setToLowercase($(toLowercase))
.setPattern($(pattern))
}
/** Output Annotator Types: TOKEN
*
* @group anno
*/
override val outputAnnotatorType: AnnotatorType = TOKEN
/** Input Annotator Types: DOCUMENT
*
* @group anno
*/
override val inputAnnotatorTypes: Array[String] = Array(DOCUMENT)
}
/** This is the companion object of [[WordSegmenterApproach]]. Please refer to that class for the
* documentation.
*/
object WordSegmenterApproach extends DefaultParamsReadable[WordSegmenterApproach]