org.bitbucket.eunjeon.seunjeon.Analyzer.scala
/**
 * Copyright 2015 youngho yu, yongwoon lee
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package org.bitbucket.eunjeon.seunjeon

import scala.collection.JavaConverters._


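/**
  * Common analyzer behavior: lazily builds a Tokenizer from the bundled
  * lexicon and connection-cost dictionaries, and exposes morpheme- and
  * eojeol-level parsing plus user-dictionary management for Scala and Java.
  */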
trait BasicAnalyzer {
  lazy private[seunjeon] val tokenizer: Tokenizer = {
    val lexiconDict = new LexiconDict().load(false)
    val connectionCostDict = new ConnectionCostDict().load()
    new Tokenizer(lexiconDict, connectionCostDict, false)
  }

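  // Morpheme-level (LNode) and sentence-level (Paragraph) parsing. The
  // single-argument overloads call parseText with dePreAnalysis = true.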
  def parse(sentence: String): Iterable[LNode] = parseParagraph(sentence).flatMap(_.nodes)
  def parseParagraph(sentence: String): Iterable[Paragraph] = tokenizer.parseText(sentence, dePreAnalysis=true)
  def parse(sentence: String, preAnalysis: Boolean): Iterable[LNode] = parseParagraph(sentence, preAnalysis).flatMap(_.nodes)
  def parseParagraph(sentence: String, preAnalysis: Boolean): Iterable[Paragraph] = tokenizer.parseText(sentence, preAnalysis)

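  // Java-facing variants of the parses above, returning java.lang.Iterable.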
  def parseJava(sentence: String): java.lang.Iterable[LNode] = parseParagraph(sentence).flatMap(_.nodes).asJava
  def parseJavaParagraph(sentence: String): java.lang.Iterable[Paragraph] = tokenizer.parseTextJava(sentence, dePreAnalysis = true)
  def parseJava(sentence: String, preAnalysis: Boolean): java.lang.Iterable[LNode] = parseParagraph(sentence, preAnalysis).flatMap(_.nodes).asJava
  def parseJavaParagraph(sentence: String, preAnalysis: Boolean): java.lang.Iterable[Paragraph] = tokenizer.parseTextJava(sentence, preAnalysis)

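  // Load a user dictionary from a directory, a single file, or an in-memory
  // iterator of entries (a java.util.Iterator overload is provided for Java).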
  def setUserDictDir(path: String): Unit =
    tokenizer.setUserDict(new LexiconDict().loadFromDir(path))
  def setUserDictFile(file: String): Unit =
    tokenizer.setUserDict(new LexiconDict().loadFromFile(file))
  def setUserDict(iterator: Iterator[String]): Unit =
    tokenizer.setUserDict(new LexiconDict().loadFromIterator(iterator))
  def setUserDict(iterator: java.util.Iterator[String]): Unit =
    tokenizer.setUserDict(new LexiconDict().loadFromIterator(iterator.asScala))

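  // Eojeol-level parsing: Eojeoler.build regroups morphemes into eojeols,
  // the space-delimited word units of Korean text.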
  def parseEojeol(sentence: String): Iterable[Eojeol] = Eojeoler.build(parseParagraph(sentence)).flatMap(_.eojeols)
  def parseEojeol(paragraphs: Iterable[Paragraph]): Iterable[Eojeol] = Eojeoler.build(paragraphs).flatMap(_.eojeols)
  def parseEojeolParagraph(sentence: String): Iterable[EojeolParagraph] = Eojeoler.build(parseParagraph(sentence))
  def parseEojeolParagraph(paragraphs: Iterable[Paragraph]): Iterable[EojeolParagraph] = Eojeoler.build(paragraphs)

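  // Java-facing eojeol variants.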
  def parseEojeolJava(sentence: String): java.lang.Iterable[Eojeol] = parseEojeol(sentence).asJava
  def parseEojeolJava(paragraphs: java.lang.Iterable[Paragraph]): java.lang.Iterable[Eojeol] = parseEojeol(paragraphs.asScala).asJava
  def parseEojeolParagraphJava(sentence: String): java.lang.Iterable[EojeolParagraph] = Eojeoler.build(parseParagraph(sentence)).asJava
  def parseEojeolParagraphJava(paragraphs: java.lang.Iterable[Paragraph]): java.lang.Iterable[EojeolParagraph] = parseEojeolParagraph(paragraphs.asScala).asJava

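  // Clear all user entries by installing an empty user dictionary.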
  def resetUserDict(): Unit =
    tokenizer.setUserDict(new LexiconDict().loadFromIterator(Seq[String]().toIterator))

  /**
    * Sets the maximum length of tokens emitted with the UNKNOWN tag. If the
    * limit is too high, sentences written without spaces are more likely to
    * be analyzed incorrectly. For example, given the input
    * "농어촌체험휴양하누리마을" with length set to 12, the whole input is
    * analyzed as "농어촌체험휴양하누리마을/UNK".
    * @param length maximum length of an UNKNOWN token
    */
  def setMaxUnkLength(length: Int): Unit = tokenizer.setMaxUnkLength(length)
}

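// Default analyzer, backed by the uncompressed lexicon.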
object Analyzer extends BasicAnalyzer {}

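// Same API as Analyzer, but the lexicon is loaded in compressed form
// (the `true` flags passed to load and to the Tokenizer constructor).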
object CompressedAnalyzer extends BasicAnalyzer {
  override lazy val tokenizer: Tokenizer = {
    val lexiconDict = new LexiconDict().load(true)
    val connectionCostDict = new ConnectionCostDict().load()
    new Tokenizer(lexiconDict, connectionCostDict, true)
  }
}


// TODO: CompressedAnalyzer ?
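
For reference, a minimal usage sketch of the API above. The sample sentences are illustrative, and the "surface,cost" user-dictionary entry format is an assumption based on common seunjeon usage, not something defined in this file:

import org.bitbucket.eunjeon.seunjeon.Analyzer

object AnalyzerExample {
  def main(args: Array[String]): Unit = {
    // Morpheme-level analysis: parse returns Iterable[LNode].
    Analyzer.parse("아버지가방에들어가신다.").foreach(println)

    // Register user entries (assumed "surface" or "surface,cost" format).
    Analyzer.setUserDict(Seq("버카충,-100", "낄끼빠빠").iterator)
    Analyzer.parse("버카충 했어?").foreach(println)

    // Eojeol-level analysis: morphemes grouped by space-delimited words.
    Analyzer.parseEojeol("아버지가 방에 들어가신다.").foreach(println)

    // Cap the length of UNKNOWN tokens for out-of-vocabulary character runs.
    Analyzer.setMaxUnkLength(8)
  }
}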




