// ject.ja.docs.WordDoc.scala
// Artifact: ject-ja (ject-ja_2.13)

package ject.ja.docs

import ject.ja.lucene.field.WordField
import ject.ja.text.Inflection
import ject.ja.text.WordType
import ject.ja.JapaneseText
import ject.lucene.field.LuceneField
import ject.lucene.DocDecoder
import ject.lucene.DocEncoder
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.document.NumericDocValuesField
import org.apache.lucene.document.StoredField
import org.apache.lucene.document.StringField
import org.apache.lucene.document.TextField
import zio.*

/** A dictionary word entry with its terms, definitions, tags and ranking numbers.
  *
  * @param id            identifier of the entry
  * @param kanjiTerms    terms of the entry in kanji form
  * @param readingTerms  terms of the entry in reading form
  * @param definitions   definitions of the entry
  * @param tags          tags attached to the entry
  * @param partsOfSpeech part-of-speech tags of the entry
  * @param priority      priority number of the entry
  * @param frequency     frequency number of the entry
  */
final case class WordDoc(
    id: String,
    kanjiTerms: Seq[String],
    readingTerms: Seq[String],
    definitions: Seq[String],
    tags: Seq[String],
    partsOfSpeech: Seq[String],
    priority: Int,
    frequency: Int
) {

  /** Every term of the entry: the kanji terms followed by the reading terms. */
  def terms: Seq[String] = kanjiTerms ++ readingTerms

  /** One-line text form: the terms separated by spaces, then `": "`, then the definitions
    * separated by `"; "`.
    */
  def render: String = {
    val joinedTerms       = terms.mkString(" ")
    val joinedDefinitions = definitions.mkString("; ")
    s"$joinedTerms: $joinedDefinitions"
  }
}

/** Lucene codecs for [[WordDoc]]: a decoder that rebuilds a `WordDoc` from a document's stored
  * fields, and an encoder that turns a `WordDoc` into a document, optionally adding fields for
  * inflected forms of its terms.
  */
object WordDoc {

  /** Rebuilds a [[WordDoc]] from the stored fields of a Lucene document. */
  implicit val docDecoder: DocDecoder[WordDoc] = new DocDecoder[WordDoc] {
    val analyzer: Analyzer = LuceneField.perFieldAnalyzer(WordField.values)

    def decode(document: Document): WordDoc = {
      // Value of a single-valued stored field.
      def single(field: WordField): String =
        document.get(field.entryName)

      // All values of a multi-valued stored field.
      def multi(field: WordField): Seq[String] =
        document.getValues(field.entryName).toIndexedSeq

      WordDoc(
        id = single(WordField.Id),
        kanjiTerms = multi(WordField.KanjiTerm),
        readingTerms = multi(WordField.ReadingTerm),
        definitions = multi(WordField.Definition),
        tags = multi(WordField.Tags),
        partsOfSpeech = multi(WordField.PartOfSpeech),
        priority = single(WordField.Priority).toInt,
        frequency = single(WordField.Frequency).toInt
      )
    }
  }

  /** Encoder producing a Lucene document for a [[WordDoc]].
    *
    * @param includeInflections when true, fields for inflected forms of the terms are added to
    *                           the document after the base fields
    */
  def docEncoder(includeInflections: Boolean): DocEncoder[WordDoc] = (word: WordDoc) =>
    ZIO.attempt(baseDocument(word)).flatMap { doc =>
      indexInflections(word, doc).when(includeInflections).as(doc)
    }

  /** Builds the document holding every non-inflection field of `word`. */
  private def baseDocument(word: WordDoc): Document = {
    val doc = new Document()

    // Stored, non-analyzed field.
    def addKeyword(field: WordField, value: String): Unit =
      doc.add(new StringField(field.entryName, value, Field.Store.YES))

    // Analyzed field that is not stored.
    def addAnalyzed(field: WordField, value: String): Unit =
      doc.add(new TextField(field.entryName, value, Field.Store.NO))

    // A number is written twice under the same name: as a stored field and as doc values.
    def addNumber(field: WordField, value: Int): Unit = {
      doc.add(new StoredField(field.entryName, value))
      doc.add(new NumericDocValuesField(field.entryName, value))
    }

    addKeyword(WordField.Id, word.id)

    for (term <- word.kanjiTerms) {
      addKeyword(WordField.KanjiTerm, term)
      addAnalyzed(WordField.KanjiTermAnalyzed, term)
    }

    for (term <- word.readingTerms) {
      addKeyword(WordField.ReadingTerm, term)
      addAnalyzed(WordField.ReadingTermAnalyzed, term)
    }

    for (definition <- word.definitions) {
      // The stored definition is an analyzed text field, unlike the keyword fields above.
      doc.add(new TextField(WordField.Definition.entryName, definition, Field.Store.YES))
      addAnalyzed(WordField.DefinitionOther, definition)
    }

    word.tags.foreach(addKeyword(WordField.Tags, _))
    word.partsOfSpeech.foreach(addKeyword(WordField.PartOfSpeech, _))

    addNumber(WordField.Priority, word.priority)
    addNumber(WordField.Frequency, word.frequency)

    doc
  }

  /** Adds non-stored fields for the inflected forms of the kanji and reading terms of `d` to
    * `document`. Nothing is added when no part-of-speech tag maps to a word type.
    */
  private def indexInflections(d: WordDoc, document: Document): Task[Unit] = {
    // Adds every distinct inflected form of `terms`, plus its hiragana conversion, to `field`.
    def indexTerms(terms: Seq[String], field: WordField, wordType: WordType): Task[Unit] = {
      val inflectedForms = terms.flatMap { term =>
        Inflection.inflectAll(term, wordType).flatMap {
          // Only successful inflections contribute; each form is kept as-is and in hiragana.
          case (_, Right(forms)) =>
            (forms ++ forms.map(JapaneseText.toHiragana)).toChunk

          case _ =>
            Chunk.empty
        }
      }
      val uniqueForms = inflectedForms.distinct

      ZIO.foreachDiscard(uniqueForms) { form =>
        ZIO.attempt {
          document.add(new StringField(field.entryName, form, Field.Store.NO))
        }
      }
    }

    def exactly(tag: String): String => Boolean = candidate => candidate == tag
    def prefixed(prefix: String): String => Boolean = candidate => candidate.startsWith(prefix)

    // Ordered rules: the first rule satisfied by any part-of-speech tag wins, so the specific
    // "v5k-s" prefix has to come before the broader "v5" prefix.
    val rules = List[(String => Boolean, WordType)](
      (exactly("adj-i"), WordType.AdjectiveI),
      (exactly("v1"), WordType.VerbIchidan),
      (prefixed("v5k-s"), WordType.VerbIku),
      (prefixed("v5"), WordType.VerbGodan),
      (exactly("vs"), WordType.VerbSuru),
      (exactly("vs-s"), WordType.VerbSuru),
      (exactly("vs-i"), WordType.VerbSuru)
    )

    val wordTypeOpt: Option[WordType] = rules.collectFirst {
      case (matches, wordType) if d.partsOfSpeech.exists(matches) => wordType
    }

    ZIO.foreachDiscard(wordTypeOpt) { wordType =>
      indexTerms(d.kanjiTerms, WordField.KanjiTermInflected, wordType) *>
        indexTerms(d.readingTerms, WordField.ReadingTermInflected, wordType)
    }
  }
}