All downloads are free. Search and download functionality uses the official Maven repository.

com.mayabot.nlp.segment.IterableMode.kt Maven / Gradle / Ivy

package com.mayabot.nlp.segment

import java.util.*

/**
 * Selects which terms are emitted when iterating over segmented words that
 * may carry sub-words.
 */
enum class WordTermIterableMode {

    /** Do not emit sub-words. */
    TOP,

    /**
     * Emit both the merged word and its sub-words.
     *
     * Example: for the merged word 北京大学 the output is
     * 北京大学, 北京, 大学.
     */
    Overlap,

    /** Emit only the sub-words. */
    ATOM
}

/**
 * Iterable view over [wrap] whose iterators are [OverlapIterator]s.
 *
 * Every call to [iterator] asks [wrap] for a fresh iterator and decorates it.
 */
class OverlapIterable(val wrap: Iterable) : Iterable {

    /** Returns a new [OverlapIterator] reading from a fresh iterator of [wrap]. */
    override fun iterator(): Iterator = OverlapIterator(wrap.iterator())
}

/**
 * Iterator that emits each term from [from] and, when the term has sub-words,
 * emits those sub-words right after it before moving on to the next term.
 *
 * Illustration (merged words on the first row, sub-words underneath):
 *
 *     北京大学     的 学生
 *     北京 大学
 *
 * Iteration order for the above: 北京大学, 北京, 大学, 的, 学生.
 */
class OverlapIterator(val from: Iterator) : AbstractIterator() {

    /** Sub-words of the most recently emitted term that are still to be emitted, or `null`. */
    var buffer: LinkedList? = null

    override fun computeNext() {
        // Drain pending sub-words first.
        val pending = buffer
        if (pending != null) {
            if (!pending.isEmpty()) {
                setNext(pending.poll()!!)
                return
            }
            // Buffer exhausted: drop it and fall through to the source iterator.
            buffer = null
        }

        if (!from.hasNext()) {
            done()
            return
        }

        // Emit the term itself; its sub-words (if any) are queued for the following calls.
        val term = from.next()
        setNext(term)
        if (term.hasSubword()) {
            val subwords = LinkedList(term.subword)
            // The first sub-word gets posInc = 0 (presumably a Lucene-style position
            // increment, so it shares the position of the term just emitted), see
            // https://lucene.apache.org/core/8_1_0/core/org/apache/lucene/analysis/package-summary.html#package.description
            subwords.first.posInc = 0
            buffer = subwords
        }
    }
}


/**
 * Iterable view over [wrap] whose iterators are [AtomIterator]s.
 *
 * Every call to [iterator] asks [wrap] for a fresh iterator and decorates it.
 */
class AtomIterable(val wrap: Iterable) : Iterable {

    /** Returns a new [AtomIterator] reading from a fresh iterator of [wrap]. */
    override fun iterator(): Iterator = AtomIterator(wrap.iterator())
}


/**
 * Iterator that replaces every term having sub-words with those sub-words.
 *
 * A term for which `hasSubword()` is false is emitted as-is; a term with
 * sub-words is never emitted itself, only its sub-words are, in order.
 */
class AtomIterator(val from: Iterator) : AbstractIterator() {

    /** Sub-words of the current term that are still to be emitted, or `null`. */
    var buffer: LinkedList? = null

    override fun computeNext() {
        // Drain pending sub-words first.
        val pending = buffer
        if (pending != null) {
            if (!pending.isEmpty()) {
                setNext(pending.poll()!!)
                return
            }
            // Buffer exhausted: drop it and fall through to the source iterator.
            buffer = null
        }

        if (!from.hasNext()) {
            done()
            return
        }

        val term = from.next()
        if (!term.hasSubword()) {
            // No sub-words: the term itself is emitted.
            setNext(term)
            return
        }

        // Queue the sub-words and emit the first one right away; the rest
        // are served from the buffer on the following calls.
        val queue = LinkedList(term.subword)
        buffer = queue
        setNext(queue.poll()!!)
    }

}


//
//
//class GraphIterable(val wrap: Iterable) : Iterable {
//
//    override fun iterator(): Iterator {
//        return GraphIterator(wrap.iterator())
//    }
//
//
//}
//
//
//class GraphIterator(val from: Iterator) : AbstractIterator() {
//    var buffer: LinkedList? = null
//    override fun computeNext() {
//        val b = buffer
//        if (b != null) {
//            if (b.isEmpty()) {
//                buffer = null
//            } else {
//                setNext(b.poll()!!)
//                return
//            }
//        }
//
//        if (from.hasNext()) {
//            val next = from.next()
//            setNext(next)
//            if (next.hasSubword()) {
//                buffer = LinkedList(next.subword).apply {
//                    //第一个字词的pos是0
//                    //https://lucene.apache.org/core/8_1_0/core/org/apache/lucene/analysis/package-summary.html#package.description
//                    first.posInc = 0
//                }
//            }
//        } else {
//            done()
//        }
//    }
//
//}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy