com.intel.analytics.bigdl.dataset.text.Dictionary.scala Maven / Gradle / Ivy
The newest version!
/*
* Copyright 2016 The BigDL Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intel.analytics.bigdl.dataset.text
import java.io.{File, PrintWriter, Serializable}
import org.apache.log4j.Logger
import org.apache.log4j.spi.LoggerFactory
import org.apache.spark.rdd.RDD
import scala.util.Random
/**
 * Vocabulary built either from tokenized text or from a dictionary
 * previously written with [[save]].
 *
 * The `vocabSize` most frequent words are kept and each is assigned an
 * integer index; every other word goes to the "discard" list.
 * Indices are assigned in ascending frequency order, so the most frequent
 * kept word receives the largest index.
 *
 * A dictionary created with the no-argument constructor is empty.
 */
class Dictionary()
  extends Serializable {

  // The logger is not part of the serialized state. It is declared as a
  // `lazy val` so that it is re-created on first use after deserialization
  // instead of being left as null.
  @transient
  private lazy val logger = Logger.getLogger(getClass)

  // All state starts out empty (never null) so that a dictionary created
  // with the no-argument constructor can be queried safely.
  private var _vocabSize: Int = 0
  private var _discardSize: Int = 0
  private var _word2index: Map[String, Int] = Map.empty
  private var _index2word: Map[Int, String] = Map.empty
  private var _vocabulary: Seq[String] = Vector.empty
  private var _discardVocab: Seq[String] = Vector.empty

  /**
   * The length of the vocabulary.
   */
  def getVocabSize(): Int = _vocabSize

  /**
   * The number of words that were discarded because they were not among
   * the most frequent `vocabSize` words.
   */
  def getDiscardSize(): Int = _discardSize

  /**
   * Word encoding by its index in the dictionary.
   */
  def word2Index(): Map[String, Int] = _word2index

  /**
   * Reverse mapping of [[word2Index]]: index to word.
   */
  def index2Word(): Map[Int, String] = _index2word

  /**
   * Return the array of all selected words, ordered by their index.
   */
  def vocabulary(): Array[String] = _vocabulary.toArray

  /**
   * Return the array of all discarded words.
   */
  def discardVocab(): Array[String] = _discardVocab.toArray

  /**
   * Return the encoding number of a word.
   * If the word does not exist in the dictionary, the dictionary length
   * is returned as the default index.
   *
   * @param word the word to look up
   */
  def getIndex(word: String): Int = {
    _word2index.getOrElse(word, _vocabSize)
  }

  /**
   * Same as `getWord(index: Int)`; the index is truncated with `toInt`.
   */
  def getWord(index: Float): String = {
    getWord(index.toInt)
  }

  /**
   * Same as `getWord(index: Int)`; the index is truncated with `toInt`.
   */
  def getWord(index: Double): String = {
    getWord(index.toInt)
  }

  /**
   * Return the word with regard to the index.
   * If the index is not in the dictionary, a random word from the discarded
   * word list is returned. If the discard list is empty, a random word from
   * the vocabulary is returned instead.
   *
   * @param index the index to decode
   * @throws IllegalArgumentException if the index is unknown and both the
   *                                  vocabulary and the discard list are empty
   */
  def getWord(index: Int): String = {
    // The default argument of getOrElse is by-name, so no random number is
    // drawn when the index is present.
    _index2word.getOrElse(index, randomFallbackWord())
  }

  /** Picks the random replacement word used by `getWord` for unknown indices. */
  private def randomFallbackWord(): String = {
    if (_discardSize > 0) {
      _discardVocab(Random.nextInt(_discardSize))
    } else {
      require(_vocabSize > 0,
        "Dictionary: cannot pick a word from an empty dictionary.")
      // Direct positional lookup: `_vocabulary` is kept in index order, and
      // this always terminates, even for a loaded dictionary whose indices
      // are not contiguous.
      _vocabulary(Random.nextInt(_vocabSize))
    }
  }

  /**
   * Log the word-to-index dictionary, one entry per log line.
   */
  def print(): Unit = {
    _word2index.foreach { case (word, index) =>
      logger.info(s"$word -> $index")
    }
  }

  /**
   * Log the discarded words, one word per log line.
   */
  def printDiscard(): Unit = {
    _discardVocab.foreach(word => logger.info(word))
  }

  /**
   * Save the dictionary and the discarded words to the saveFolder
   * directory, as `dictionary.txt` (one `word -> index` entry per line)
   * and `discard.txt` (one word per line).
   *
   * @param saveFolder an existing directory
   * @throws IllegalArgumentException if saveFolder is not a directory
   */
  def save(saveFolder: String): Unit = {
    val saveTo = new File(saveFolder)
    require(saveTo.isDirectory,
      "Dictionary: saveFolder should be a directory.")

    val entries = _word2index.iterator
      .map { case (word, index) => s"$word -> $index" }
      .mkString("\n")
    writeText(new File(saveTo.getAbsolutePath, "dictionary.txt"), entries)
    writeText(new File(saveTo.getAbsolutePath, "discard.txt"),
      _discardVocab.mkString("\n"))

    logger.info("save created dictionary.txt and discard.txt to " +
      s"${saveTo.getAbsolutePath}")
  }

  /**
   * Build the dictionary from a distributed dataset of tokenized sentences.
   *
   * @param dataset   tokenized sentences
   * @param vocabSize maximum number of words to keep
   */
  def this(dataset: RDD[Array[String]], vocabSize: Int) = {
    this()
    val words = dataset.flatMap(x => x)
    logger.info(s"${words.count()} words and " +
      s"${dataset.count()} sentences processed")
    val freqDict = words
      .map(w => (w, 1))
      .reduceByKey(_ + _)
      .collect().sortBy(_._2)
    update(freqDict.toSeq, vocabSize)
  }

  /**
   * Build the dictionary from an iterator of tokenized sentences.
   *
   * @param sentences tokenized sentences; the iterator is consumed
   * @param vocabSize maximum number of words to keep
   */
  def this(sentences: Iterator[Array[String]],
    vocabSize: Int) = {
    this()
    update(countFrequencies(sentences.flatMap(_.iterator)), vocabSize)
  }

  /**
   * Build the dictionary from a flat array of words.
   *
   * @param words     the words to count
   * @param vocabSize maximum number of words to keep
   */
  def this(words: Array[String],
    vocabSize: Int) = {
    this()
    update(countFrequencies(words.iterator), vocabSize)
  }

  /**
   * Build the dictionary from a stream of tokenized sentences.
   *
   * @param sentences tokenized sentences
   * @param vocabSize maximum number of words to keep
   */
  def this(sentences: Stream[Array[String]],
    vocabSize: Int) = {
    this()
    update(countFrequencies(sentences.iterator.flatMap(_.iterator)), vocabSize)
  }

  /**
   * Load a dictionary from `dictionary.txt` and `discard.txt` inside
   * the given directory (the files written by [[save]]).
   *
   * @param directory folder containing both files
   * @throws IllegalArgumentException if a file is missing or a dictionary
   *                                  entry is malformed
   */
  def this(directory: String) = {
    this()
    val dictionaryFile = new File(directory, "dictionary.txt")
    require(dictionaryFile.exists() && dictionaryFile.isFile,
      "dictionaryFile does not exist or is not a File type.")
    val discardFile = new File(directory, "discard.txt")
    require(discardFile.exists() && discardFile.isFile,
      "discardFile does not exist or is not a File type.")

    // Empty lines (e.g. a trailing newline added by an editor) carry no
    // entry and are skipped.
    _word2index = readLines(dictionaryFile)
      .filter(_.nonEmpty)
      .map(parseEntry)
      .toMap
    _index2word = _word2index.map(_.swap)
    // Keep the vocabulary in index order, as `update` does.
    _vocabulary = _word2index.toVector.sortBy(_._2).map(_._1)
    _vocabSize = _word2index.size
    _discardVocab = readLines(discardFile)
    _discardSize = _discardVocab.length
  }

  /**
   * Count the occurrences of each word and return the (word, count) pairs
   * sorted by ascending count.
   */
  private def countFrequencies(words: Iterator[String]): Seq[(String, Int)] = {
    words
      .foldLeft(Map.empty[String, Int]) { (count, word) =>
        count + (word -> (count.getOrElse(word, 0) + 1))
      }
      .toSeq
      .sortBy(_._2)
  }

  /**
   * Split a frequency table (sorted by ascending count) into the kept
   * vocabulary, i.e. its last `vocabSize` entries, and the discarded rest,
   * then rebuild both index maps.
   */
  private def update(freqDict: Seq[(String, Int)], vocabSize: Int): Unit = {
    val kept = math.min(vocabSize, freqDict.length)
    val cut = freqDict.length - kept
    _vocabulary = freqDict.drop(cut).map(_._1).toVector
    _vocabSize = _vocabulary.length
    _word2index = _vocabulary.zipWithIndex.toMap
    _index2word = _word2index.map(_.swap)
    _discardVocab = freqDict.take(cut).map(_._1).toVector
    _discardSize = _discardVocab.length
  }

  /**
   * Parse one `word -> index` line of dictionary.txt.
   * The split is made on the LAST "->" so that words which themselves
   * contain "->" survive a save/load round trip.
   */
  private def parseEntry(line: String): (String, Int) = {
    val entry = line.stripLineEnd
    val arrow = entry.lastIndexOf("->")
    require(arrow >= 0,
      s"Dictionary: malformed dictionary entry '$entry', expected 'word -> index'.")
    val word = entry.substring(0, arrow).stripSuffix(" ")
    val index = entry.substring(arrow + 2).stripPrefix(" ")
    word -> index.toInt
  }

  /** Read all lines of a file eagerly and always close the underlying source. */
  private def readLines(file: File): Vector[String] = {
    import scala.io.Source
    val source = Source.fromFile(file.getAbsolutePath)
    try {
      source.getLines().toVector
    } finally {
      source.close()
    }
  }

  /** Write the text to the target file, closing the writer even on failure. */
  private def writeText(target: File, content: String): Unit = {
    val writer = new PrintWriter(target)
    try {
      writer.write(content)
    } finally {
      writer.close()
    }
  }
}
/**
 * Factory methods mirroring the constructors of the `Dictionary` class,
 * so a dictionary can be created without the `new` keyword.
 */
object Dictionary {

  /**
   * Create a dictionary from the files stored in `directory`.
   */
  def apply(directory: String): Dictionary = {
    new Dictionary(directory)
  }

  /**
   * Create a dictionary from a flat array of words, keeping at most
   * `vocabSize` of them.
   */
  def apply(words: Array[String], vocabSize: Int): Dictionary = {
    new Dictionary(words, vocabSize)
  }

  /**
   * Create a dictionary from an iterator of tokenized sentences, keeping
   * at most `vocabSize` words.
   */
  def apply(sentences: Iterator[Array[String]], vocabSize: Int): Dictionary = {
    new Dictionary(sentences, vocabSize)
  }

  /**
   * Create a dictionary from a stream of tokenized sentences, keeping
   * at most `vocabSize` words.
   */
  def apply(dataset: Stream[Array[String]], vocabSize: Int): Dictionary = {
    new Dictionary(dataset, vocabSize)
  }

  /**
   * Create a dictionary from an RDD of tokenized sentences, keeping at
   * most `vocabSize` words (10000 when not specified).
   */
  def apply(dataset: RDD[Array[String]], vocabSize: Int = 10000): Dictionary = {
    new Dictionary(dataset, vocabSize)
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy