// cc.factorie.tutorial.TopicsOverTime.scala
//
// FACTORIE is a toolkit for deployable probabilistic modeling, implemented as a software library in Scala.
// It provides its users with a succinct language for creating relational factor graphs, estimating
// parameters and performing inference.
/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
   http://factorie.cs.umass.edu, http://github.com/factorie
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
package cc.factorie.tutorial
import java.io.File

import cc.factorie._
import cc.factorie.app.nlp.lexicon.StopWords
import cc.factorie.app.strings.alphaSegmenter
import cc.factorie.app.topics.lda.SparseLDAInferencer
import cc.factorie.directed._
import cc.factorie.la._
import cc.factorie.variable._

import scala.collection.mutable.ArrayBuffer

/** A fast approximation to Topics-over-Time that leverages SparseLDAInferencer.
  *
  * Each document carries a time stamp normalised into the unit interval. Every few sampling
  * sweeps a per-topic Beta distribution over those time stamps is re-estimated by moment
  * matching, and the per-topic mean time is turned into an extra smoothing term that is added
  * to the Dirichlet `alphas` handed to the sampler.
  *
  * NOTE(review): the original header says alpha is changed "on a per-document basis", but the
  * smoothing term computed in `main` depends only on the per-topic mean times, not on the
  * document being sampled — confirm whether a per-document dependence was intended.
  */
object TopicsOverTime {
  val numTopics = 100
  val beta1 = 0.1
  val alpha1 = 0.1
  val fitDirichlet = false
  implicit val random: scala.util.Random = new scala.util.Random(0)

  object ZDomain extends DiscreteDomain(numTopics)
  object ZSeqDomain extends DiscreteSeqDomain { def elementDomain = ZDomain }
  class Zs(len:Int) extends DiscreteSeqVariable(len) { def domain = ZSeqDomain }
  object WordSeqDomain extends CategoricalSeqDomain[String]
  val WordDomain = WordSeqDomain.elementDomain

  /** An LDA document that additionally carries a normalised time stamp.
    *
    * `timeStamp` starts at -1.0 and is overwritten in `main` once the time range of the whole
    * corpus is known; it is NaN when that range is empty (see `main`).
    */
  class Document(name:String, myTheta:ProportionsVariable, myZs:Zs, words:Seq[String]) extends cc.factorie.app.topics.lda.Document(WordSeqDomain, name, words) {
    this.theta = myTheta
    this.zs = myZs
    var timeStamp: Double = -1.0
  }

  val beta = MassesVariable.growableUniform(WordDomain, beta1)
  val alphas = MassesVariable.dense(numTopics, alpha1)
  // Per-topic Beta parameters and mean time stamp, refreshed by estimateTopicTimes.
  val timeAlphas = new Array[Double](numTopics)
  val timeBetas  = new Array[Double](numTopics)
  val timeMeans = new DenseTensor1(numTopics)
  implicit val model = DirectedModel()

  /** Re-estimates the per-topic time statistics from the current topic assignments.
    *
    * For every token, the time stamp of its document is recorded under the token's currently
    * assigned topic. The per-topic sample mean and variance are then written into `timeMeans`,
    * and moment-matched Beta parameters into `timeAlphas` / `timeBetas`.
    *
    * Documents whose time stamp is NaN are ignored. Topics with fewer than two recorded time
    * stamps fall back to mean 0.5 and variance 0.25.
    *
    * @param documents the documents whose `zs` and `timeStamp` are read
    */
  def estimateTopicTimes(documents:Seq[Document]): Unit = {
    val topic2times = Array.tabulate(numTopics)(_ => new cc.factorie.util.DoubleArrayBuffer)
    // The NaN test is per document, so it is checked once rather than once per token.
    for (doc <- documents if !doc.timeStamp.isNaN) {
      val stamp = doc.timeStamp
      for (position <- 0 until doc.length)
        topic2times(doc.zs.intValue(position)) += stamp
    }
    val topic2mean = Array.tabulate(numTopics)(t => if (topic2times(t).length > 1) maths.sampleMean(topic2times(t)) else 0.5)
    val topic2variance = Array.tabulate(numTopics)(t => if (topic2times(t).length > 1) maths.sampleVariance(topic2times(t), topic2mean(t)) else 0.25)
    timeMeans := topic2mean
    for (t <- 0 until numTopics) {
      timeAlphas(t) = MaximizeBetaByMomentMatching.maxAlpha(topic2mean(t), topic2variance(t))
      timeBetas(t) =  MaximizeBetaByMomentMatching.maxBeta(topic2mean(t), topic2variance(t))
    }
  }

  /** Lists the regular files directly inside `directory`.
    *
    * `java.io.File.listFiles` returns null when the path is not a directory or cannot be read;
    * that case is reported with the offending path instead of a bare NullPointerException.
    */
  private def regularFilesIn(directory: String): Seq[File] = {
    val entries = Option(new File(directory).listFiles).getOrElse(
      throw new IllegalArgumentException("Cannot list files in '" + directory + "': not a readable directory"))
    entries.filter(_.isFile).toSeq
  }

  /** Reads a corpus, normalises file modification times, and runs 30 sweeps of sparse LDA sampling.
    *
    * @param args directories to read documents from (one document per regular file); when
    *             empty, a hard-coded list of NIPS directories is used
    */
  def main(args: Array[String]): Unit = {
    // Other corpora that were previously selectable by editing hard-coded if(true)/if(false) toggles:
    //   List("acq", "earn", "money-fx").map("/Users/mccallum/research/data/text/reuters/reuters-parsed/modapte/"+_)
    //   List("comp.graphics", "comp.os.ms-windows.misc", "comp.sys.ibm.pc.hardware", "comp.sys.mac.hardware").map("/Users/mccallum/research/data/text/20_newsgroups/"+_)
    val directories =
      if (args.nonEmpty) args.toList
      else List("12", "11", "10", "09", "08", "07").map("/Users/mccallum/research/data/text/nipstxt/nips"+_)
    val phis = Mixture(numTopics)(ProportionsVariable.growableDense(WordDomain) ~ Dirichlet(beta))
    val documents = new ArrayBuffer[Document]
    val stopwords = StopWords
    stopwords += "rainbownum"
    for (directory <- directories; file <- regularFilesIn(directory)) {
      val theta = ProportionsVariable.sortedSparseCounts(numTopics) ~ Dirichlet(alphas)
      val tokens = alphaSegmenter(file).map(_.toLowerCase).filter(!stopwords.contains(_)).toSeq
      val zs = new Zs(tokens.length) :~ PlatedDiscrete(theta)
      val doc = new Document(file.toString, theta, zs, tokens) ~ PlatedCategoricalMixture(phis, zs)
      doc.time = file.lastModified
      documents += doc
    }
    // Fail with a readable message instead of the "empty.max" error from times.max below.
    require(documents.nonEmpty, "No documents were read from: " + directories.mkString(", "))

    // Now that we have the full min-max range of dates, set the doc.timeStamp values to a normalized value
    val times = documents.map(_.time)
    val maxTime = times.max
    val minTime: Double = times.min
    val timeRange: Double = maxTime - minTime
    // Given 0 <= x <= 1, returns x * (1-y) + y, i.e. a value in [y, 1].
    // NOTE(review): the original comment suggested the target range was [y, 1-y]; the formula
    // has been left as it was — confirm which one was intended.
    def squeeze(x:Double, y:Double): Double = x * (1-y) + y
    documents.foreach { doc =>
      // When every file has the same modification time the range is zero and no meaningful
      // normalisation exists; the stamp is NaN (as 0.0/0.0 produced before), which
      // estimateTopicTimes skips.
      doc.timeStamp =
        if (timeRange > 0.0) squeeze((doc.time - minTime) / timeRange, 0.2)
        else Double.NaN
    }
    estimateTopicTimes(documents)

    val sampler = SparseLDAInferencer(ZDomain, WordDomain, documents, alphas.value, beta1, model)

    for (sweep <- 1 to 30) {
      // timeMeans only changes in estimateTopicTimes, which runs after the document loop, so this
      // tensor is the same for every document in a sweep and is built once here.
      val timeSmoothing = Tensor.tabulate(numTopics)(topic => { val m = timeMeans(topic) + 0.5; m*m*m*m*m*m })
      for (doc <- documents) {
        sampler.resetSmoothing(alphas.value + (timeSmoothing * 3.0), beta1)
        sampler.process(doc.zs)
      }
      if (sweep % 5 == 0) {
        sampler.export(phis)
        if (fitDirichlet) {
          sampler.exportThetas(documents)
          MaximizeDirichletByMomentMatching(alphas, model)
          sampler.resetSmoothing(alphas.value, beta1)
        } else {
          estimateTopicTimes(documents)
        }
      }
    }
  }
}