// cc.factorie.app.nlp.load.TacFileIterator.scala (Maven / Gradle / Ivy)

/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
   http://factorie.cs.umass.edu, http://github.com/factorie
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
package cc.factorie.app.nlp.load

import java.io._
import java.util.Scanner
import java.util.zip.GZIPInputStream

import cc.factorie.app.nlp.Document

/** The document types a TAC file can be assigned to, and a path-based classifier for them. */
object TACDocTypes {
  /** Marker type for the kind of document a TAC file contains. */
  sealed trait TACDocumentType
  case object Newswire extends TACDocumentType
  case object DiscussionForum extends TACDocumentType
  case object WebDocument extends TACDocumentType

  object TACDocumentType {
    /**
     * Derives the document type from the absolute path of `f`.
     *
     * The lowercased path is searched for the substrings "discussion_forums", "newswire" and
     * "web", in that order; the first one found decides the type.
     *
     * @param f the file whose absolute path is inspected
     * @return the document type matching the first substring found
     * @throws Exception if the path contains none of the three substrings
     */
    def fromFilePath(f:File):TACDocumentType = {
      // Locale.ROOT keeps the lowercasing independent of the JVM default locale; under e.g. a
      // Turkish locale "I" would lowercase to a dotless i and the substring checks would miss.
      val path = f.getAbsolutePath.toLowerCase(java.util.Locale.ROOT)
      // NOTE(review): "web" matches anywhere in the path (e.g. a user directory named "webster");
      // the check order below is therefore significant.
      if(path.contains("discussion_forums")) {
        DiscussionForum
      } else if(path.contains("newswire")) {
        Newswire
      } else if(path.contains("web")) {
        WebDocument
      } else {
        throw new Exception("Unable to assign document at path %s to a document type".format(path))
      }
    }
  }
}


/**
 * Iterates over the documents stored in a single TAC file, yielding one [[Document]] per
 * document element. A file whose name ends in ".gz" is read through a GZIP stream.
 *
 * Each returned document holds the raw text from its start line through its end line (every
 * line followed by "\n"), is named with the id found in the markup, and carries the
 * [[TACDocTypes.TACDocumentType]] derived from the file's path.
 *
 * The underlying file is closed as soon as its last line has been read. An iterator that is
 * abandoned before it is exhausted keeps the file open.
 *
 * @param tacDocFile the TAC file to read
 * @author John Sullivan
 */
class TacFileIterator(tacDocFile:File) extends Iterator[Document] {
  import TACDocTypes._

  // NOTE(review): these four markup patterns were reconstructed after an extraction step
  // stripped the angle-bracket tags from the literals (leaving empty markers and an id regex
  // without a capture group) - confirm against the upstream FACTORIE source.
  private val docEndString = """</doc>"""
  private val webDocStartString = """<DOC>"""
  private val docIdRegex = """(?i)<doc id="([^"]+)"[^>]*>""".r
  private val webDocIdRegex = """(?i)<DOCID> ([^ ]+) </DOCID>""".r

  /** we use scanner here so that when we recreate the lines by adding \n we don't change
    * the character count on documents that may use crlf to delimit lines
    */
  private val tacReader = {
    val fileStream = new FileInputStream(tacDocFile)
    val in: InputStream = if(tacDocFile.getName.endsWith(".gz")) {
      // don't leak the file handle when the gzip header turns out to be unreadable
      try new GZIPInputStream(fileStream) catch {
        case e: IOException => fileStream.close(); throw e
      }
    } else {
      fileStream
    }
    new Scanner(in).useDelimiter("\n")
  }

  private val docBuffer = new StringBuilder()
  // null marks the end of the input; hasNext relies on it
  private var line: String = null
  private var lineNum = 0
  private var sourceOpen = true

  /** Returns the next raw line, or null at the end of the input, closing the file at that point. */
  private def readLine(): String =
    if(sourceOpen && tacReader.hasNext) {
      tacReader.next()
    } else {
      if(sourceOpen) {
        sourceOpen = false
        // Scanner swallows read errors and merely reports "no more tokens"; surface them instead
        val readFailure = tacReader.ioException()
        tacReader.close()
        if(readFailure != null) throw readFailure
      }
      null
    }

  /** The line without the trailing carriage return that crlf-delimited files leave behind. */
  private def stripCr(s:String): String = s.stripSuffix("\r")

  // grouping together to avoid forgetting something
  @inline
  private def advanceLine(): Unit = {
    docBuffer append line
    docBuffer append "\n"
    line = readLine()
    lineNum += 1
  }

  //priming the pump - we don't call advanceLine because we don't want to add a null to the start of our doc
  line = readLine()
  lineNum += 1

  /**
   * Reads and returns the next document.
   *
   * @return the document starting at the current line
   * @throws NoSuchElementException if the file holds no further documents
   * @throws Exception if the current line is not a document start, the document id cannot be
   *                   found, or the file ends before the document is closed
   */
  def next(): Document = {
    if(!hasNext) throw new NoSuchElementException("No more documents in %s".format(tacDocFile.getName))

    // markers are compared without a trailing \r, the buffered text keeps the raw line
    val startLine = stripCr(line)
    val docIdMatchOpt = docIdRegex.unapplySeq(startLine).flatMap(_.headOption)
    val isWebDocStart = startLine.equalsIgnoreCase(webDocStartString)

    // We should be at the start of a new document here, otherwise we have a problem.
    assert(isWebDocStart || docIdMatchOpt.isDefined, "Found line: |%s| that was not a valid doc start at line %d in %s".format(line, lineNum, tacDocFile.getName))
    val docId = docIdMatchOpt.getOrElse {
      if(isWebDocStart) {
        // web documents carry their id on the line following the start tag
        advanceLine()
        Option(line).map(stripCr).flatMap(idLine => webDocIdRegex.unapplySeq(idLine)).flatMap(_.headOption).getOrElse(
          throw new Exception("Found line: |%s| that was not a valid web doc id at line %d in %s".format(line, lineNum, tacDocFile.getName)))
      } else { // only reachable when assertions are elided
        throw new Exception("Found line: |%s| that was not a valid doc start at line %d in %s".format(line, lineNum, tacDocFile.getName))
      }
    }

    while(line != null && !stripCr(line).equalsIgnoreCase(docEndString)) {
      advanceLine()
    }
    if(line == null) {
      throw new Exception("Reached the end of %s at line %d before the end of document %s".format(tacDocFile.getName, lineNum, docId))
    }
    // the loop exits when the doc end is found, but that is still part of the previous document so we need to consume it.
    advanceLine()
    val docString = docBuffer.toString()
    docBuffer.clear()
    val doc = new Document(docString).setName(docId)
    doc.attr += TACDocumentType.fromFilePath(tacDocFile)
    doc.annotators += classOf[TACDocumentType] -> this.getClass
    doc
  }

  /** True while a line that has not yet been turned into a document is buffered. */
  def hasNext: Boolean = line != null
}

object TacFileIterator {
  /**
   * Reads the first document of the TAC file named by `args(0)`, prints the document's name
   * and writes the document's text to a file whose path is that name.
   *
   * @param args command line arguments; the first one is the path of the TAC file
   * @throws IllegalArgumentException if no argument is given
   */
  def main(args:Array[String]): Unit = {
    require(args.nonEmpty, "usage: cc.factorie.app.nlp.load.TacFileIterator <tac-file>")
    val f = new File(args(0))

    val docs = new TacFileIterator(f)
    if(!docs.hasNext) {
      System.err.println("No documents found in %s".format(f.getPath))
    } else {
      val doc = docs.next()
      println(doc.name)
      val wrt = new BufferedWriter(new FileWriter(doc.name))
      // close the writer even when writing fails so the file handle is not leaked
      try {
        wrt.write(doc.string)
        wrt.flush()
      } finally {
        wrt.close()
      }
    }
  }
}