All Downloads are FREE. Search and download functionalities use the official Maven repository.

cc.factorie.app.nlp.load.LoadConll2008.scala Maven / Gradle / Ivy

Go to download

FACTORIE is a toolkit for deployable probabilistic modeling, implemented as a software library in Scala. It provides its users with a succinct language for creating relational factor graphs, estimating parameters and performing inference.

The newest version!
/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
   This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
   http://factorie.cs.umass.edu, http://github.com/factorie
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

package cc.factorie.app.nlp.load

import java.io.PrintWriter

import cc.factorie.app.nlp._
import cc.factorie.app.nlp.lemma.TokenLemma
import cc.factorie.app.nlp.parse.ParseTree
import cc.factorie.app.nlp.pos.{LabeledSpanishPosTag, PennPosTag, PosTag}
import cc.factorie.util.FastLogging

import scala.io.Source

/**
 * Loader for the CoNLL 2008 closed-track shared task data.
 * Each token line has the form: wordIndex word lemma POS parentIndex depLabel
 * Details on the format are available at http://barcelona.research.yahoo.net/dokuwiki/doku.php?id=conll2008:format
 *
 * @author Brian Martin
 */

/** CoNLL-2008 loader variant that attaches Spanish POS tags to tokens. */
object LoadSpanishConll2008 extends LoadConll2008(classOf[pos.SpanishPosTag]) {
  /** Builds a labeled Spanish POS-tag attribute for `token` from the POS column value. */
  override def makePosTag(token: Token, partOfSpeech: String): PosTag = {
    val tag = new LabeledSpanishPosTag(token, partOfSpeech)
    tag
  }
}

/** Default CoNLL-2008 loader, producing Penn Treebank POS tags. */
object LoadConll2008 extends LoadConll2008(classOf[pos.PennPosTag]) {
  /** Builds a Penn Treebank POS-tag attribute for `token` from the POS column value. */
  override def makePosTag(token: Token, partOfSpeech: String): PosTag = {
    val tag = new PennPosTag(token, partOfSpeech)
    tag
  }
}

/**
 * Base loader for CoNLL-2008 closed-track shared task data.
 *
 * Each non-blank input line is a tab-separated token record with at least 10
 * fields; columns used here are: 0 = 1-based token index, 1 = word form,
 * 2 = lemma, 3 = POS tag, 8 = 1-based head index, 9 = dependency label.
 * Blank (length < 2) lines mark sentence boundaries.
 *
 * @param posType the POS-tag attribute class registered on the produced
 *                document's annotator map (the concrete tag is created by
 *                `makePosTag`).
 */
abstract class LoadConll2008(val posType: Class[_]) extends Load with FastLogging {

  /** Factory for the POS-tag attribute attached to each token; implemented by
    * concrete loader objects for their specific tag set. */
  def makePosTag(token: Token, partOfSpeech: String): PosTag

  /** Attach a ParseTree built from (childIdx, parentIdx, depLabel) triples to `s`.
    * NOTE(review): currently unreachable — the dependency-collecting code in
    * `fromSource` is commented out below. */
  private def addDepInfo(s: Sentence, depInfoSeq: Seq[(Int, Int, String)]): Unit = {
    val tree = new ParseTree(s)
    for ((childIdx, parentIdx, depLabel) <- depInfoSeq) {
      tree.setParent(childIdx, parentIdx)
      tree.label(childIdx).setCategory(depLabel)(null)
    }
    s.attr += tree
  }

  /** When true (the default), the lemma column is stored as a TokenLemma attribute. */
  var loadLemma = true

  /** Load a single document from `filename`.
    * `fromSource` consumes the line iterator eagerly, so the file handle can be
    * (and now is) closed before returning — the previous version leaked it. */
  def fromFilename(filename: String): Seq[Document] = {
    val source = Source.fromFile(filename)
    try fromSource(source)
    finally source.close()
  }

  /** Parse all lines of `source` into one Document with Token/Sentence/POS
    * (and optionally lemma) annotations registered. The caller owns `source`
    * and is responsible for closing it. */
  def fromSource(source: Source): Seq[Document] = {
    val document: Document = new Document
    document.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass // register that we have token boundaries
    document.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass // register that we have sentence boundaries
    document.annotators(posType) = UnknownDocumentAnnotator.getClass // register that we have POS tags
    if (loadLemma) document.annotators(classOf[TokenLemma]) = UnknownDocumentAnnotator.getClass // register that we have lemma
    var sentence: Sentence = new Sentence(document)
//    var depInfoSeq = new collection.mutable.ArrayBuffer[(Int, Int, String)]
    for (line <- source.getLines()) {
      if (line.length < 2) {
        // Sentence boundary
        document.appendString("\n")
//        addDepInfo(sentence, depInfoSeq)
//        depInfoSeq = new collection.mutable.ArrayBuffer[(Int, Int, String)]
        sentence = null
      } else {
        if (sentence eq null)
          sentence = new Sentence(document) // avoids empty sentence at the end of doc
        val fields = line.split('\t')
        assert(fields.length >= 10)
        // currTokenIdx/parentIdx/depLabel feed the commented-out dependency code above.
        val currTokenIdx = fields(0).toInt - 1
        val word = fields(1)
        val lemma = fields(2)
        val partOfSpeech = fields(3)
        val parentIdx = fields(8).toInt - 1
        val depLabel = fields(9)
        document.appendString(" ")
        val token = new Token(sentence, word)
        token.attr += makePosTag(token, partOfSpeech)
        if (loadLemma)
          token.attr += new TokenLemma(token, lemma) // TODO Change this to some more specific TokenLemma subclass
//        depInfoSeq.append((currTokenIdx, parentIdx, depLabel))
      }
    }
//    if (sentence ne null)
//      addDepInfo(sentence, depInfoSeq)

    // Use the mixed-in FastLogging logger rather than println for the load summary.
    logger.info("Loaded 1 document with " + document.sentences.size + " sentences with " + document.asSection.length + " tokens total from file.")
    Seq(document)
  }

  /** Print each sentence's parse tree to stdout (requires ParseTree attributes). */
  def printDocument(d: Document) =
    for (s <- d.sentences)
      println(s.attr[ParseTree].toString() + "\n")

//  def main(args: Array[String]) =
//    for (filename <- args)
//      printDocument(fromFilename(filename).head)

}


object WriteConll2008 {

  /** Write `document` to `outputFile` in CoNLL-2008 tab-separated format.
    *
    * If `sourceFile` is given, the unknown columns of each token line are
    * copied from the corresponding line of that file and only the dependency
    * columns (8: 1-based head index, 9: dependency label) are overwritten;
    * otherwise unknown columns are written as underscores.
    *
    * Fixes over the previous version: the underlying Source (not just its
    * line iterator) is retained so it can be closed, and the PrintWriter is
    * closed in a finally block so it is not leaked on exception.
    *
    * NOTE(review): like the original, this assumes `document` has at least
    * one sentence and that `sourceFile` (if given) is line-aligned with it.
    */
  def toFile(outputFile: String, document: Document, sourceFile: String = null): Unit = {
    val sourceHandle: Option[Source] =
      if (sourceFile eq null) None else Some(Source.fromFile(sourceFile))
    val sourceLines = sourceHandle.map(_.getLines())
    val sentences = document.sentences.iterator
    val writer = new PrintWriter(outputFile)
    try {
      var sentence: Sentence = sentences.next()
      var currTokenIdx = 0
      var tree: ParseTree = sentence.parse
      var done = false
      while (!done) {
        if (currTokenIdx == sentence.length) {
          writer.println() // blank line terminates each sentence
          if (sentences.hasNext) {
            // Skip the matching sentence-boundary line in the source file, if any.
            sourceLines.foreach(_.next())
            sentence = sentences.next()
            tree = sentence.parse
            currTokenIdx = 0
          } else {
            done = true
          }
        } else {
          val field8 = "" + (tree.parentIndex(currTokenIdx) + 1)
          val field9 = {
            val category = tree.label(currTokenIdx).categoryValue
            if (category == "") "_" else category
          }
          val fields = sourceLines match {
            case None =>
              // No source file: emit underscores except the columns we know.
              val x = Array.fill[String](10)("_")
              x(0) = "" + (currTokenIdx + 1)
              x(1) = sentence.tokens(currTokenIdx).string
              x(3) = sentence.tokens(currTokenIdx).posTag.categoryValue
              x(8) = field8
              x(9) = field9
              x
            case Some(lines) =>
              // Copy the source line, replacing only the dependency columns.
              val x = lines.next().split("\t")
              x(8) = field8
              x(9) = field9
              x
          }
          currTokenIdx += 1
          writer.println(fields.mkString("\t"))
        }
      }
    } finally {
      writer.close()
      sourceHandle.foreach(_.close())
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy