Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/* Copyright (C) 2008-2016 University of Massachusetts Amherst.
This file is part of "FACTORIE" (Factor graphs, Imperative, Extensible)
http://factorie.cs.umass.edu, http://github.com/factorie
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
package cc.factorie.app.nlp.load
import cc.factorie.app.nlp.pos.PennPosTag
import cc.factorie.app.nlp.{Document, Sentence, Token, UnknownDocumentAnnotator}
import cc.factorie.variable._
import scala.io.Source
/**
* @author John Sullivan
*
* Loads shallow parsing/chunking data from Conll 2000 shared task
* All sentences are loaded into a single Document; an empty line separates sentences
*
* 1 token type
* 2 gold POS Tag
* 3 gold chunk (BIO notation default)
*/
object LoadConll2000 extends Load {
  /**
   * Loads chunking data using the default "BIO" encoding, which is the standard
   * encoding of the conll2000 training data.
   *
   * @param source the source whose lines are read; it is not closed by this method
   * @return a one-element sequence containing the loaded Document
   */
  def fromSource(source: Source): Seq[Document] = fromSource(source, "BIO")

  /**
   * Loads chunking data into a single Document, attaching a POS tag and a chunk tag
   * to every token.
   *
   * @param source   the source whose lines are read; it is not closed by this method
   * @param encoding "BILOU", "BIO" or "NESTED"; any other value falls back to BIO tags
   * @return a one-element sequence containing the loaded Document
   * @throws Exception if a line is neither blank nor three space-separated fields
   */
  def fromSource(source: Source, encoding: String): Seq[Document] = {
    // The tag class recorded in the annotator map must be the class that is actually
    // attached to the tokens for the requested encoding.
    val chunkTagClass: Class[_ <: ChunkTag] = encoding match {
      case "BILOU" => classOf[BILOUChunkTag]
      case "NESTED" => classOf[BILOUNestedChunkTag]
      case _ => classOf[BIOChunkTag]
    }
    //Enable multiple input encodings
    val newChunkLabel: (Token, String) => ChunkTag = encoding match {
      case "BILOU" => (t: Token, s: String) => new BILOUChunkTag(t, s)
      case "NESTED" => (t: Token, s: String) => new BILOUNestedChunkTag(t, s)
      case _ => (t: Token, s: String) => new BIOChunkTag(t, s)
    }

    val doc = new Document()
    doc.annotators(classOf[Token]) = UnknownDocumentAnnotator.getClass
    doc.annotators(classOf[Sentence]) = UnknownDocumentAnnotator.getClass
    doc.annotators(classOf[PennPosTag]) = UnknownDocumentAnnotator.getClass
    doc.annotators(chunkTagClass) = UnknownDocumentAnnotator.getClass

    // The open sentence is created lazily on its first token, so leading, trailing or
    // repeated blank lines never leave an empty Sentence in the document.
    var current: Option[Sentence] = None
    for (line <- source.getLines()) {
      current = processWordLine(doc, current, line, newChunkLabel)
    }
    Seq(doc)
  }

  /** Matches one data line: token, POS tag and chunk tag separated by single spaces. */
  val lineSplit = """([^\s]+) ([^\s]+) ([^\s]+)""".r

  /** POS tag strings that are rewritten before a PennPosTag is created. */
  val posTranslations = Map("(" -> "-LRB-", ")" -> "-RRB-")

  /**
   * Consumes one input line.
   *
   * @param doc           the document that receives newly created sentences
   * @param current       the sentence currently being filled, or None if the next token starts a new one
   * @param line          the raw input line
   * @param newChunkLabel factory for the chunk tag attached to each token
   * @return the sentence that the following token line should extend, or None after a blank line
   */
  private def processWordLine(doc: Document, current: Option[Sentence], line: String, newChunkLabel: (Token, String) => ChunkTag): Option[Sentence] = line match {
    case lineSplit(tokenType, posTagString, chunkTagString) =>
      val sent = current.getOrElse(new Sentence(doc))
      val t = new Token(sent, tokenType + " ")
      t.attr += new PennPosTag(t, posTranslations.getOrElse(posTagString, posTagString))
      t.attr += newChunkLabel(t, chunkTagString)
      Some(sent)
    // Whitespace-only lines are treated as sentence separators, like empty lines.
    case blank if blank.trim.isEmpty => None
    case otw => throw new Exception("Expected either a line with token pos tag chunk tag, or an empty line, received: %s".format(otw))
  }

  /**
   * Adds a BILOUChunkTag to every token of the given sentences, derived from the
   * BIOChunkTag already present on the token and on its neighbours within the sentence.
   *
   * @param sentences sentences whose tokens carry a BIOChunkTag
   */
  def convertBIOtoBILOU(sentences: Seq[Sentence]): Unit = {
    for (sentence <- sentences; token <- sentence.tokens) {
      val prev: Token = if (token.sentenceHasPrev) token.sentencePrev else null
      val next: Token = if (token.sentenceHasNext) token.sentenceNext else null
      token.attr += new BILOUChunkTag(token, BIOtoBILOU(prev, token, next))
    }
  }

  /**
   * Computes the BILOU label for `token` from its BIO label and the BIO label of `next`.
   *
   * @param prev  the preceding token or null; kept for interface compatibility, not consulted
   * @param token the token to relabel; must carry a BIOChunkTag
   * @param next  the following token in the sentence, or null if there is none
   * @return "O", or the chunk type prefixed with "B-", "I-", "L-" or "U-"
   */
  def BIOtoBILOU(prev: Token, token: Token, next: Token): String = {
    val label = token.attr[BIOChunkTag].categoryValue
    if (label == "O") "O"
    else {
      val chunkType = label.split("-")(1)
      // The chunk stops at this token when nothing follows, when the next token has a
      // different chunk type (including "O"), or when the next token begins a new chunk.
      val endsChunk = next == null || {
        val ns = splitLabel(next)
        ns(1) != chunkType || ns(0) == "B"
      }
      if (label.contains("B-")) {
        if (endsChunk) "U-" + chunkType else label
      }
      else if (endsChunk) "L-" + chunkType
      else "I-" + chunkType
    }
  }

  /**
   * Splits a token's BIO label on "-"; a label without "-" yields Array("", "O").
   */
  private def splitLabel(token: Token): Array[String] = {
    val label = token.attr[BIOChunkTag].categoryValue
    if (label.contains("-")) label.split("-") else Array("", "O")
  }
}
//Standard conll2000 Chunk Tags
object BIOChunkDomain extends CategoricalDomain[String] {
  // Entries are added in this order: every "B-" tag, then every "I-" tag, then "O".
  this ++= {
    val phraseTypes = Vector("ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP", "PP", "PRT", "SBAR", "UCP", "VP")
    val prefixed = for (prefix <- Vector("B", "I"); phrase <- phraseTypes) yield prefix + "-" + phrase
    prefixed :+ "O"
  }
  freeze()
}
object BILOUChunkDomain extends CategoricalDomain[String] {
  // Starts from the BIO categories, then appends the "L-" and "U-" tags.
  this ++= BIOChunkDomain.categories
  this ++= {
    // The "L-" entries list ADVP ahead of ADJP; the "U-" entries list ADJP first.
    val lastTags = Vector("ADVP", "ADJP", "CONJP", "INTJ", "LST", "NP", "PP", "PRT", "SBAR", "UCP", "VP").map("L-" + _)
    val unitTags = Vector("ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP", "PP", "PRT", "SBAR", "UCP", "VP").map("U-" + _)
    lastTags ++ unitTags
  }
  freeze()
}
//For Noun Phrase Chunk Tagging
//Requires custom training data tagged in this notation
object BILOUNestedChunkDomain extends CategoricalDomain[String] {
  // Every "first:second" pairing of the five tags below, with the first tag varying slowest.
  this ++= {
    val tags = Vector("B-NP", "I-NP", "L-NP", "U-NP", "O")
    for (first <- tags; second <- tags) yield first + ":" + second
  }
  freeze()
}
//This could be combined into a single LabeledCategoricalVariable with a settable domain
/** Common parent of the chunk tag variables; keeps the owning token and forwards `tagValue` to LabeledCategoricalVariable. */
abstract class ChunkTag(val token: Token, tagValue: String)
  extends LabeledCategoricalVariable[String](tagValue)
/** Chunk tag whose domain is BIOChunkDomain. */
class BIOChunkTag(token: Token, tagValue: String) extends ChunkTag(token, tagValue) {
  def domain: BIOChunkDomain.type = BIOChunkDomain
}
/** Chunk tag whose domain is BILOUChunkDomain. */
class BILOUChunkTag(token: Token, tagValue: String) extends ChunkTag(token, tagValue) {
  def domain: BILOUChunkDomain.type = BILOUChunkDomain
}
/** Chunk tag whose domain is BILOUNestedChunkDomain. */
class BILOUNestedChunkTag(token: Token, tagValue: String) extends ChunkTag(token, tagValue) {
  def domain: BILOUNestedChunkDomain.type = BILOUNestedChunkDomain
}