![JAR search and dependency download from the Maven repository](/logo.png)
it.unipi.di.acube.batframework.datasetPlugins.ConllAidaDataset Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bat-framework Show documentation
Show all versions of bat-framework Show documentation
A framework to compare entity annotation systems.
The newest version!
/**
* (C) Copyright 2012-2013 A-cube lab - Università di Pisa - Dipartimento di Informatica.
* BAT-Framework is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
* BAT-Framework is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with BAT-Framework. If not, see <http://www.gnu.org/licenses/>.
*/
package it.unipi.di.acube.batframework.datasetPlugins;
import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import org.xml.sax.SAXException;
import it.unimi.dsi.lang.MutableString;
import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaInterface;
public class ConllAidaDataset implements A2WDataset{
private List> annotations = new Vector>();
private List documents = new Vector();
private Pattern wikiUrlPattern = Pattern.compile("http://en.wikipedia.org/wiki/(.*)");
private Pattern mentionPattern = Pattern.compile("^(.*?)\t([BI]?)\t(.*?)\t(.*?)\t(.*?)(?:\t(.*))?$");
private Pattern nmePattern = Pattern.compile("^(.*)\t([BI])\t(.*)\t(.*)--NME--$");
private Pattern punctuationPattern = Pattern.compile("^\\W.*$");
public ConllAidaDataset (String file, WikipediaInterface api) throws IOException, AnnotationException, XPathExpressionException, ParserConfigurationException, SAXException{
List> aidaAnns = new Vector>();
List titlesToPrefetch = new Vector();
BufferedReader r = new BufferedReader( new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));
String line;
MutableString currentDoc = null;
HashSet currentAnns = null;
int currentPos = -1, currentLen = 0;
String currentTitle = null;
while ((line = r.readLine()) != null){
Matcher m = mentionPattern.matcher(line);
Matcher mneMatch = nmePattern.matcher(line);
MutableString append = new MutableString();
if ((!m.matches() || m.matches() && m.group(2).equals("B")) && currentPos != -1){ //if any, store the last tag
currentAnns.add(new AidaAnnotation(currentPos, currentLen, currentTitle));
currentPos = -1;
currentLen = 0;
currentTitle = null;
}
if (line.startsWith("-DOCSTART-")){ // a new document
currentDoc = new MutableString();
documents.add(currentDoc);
currentAnns = new HashSet();
aidaAnns.add(currentAnns);
}
else if (line.equals("")){ // the end of a sentence
append.replace("\n");
}
else if (!m.matches() && !mneMatch.matches()){ // a word not part of a mention
append.replace(line + " ");
}
else if (mneMatch.matches()){ // a word part of a non-recognized mention
append.replace(mneMatch.group(1) + " ");
}
else{ // a word with a recognized mention.
if (m.group(2).equals("B")){
Matcher m2 = wikiUrlPattern.matcher(m.group(5));
if (m2.matches()){
currentTitle = m2.group(1);
currentPos = currentDoc.length();
currentLen = m.group(1).length();
titlesToPrefetch.add(currentTitle);
}
else{
r.close();
throw new AnnotationException("Dataset is malformed: string "+m.group(5)+ " should be a wikipedia URL. Line=["+line+"]");
}
}
else {
if (!m.group(2).equals("B") && !m.group(2).equals("I")){
r.close();
throw new AnnotationException("Dataset is malformed: all mention should be marked as B or I. Bad mention: "+line);
}// found mention is a continuation
currentLen += m.group(1).length()+1;
}
append.replace(m.group(1) + " ");
}
//* Should the last whitespace be removed? */
Matcher punctuationMatch = punctuationPattern.matcher(append);
if (punctuationMatch.matches())
currentDoc.trimRight();
currentDoc.append(append);
}
r.close();
/** Prefetch titles */
api.prefetchTitles(titlesToPrefetch);
/** Create annotation list */
for (HashSet s : aidaAnns){
HashSet sA = new HashSet();
for (AidaAnnotation aA: s){
int wid = api.getIdByTitle(aA.title);
if (wid == -1)
System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page "+aA.title);
else
sA.add(new Annotation(aA.position, aA.length, wid));
}
HashSet sANonOverlapping = Annotation.deleteOverlappingAnnotations(sA);
annotations.add(sANonOverlapping);
}
}
@Override
public int getSize() {
return annotations.size();
}
@Override
public int getTagsCount() {
int count = 0;
for (HashSet s : annotations)
count += s.size();
return count;
}
@Override
public List> getC2WGoldStandardList() {
return ProblemReduction.A2WToC2WList(annotations);
}
@Override
public List> getA2WGoldStandardList() {
return annotations;
}
@Override
public List> getD2WGoldStandardList() {
return getA2WGoldStandardList();
}
@Override
public List getTextInstanceList() {
List stringDocuments = new Vector();
for (MutableString s : documents){
stringDocuments.add(s.toString());
}
return stringDocuments;
}
@Override
public List> getMentionsInstanceList() {
return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
}
@Override
public String getName() {
return "AIDA/CO-NLL";
}
private class AidaAnnotation{
public AidaAnnotation(int pos, int len, String title) {
this.length = len;
this.position = pos;
this.title = title;
}
public int length, position;
public String title;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy