![JAR search and dependency download from the Maven repository](/logo.png)
it.unipi.di.acube.batframework.datasetPlugins.NEEL2016Dataset Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bat-framework Show documentation
A framework to compare entity annotation systems.
The newest version!
package it.unipi.di.acube.batframework.datasetPlugins;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaInterface;
import it.unipi.di.acube.batframework.utils.WikipediaLocalInterface;
public class NEEL2016Dataset implements A2WDataset {
private static Pattern tweetsRE = Pattern.compile("^\\|(\\d+)\\|,\\|(.*)\\|$");
private List text;
private List> gold;
private String portion;
private static final Charset UTF_8 = Charset.forName("UTF-8");
public NEEL2016Dataset(InputStream annotationsIs, InputStream textIs, WikipediaInterface wikiApi, String portion)
throws IOException {
this.portion = portion;
HashMap idToBody = new HashMap<>();
{
LineIterator itText = IOUtils.lineIterator(textIs, "utf8");
try {
while (itText.hasNext()) {
String line = itText.nextLine();
Matcher m = tweetsRE.matcher(line);
if (!m.matches())
throw new IllegalArgumentException();
long docId = Long.parseLong(m.group(1));
String body = new String(m.group(2).getBytes(UTF_8), UTF_8);
idToBody.put(docId, body);
}
} finally {
LineIterator.closeQuietly(itText);
}
}
HashMap> idToAnnotations = new HashMap<>();
{
LineIterator itAnnotations = IOUtils.lineIterator(annotationsIs, "utf8");
try {
while (itAnnotations.hasNext()) {
String[] fields = itAnnotations.nextLine().split("\t");
if (fields.length != 6)
throw new IllegalArgumentException();
long docId = Long.parseLong(fields[0]);
int start = Integer.parseInt(fields[1]);
int end = Integer.parseInt(fields[2]);
String entity = fields[3];
if (entity.startsWith("NIL"))
continue;
int wid = wikiApi.dereference(wikiApi.getIdByTitle(WikipediaLocalInterface.dbPediaUrlToTitle(entity)));
if (!idToAnnotations.containsKey(docId))
idToAnnotations.put(docId, new HashSet());
idToAnnotations.get(docId).add(new Annotation(start, end - start, wid));
}
} finally {
LineIterator.closeQuietly(itAnnotations);
}
}
List docIds = new Vector<>(idToBody.keySet());
Collections.sort(docIds);
text = new Vector<>();
for (long docId : docIds)
text.add(idToBody.get(docId));
gold = new Vector<>();
for (long docId : docIds)
if (idToAnnotations.containsKey(docId))
gold.add(idToAnnotations.get(docId));
else
gold.add(new HashSet());
}
@Override
public int getSize() {
return text.size();
}
@Override
public int getTagsCount() {
int count = 0;
for (HashSet s : gold)
count += s.size();
return count;
}
@Override
public List> getC2WGoldStandardList() {
return ProblemReduction.A2WToC2WList(this.getA2WGoldStandardList());
}
@Override
public List> getD2WGoldStandardList() {
return getA2WGoldStandardList();
}
@Override
public List getTextInstanceList() {
return text;
}
@Override
public List> getMentionsInstanceList() {
return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
}
@Override
public String getName() {
return "#Microposts2016 NEEL " + portion;
}
@Override
public List> getA2WGoldStandardList() {
return gold;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy