![JAR search and dependency download from the Maven repository](/logo.png)
it.unipi.di.acube.batframework.datasetPlugins.CsvDataset Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bat-framework Show documentation
Show all versions of bat-framework Show documentation
A framework to compare entity annotation systems.
The newest version!
package it.unipi.di.acube.batframework.datasetPlugins;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
public class CsvDataset implements A2WDataset {
String name;
List docIds;
List> gold;
public CsvDataset(String filename, String name)
throws NumberFormatException, AnnotationException, IOException {
this.name = name;
BufferedReader br = new BufferedReader(new InputStreamReader(
new FileInputStream(filename)));
String line = null;
HashMap> dsHm = new HashMap<>();
while ((line = br.readLine()) != null) {
String[] tokens = line.split(",");
if (tokens.length != 4){
br.close();
throw new RuntimeException(String.format(
"Line in file %s malformed: [%s]", filename, line));
}
String docId = tokens[0];
int start = Integer.parseInt(tokens[1]);
int end = Integer.parseInt(tokens[2]);
int wikiId = Integer.parseInt(tokens[3]);
if (start < 0 || end < 0 || wikiId < 0){
br.close();
throw new RuntimeException(
"start, end and wikipediaId must be greater that zero.");
}
if (!dsHm.containsKey(docId))
dsHm.put(docId, new HashSet());
dsHm.get(docId).add(
new Annotation(start, end - start, wikiId));
}
br.close();
docIds = new Vector<>(dsHm.keySet());
Collections.sort(docIds);
gold = new Vector<>();
for (String docId : docIds)
gold.add(dsHm.get(docId));
}
@Override
public int getSize() {
return docIds.size();
}
@Override
public int getTagsCount() {
int count = 0;
for (HashSet s: gold)
count += s.size();
return count;
}
@Override
public List> getC2WGoldStandardList() {
return ProblemReduction.A2WToC2WList(this.getA2WGoldStandardList());
}
@Override
public List> getD2WGoldStandardList() {
return getA2WGoldStandardList();
}
@Override
public List getTextInstanceList() {
return this.docIds;
}
@Override
public List> getMentionsInstanceList() {
return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
}
@Override
public String getName() {
return name;
}
@Override
public List> getA2WGoldStandardList() {
return gold;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy