it.unipi.di.acube.batframework.datasetPlugins.MeijDataset Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of bat-framework Show documentation
Show all versions of bat-framework Show documentation
A framework to compare entity annotation systems.
The newest version!
/**
* (C) Copyright 2012-2013 A-cube lab - Università di Pisa - Dipartimento di Informatica.
* BAT-Framework is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
* BAT-Framework is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with BAT-Framework. If not, see <http://www.gnu.org/licenses/>.
*/
package it.unipi.di.acube.batframework.datasetPlugins;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.Rc2WDataset;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MeijDataset implements Rc2WDataset{
private List texts;
private List> tags;
private List> rankedTags;
public MeijDataset(String tweetsFile, String tagsFile, String rankFile) throws FileNotFoundException, IOException {
this(new FileInputStream(tweetsFile), new FileInputStream(tagsFile), new FileInputStream(rankFile));
}
public MeijDataset(InputStream tweetsIs, InputStream tagsIs, InputStream rankIs) throws IOException {
Object2ObjectOpenHashMap docs = ReadTweetFile(tweetsIs);
readTagFile(tagsIs, docs);
loadRankedTags(rankIs, docs);
this.texts = new Vector();
this.tags = new Vector>();
for (Map.Entry e: docs.entrySet()){
texts.add(e.getValue().text);
HashSet anns = new HashSet();
tags.add(anns);
for (int a: e.getValue().tags){
anns.add(new Tag(a));
}
}
this.rankedTags = new Vector>();
for (Map.Entry e: docs.entrySet()){
List rankedAnns = new Vector();
rankedTags.add(rankedAnns);
for (int a: e.getValue().ranked){
rankedAnns.add(new Tag(a));
}
}
}
/**
 * Reads the tweets file: one tweet per line, fields separated by tabs.
 * Field 0 is used as the document id and field 4 as the tweet text, which
 * is passed through {@code CleanTweet} before being stored.
 *
 * @param inputStream stream to read from; it is read to the end but not closed
 * @return the parsed documents keyed by id (a later line with the same id replaces an earlier one)
 * @throws IOException if reading fails or a non-empty line has fewer than five fields
 */
private static Object2ObjectOpenHashMap<String, MeijDocument> ReadTweetFile(InputStream inputStream) throws IOException{
	// NOTE(review): the reader uses the platform default charset; confirm the
	// dataset's encoding and pass it explicitly if results must be reproducible.
	BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
	Object2ObjectOpenHashMap<String, MeijDocument> docs = new Object2ObjectOpenHashMap<String, MeijDocument>();
	String l;
	int lineNo = 0;
	while ((l = br.readLine()) != null) {
		lineNo++;
		if (l.isEmpty()) {
			// e.g. a trailing blank line at the end of the file
			continue;
		}
		String[] seq = l.split("\t");
		if (seq.length < 5) {
			throw new IOException("Malformed tweet line " + lineNo
					+ ": expected at least 5 tab-separated fields, found " + seq.length);
		}
		MeijDocument d = new MeijDocument();
		d.id = seq[0];
		d.text = CleanTweet(seq[4]);
		docs.put(d.id, d);
	}
	return docs;
}
/**
 * Reads the tags file: one tab-separated line per (document id, tag) pair.
 * Field 0 is the document id and field 1 an integer tag. Non-negative tags
 * are added to the tag set of the matching document; negative tags are
 * ignored, and so are tags for ids that are not present in {@code docs}.
 *
 * @param inputStream stream to read from; it is read to the end but not closed
 * @param docs        documents keyed by id; their tag sets are updated in place
 * @throws NumberFormatException if field 1 of a line is not an integer
 * @throws IOException           if reading fails or a non-empty line has fewer than two fields
 */
private static void readTagFile(InputStream inputStream, Object2ObjectOpenHashMap<String, MeijDocument> docs)
		throws NumberFormatException, IOException {
	BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
	String l;
	int lineNo = 0;
	while ((l = br.readLine()) != null) {
		lineNo++;
		if (l.isEmpty()) {
			continue;
		}
		String[] seq = l.split("\t");
		if (seq.length < 2) {
			throw new IOException("Malformed tag line " + lineNo
					+ ": expected at least 2 tab-separated fields, found " + seq.length);
		}
		int tag = Integer.parseInt(seq[1]);
		if (tag < 0) {
			continue;
		}
		// Unknown ids are skipped rather than dereferenced, matching the
		// containsKey guard used when the ranking file is loaded.
		MeijDocument doc = docs.get(seq[0]);
		if (doc != null) {
			doc.tags.add(tag);
		}
	}
}
/**
 * Searches a tweet for link-like tokens: the literal "http://" or one of the
 * names bit, yfrog, tinyurl, twitpic, justgiving, plixi.
 * NOTE(review): presumably each match is cut out and the cleaned text is
 * returned (ReadTweetFile stores the result as the document text), but the
 * rest of this method's body is missing from this copy — confirm against
 * the upstream source.
 */
private static String CleanTweet(String original){
Pattern PAT_DOC = Pattern.compile("http://|bit|yfrog|tinyurl|twitpic|justgiving|plixi");
Matcher m = PAT_DOC.matcher(original);
while(m.find()){
// Start offset of the current match; "end" begins at the same offset.
int start=m.start(0);
int end=start;
// NOTE(review): the next line is damaged. It jumps from the opening of an
// inner "while(end ..." condition straight to "docs) throws ...". The text
// lost in between presumably held the remainder of CleanTweet and the
// declaration of loadRankedTags(inputStream, docs), which the constructor
// calls. Restore it from the upstream source rather than guessing.
while(end docs) throws NumberFormatException, IOException{
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
String l;
// One ranking entry per line, fields separated by a single space.
while ((l = br.readLine()) != null) {
String[] seq= l.toString().split(" ");
// Field 0 is looked up as a document id; lines for ids not in docs are ignored.
if(docs.containsKey(seq[0]))
// Field 2 is parsed as an int and appended to that document's ranked list,
// so entries keep the order in which they appear in the input.
docs.get(seq[0]).ranked.add(new Integer(Integer.parseInt(seq[2])));
}
}
/**
 * Reports how many documents the dataset holds.
 *
 * @return the number of entries in the text list
 */
@Override
public int getSize() {
	final int documentCount = this.texts.size();
	return documentCount;
}
/**
 * Counts the tags of the whole dataset.
 *
 * @return the sum of the sizes of all per-document tag sets
 */
@Override
public int getTagsCount() {
	int total = 0;
	for (HashSet<Tag> docTags : tags) {
		total += docTags.size();
	}
	return total;
}
@Override
public List> getC2WGoldStandardList() {
return tags;
}
/**
 * Returns the document texts.
 *
 * @return the internal list itself (not a copy), one text per document
 */
@Override
public List<String> getTextInstanceList() {
	return texts;
}
/**
 * Mutable holder for one document while the input files are being parsed:
 * its id, its text, its set of integer tags and its ordered list of ranked
 * integer tags.
 */
private static class MeijDocument implements Serializable {
	private static final long serialVersionUID = 6977622102826151597L;
	/** Document text. */
	String text;
	/** Document id, used as the key of the document map. */
	String id;
	/** Integer tags of the document; duplicates collapse. */
	final HashSet<Integer> tags;
	/** Ranked integer tags, in insertion order. */
	final Vector<Integer> ranked;

	/** Creates a document with no id, no text and empty tag collections. */
	public MeijDocument(){
		tags = new HashSet<Integer>();
		ranked = new Vector<Integer>();
	}
}
/**
 * Gives the fixed name of this dataset.
 *
 * @return the constant string "Meij"
 */
@Override
public String getName() {
	final String datasetName = "Meij";
	return datasetName;
}
@Override
public List> getRc2WGoldStandardList() {
return rankedTags;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy