All downloads are FREE. The search and download functions use the official Maven repository.

it.unipi.di.acube.batframework.datasetPlugins.MeijDataset Maven / Gradle / Ivy

The newest version!
/**
 * (C) Copyright 2012-2013 A-cube lab - Università di Pisa - Dipartimento di Informatica. 
 * BAT-Framework is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 * BAT-Framework is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with BAT-Framework.  If not, see <http://www.gnu.org/licenses/>.
 */

package it.unipi.di.acube.batframework.datasetPlugins;

import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.Rc2WDataset;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class MeijDataset implements Rc2WDataset{
	private List texts;
	private List> tags;
	private List> rankedTags;

	public MeijDataset(String tweetsFile, String tagsFile, String rankFile) throws FileNotFoundException, IOException {
		this(new FileInputStream(tweetsFile), new FileInputStream(tagsFile), new FileInputStream(rankFile));
	}

	public MeijDataset(InputStream tweetsIs, InputStream tagsIs, InputStream rankIs) throws IOException {
		Object2ObjectOpenHashMap docs = ReadTweetFile(tweetsIs);
		readTagFile(tagsIs, docs);
		loadRankedTags(rankIs, docs);

		this.texts = new Vector();
		
		this.tags = new Vector>();
		for (Map.Entry e: docs.entrySet()){
			texts.add(e.getValue().text);
			HashSet anns = new HashSet();
			tags.add(anns);
			for (int a: e.getValue().tags){
				anns.add(new Tag(a));
			}
		}
		
		this.rankedTags = new Vector>();
		for (Map.Entry e: docs.entrySet()){
			List rankedAnns = new Vector();
			rankedTags.add(rankedAnns);
			for (int a: e.getValue().ranked){
				rankedAnns.add(new Tag(a));
			}
		}
    }

	private static Object2ObjectOpenHashMap ReadTweetFile(InputStream inputStream) throws IOException{
		BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
		Object2ObjectOpenHashMap docs= new Object2ObjectOpenHashMap();

		String l;
		while((l = br.readLine())!=null){
			String[] seq= l.toString().split("\t");

			MeijDocument d= new MeijDocument();
			d.id=seq[0];
			//d.author=seq[1];
			d.text=CleanTweet(seq[4]);
			//d.text=seq[4];

			docs.put(d.id, d);

		}
		return docs;
	}

	private static void readTagFile(InputStream inputStream, Object2ObjectOpenHashMap docs)
	        throws NumberFormatException, IOException {
		BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
		String l;
		while ((l = br.readLine()) != null) {
			String[] seq = l.toString().split("\t");
			// long id=Long.parseLong(seq[0]);
			if (Integer.parseInt(seq[1]) >= 0)
				docs.get(seq[0]).tags.add(Integer.parseInt(seq[1]));
			// if(!seq[2].equals("-"))
			// docs.get(seq[0]).annotations.add(HTMLParser.html2Unicode(seq[2]));
		}
	}

	private static String CleanTweet(String original){
		Pattern PAT_DOC = Pattern.compile("http://|bit|yfrog|tinyurl|twitpic|justgiving|plixi");
		Matcher m = PAT_DOC.matcher(original);

		while(m.find()){
			int start=m.start(0);
			int end=start;

			while(end docs) throws NumberFormatException, IOException{
		BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
		String l;
		while ((l = br.readLine()) != null) {
			String[] seq= l.toString().split(" ");
			if(docs.containsKey(seq[0]))
				docs.get(seq[0]).ranked.add(new Integer(Integer.parseInt(seq[2])));
		}
	}
	
	@Override
	public int getSize() {
		return texts.size();
	}


	@Override
	public int getTagsCount() {
		int c=0;
		for (HashSet s: tags){
			c+=s.size();
		}
		return c;
	}

	@Override
	public List> getC2WGoldStandardList() {
		return tags;
	}


	@Override
	public List getTextInstanceList() {
		return texts;
	}


	private static class MeijDocument implements Serializable {

		private static final long serialVersionUID = 6977622102826151597L;
		//String author;
		String text;
		String id;
		HashSet tags;
		Vector ranked;
		public MeijDocument(){
			tags=new HashSet();
			ranked=new Vector();
		}
	}


	@Override
	public String getName() {
		return "Meij";
	}

	@Override
	public List> getRc2WGoldStandardList() {
		return rankedTags;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy