// it.unipi.di.acube.batframework.datasetPlugins.KddDataset Maven / Gradle / Ivy

/**
 * (C) Copyright 2012-2013 A-cube lab - Università di Pisa - Dipartimento di Informatica. 
 * BAT-Framework is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 * BAT-Framework is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with BAT-Framework.  If not, see <http://www.gnu.org/licenses/>.
 */

package it.unipi.di.acube.batframework.datasetPlugins;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.xml.sax.SAXException;

import it.unimi.dsi.lang.MutableString;
import it.unipi.di.acube.batframework.data.Annotation;
import it.unipi.di.acube.batframework.data.Mention;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.problems.A2WDataset;
import it.unipi.di.acube.batframework.utils.AnnotationException;
import it.unipi.di.acube.batframework.utils.ProblemReduction;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;

public class KddDataset implements A2WDataset{
	private List> tags = new Vector>();
	private List documents = new Vector();
	private Pattern nonePattern = Pattern.compile("^([^\t]*)\t([^\t]*)\tO\tB-.*NONE\tNONE$");
	private Pattern nonePattern2 = Pattern.compile("^([^\t]*)\tO\tO\tO\t.*NONE\tNONE$");
	private Pattern tagPattern = Pattern.compile("^([^\t]*)\t([^\t]*)\tO\tB-([^\t]*)\t([^\t]*)\t([^\t]*)(?:\t([^\t]*)\t([^\t]*))?$");
	private Pattern nonTagPattern = Pattern.compile("^([^\t]*)\t([^\t]*)\tO\tB-([^\t]*)");
	private Pattern skipPattern = Pattern.compile("^([^\t]*)\tO\tO\tI-([^\t]*)$");
	private Pattern endPattern = Pattern.compile("^\\.\tO\tO\t#$");
	private Pattern nonTagPattern2 = Pattern.compile("^([^\t]*)\tO\tO\tO$");


	public KddDataset (String[] files, WikipediaApiInterface api) throws IOException, AnnotationException, XPathExpressionException, ParserConfigurationException, SAXException{
		List> kddAnns = new Vector>();
		List titlesToPrefetch = new Vector();
		for (String file: files){
			BufferedReader r = new BufferedReader(new FileReader(file));
			String line;
			MutableString currentDoc = new MutableString();
			HashSet currentAnns = new HashSet();
			int currentPos = 0;
			while ((line = r.readLine()) != null){
				Matcher noneMatch = nonePattern.matcher(line);
				Matcher none2Match = nonePattern2.matcher(line);
				Matcher tagMatch = tagPattern.matcher(line);
				Matcher nonAnnMatch = nonTagPattern.matcher(line);
				Matcher skipMatch = skipPattern.matcher(line);
				Matcher endMatch = endPattern.matcher(line);
				Matcher nonAnn2Match = nonTagPattern2.matcher(line);

				if (endMatch.matches()){ // a new document
					if (currentDoc.length() >0){
						documents.add(currentDoc.trimRight());
						kddAnns.add(currentAnns);
						currentDoc = new MutableString();
						currentAnns = new HashSet();
					}
					currentPos = 0;
				}
				else if (noneMatch.matches()){ //tag with none concept
					currentDoc.append(noneMatch.group(2).replace('_', ' ')+" ");
					currentPos += noneMatch.group(2).length()+1;
				}
				else if (none2Match.matches()){ //tag with none concept 2
					currentDoc.append(none2Match.group(1)+" ");
					currentPos += none2Match.group(1).length()+1;
				}
				else if (tagMatch.matches()){ //tag with concept
					currentDoc.append(tagMatch.group(2).replace('_', ' ')+" ");
					currentAnns.add(new KddAnnotation(currentPos, tagMatch.group(2).length(), tagMatch.group(4)));				
					currentPos += tagMatch.group(2).length()+1;
					titlesToPrefetch.add(tagMatch.group(4));
				}
				else if (nonAnnMatch.matches()){ //tag with no concept
					currentDoc.append(nonAnnMatch.group(2).replace('_', ' ')+" ");
					currentPos += nonAnnMatch.group(2).length()+1;
				}
				else if (skipMatch.matches()){
					// a word part of continuing tag (has already been added)
				}
				else if (nonAnn2Match.matches()){ //tag with no concept 2
					currentDoc.append(nonAnn2Match.group(1)+" ");
					currentPos += nonAnn2Match.group(1).length()+1;
				}
				else{
					r.close();
					throw new AnnotationException("Dataset is malformed: string '"+line+ "' not recognized.");
				}
			}
			r.close();
		}
		
		/** Prefetch titles */
		api.prefetchTitles(titlesToPrefetch);

		/** Create annotation list */
		for (HashSet s : kddAnns){
			HashSet sA = new HashSet();
			tags.add(sA);
			for (KddAnnotation aA: s){
				int wid = api.getIdByTitle(aA.title);
				if (wid == -1)
					System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page "+aA.title);
				else
					sA.add(new Annotation(aA.position, aA.length, wid));

			}

		}



	}
	
	@Override
	public int getSize() {
		return tags.size();
	}

	@Override
	public int getTagsCount() {
		int count = 0;
		for (HashSet s : tags)
			count += s.size();
		return count;
	}

	@Override
	public List> getC2WGoldStandardList() {
		return ProblemReduction.A2WToC2WList(tags);
	}

	@Override
	public List> getD2WGoldStandardList() {
		return getA2WGoldStandardList();
	}

	@Override
	public List getTextInstanceList() {
		List stringDocuments = new Vector();
		for (MutableString s : documents){
			stringDocuments.add(s.toString());
		}
		return stringDocuments;
	}
	
	@Override
	public List> getMentionsInstanceList() {
		return ProblemReduction.A2WToD2WMentionsInstance(getA2WGoldStandardList());
	}

	@Override
	public String getName() {
		return "KDD";
	}

	@Override
	public List> getA2WGoldStandardList() {
		return tags;
	}

	private class KddAnnotation{
		public KddAnnotation(int pos, int len, String title) {
			this.length = len;
			this.position = pos;
			this.title = title;
		}
		public int length, position;
		public String title;
	}

}