All downloads are free. The search and download features use the official Maven repository.

org.fbk.cit.hlt.thewikimachine.util.RDFParser Maven / Gradle / Ivy

package org.fbk.cit.hlt.thewikimachine.util;

import org.apache.commons.lang.StringEscapeUtils;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-by-line reader for RDF statement files in which every statement sits on
 * one line as {@code <subject> <predicate> object [<graph>] .} (N-Triples style).
 * <p>
 * Lines that are blank or start with {@code #} are skipped. Each remaining line
 * is matched against {@link #rdfLine}; {@link #rawNext3(boolean)} exposes the raw
 * regex groups, while {@link #next3(boolean)} turns them into a key/value map.
 * <p>
 * The parser owns the underlying reader; call {@link #close()} when done.
 * Instances keep a read position and are not designed for concurrent use.
 *
 * User: aprosio
 * Date: 1/30/13
 */
public class RDFParser implements Closeable {

	/** Reader over the input file; released by {@link #close()}. */
	BufferedReader support;

	/**
	 * Pattern for one statement line. Capturing groups:
	 * <ol>
	 * <li>subject IRI</li>
	 * <li>predicate IRI</li>
	 * <li>the whole object term</li>
	 * <li>object IRI (only when the object is a resource)</li>
	 * <li>literal text between the outer quotes (only for literals)</li>
	 * <li>literal suffix: {@code @lang} or {@code ^^<datatype>}</li>
	 * <li>datatype IRI (only for typed literals)</li>
	 * <li>optional graph term including its leading whitespace</li>
	 * <li>optional graph IRI</li>
	 * </ol>
	 * Language tags may contain letters of either case, digits and hyphens
	 * (e.g. {@code @pt-BR}); the line must end with a literal dot.
	 */
	static final Pattern rdfLine = Pattern.compile("^<([^>]+)>\\s+<([^>]+)>\\s+(<([^>]+)>|\"(.*)\"(@[a-zA-Z0-9-]+|\\^\\^<([^>]+)>))(\\s+<([^>]+)>)?\\s*\\.$");

	/** Matches a backslash, the letter u and exactly four hexadecimal digits. */
	static final Pattern unicodePatt = Pattern.compile("\\\\u([0-9a-fA-F]{4})");

	/**
	 * Opens the file with the given path for parsing.
	 *
	 * @param fileName path of the file to read
	 * @throws IOException if the file does not exist or cannot be opened
	 */
	public RDFParser(String fileName) throws IOException {
		this(new File(fileName));
	}

	/**
	 * Opens the given file for parsing.
	 *
	 * @param file the file to read
	 * @throws IOException if the file does not exist or cannot be opened
	 */
	public RDFParser(File file) throws IOException {
		if (!file.exists()) {
			throw new IOException("File not found: " + file);
		}
		support = new BufferedReader(new FileReader(file));
	}

	/**
	 * Replaces every four-hex-digit unicode escape (backslash, {@code u}, four
	 * hexadecimal digits) in the input with the character it denotes.
	 *
	 * @param myString text possibly containing such escapes
	 * @return the text with all escapes replaced by their characters
	 */
	static public String decodeUnicode(String myString) {
		Matcher m = unicodePatt.matcher(myString);
		StringBuffer sb = new StringBuffer();
		while (m.find()) {
			char decoded = (char) Integer.parseInt(m.group(1), 16);
			// '$' and '\' are special in appendReplacement's replacement string;
			// quote the decoded character so U+0024 and U+005C are inserted literally
			// instead of raising IllegalArgumentException.
			m.appendReplacement(sb, Matcher.quoteReplacement(String.valueOf(decoded)));
		}
		m.appendTail(sb);

		return sb.toString();
	}

	/**
	 * Reads the next statement line and returns the regex groups of
	 * {@link #rdfLine} for it (index 0 is the whole match, see the pattern's
	 * documentation for the others; groups that did not participate are
	 * {@code null}).
	 *
	 * @param decodeUnicode whether to run {@link #decodeUnicode(String)} on the
	 *                      line before matching
	 * @return the captured groups, or {@code null} at end of input
	 * @throws IOException if reading fails, or with message {@code "ERR: " + line}
	 *                     if the line does not match the pattern
	 */
	public String[] rawNext3(boolean decodeUnicode) throws IOException {
		String line = readStatementLine(decodeUnicode);
		if (line == null) {
			return null;
		}

		String[] tempRow = splitStatement(line);
		if (tempRow == null) {
			throw new IOException("ERR: " + line);
		}
		return tempRow;
	}

	/**
	 * Same as {@link #next3(boolean)} without unicode-escape decoding of the line.
	 *
	 * @return the parsed statement, or {@code null} at end of input
	 * @throws IOException if reading from the file fails
	 */
	public HashMap next3() throws IOException {
		return next3(false);
	}

	/**
	 * Returns the next well-formed statement as a map with String keys.
	 * Lines that do not match {@link #rdfLine} are reported on standard output
	 * as {@code "ERR: " + line} and skipped.
	 * <p>
	 * Keys always present: {@code domain} (subject), {@code relation}
	 * (predicate), {@code simpleDomain}, {@code simpleRelation}, {@code type},
	 * {@code range}. For resource objects {@code type} is {@code "resource"} and
	 * {@code simpleRange} is added; for language-tagged literals {@code type} is
	 * {@code "string"} and {@code lang} holds the tag without the {@code @}; for
	 * typed literals {@code type} is derived from the datatype IRI. The
	 * {@code simple*} values and the typed-literal {@code type} come from
	 * {@code DBpediaOntology} helpers whose exact output is not visible here.
	 *
	 * @param decodeUnicode whether to decode unicode escapes in the line before
	 *                      matching
	 * @return the parsed statement, or {@code null} at end of input
	 * @throws IOException if reading from the file fails
	 */
	public HashMap next3(boolean decodeUnicode) throws IOException {
		String[] tempRow;
		while (true) {
			// Real read failures propagate from here; only malformed lines are
			// skipped, so a broken stream can no longer cause an endless loop.
			String line = readStatementLine(decodeUnicode);
			if (line == null) {
				return null;
			}
			tempRow = splitStatement(line);
			if (tempRow != null) {
				break;
			}
			System.out.println("ERR: " + line);
		}

		HashMap<String, Object> ret = new HashMap<String, Object>();
		ret.put("domain", tempRow[1]);
		ret.put("relation", tempRow[2]);
		ret.put("simpleDomain", DBpediaOntology.cleanGenericName(ret.get("domain")));
		ret.put("simpleRelation", DBpediaOntology.cleanGenericName(ret.get("relation")));
		if (tempRow[6] == null) {
			// No literal suffix matched, so the object is a resource IRI.
			ret.put("type", "resource");
			ret.put("range", StringEscapeUtils.unescapeJava(tempRow[4]));
			ret.put("simpleRange", DBpediaOntology.cleanGenericName(ret.get("range")));
		}
		else {
			ret.put("range", StringEscapeUtils.unescapeJava(tempRow[5]));
			if (tempRow[7] == null) {
				// Suffix without a datatype IRI is a language tag: drop the leading '@'.
				ret.put("type", "string");
				ret.put("lang", tempRow[6].substring(1));
			}
			else {
				ret.put("type", DBpediaOntology.cleanName(tempRow[7], "http://www.w3.org/2001/XMLSchema#"));
			}
		}
		return ret;
	}

	/**
	 * Closes the underlying reader. Further read attempts will fail with an
	 * {@link IOException}.
	 *
	 * @throws IOException if closing the reader fails
	 */
	@Override
	public void close() throws IOException {
		support.close();
	}

	/**
	 * Returns the next trimmed line that is neither blank nor a {@code #}
	 * comment, optionally unicode-decoded, or {@code null} at end of input.
	 */
	private String readStatementLine(boolean decodeUnicode) throws IOException {
		String line;
		do {
			line = support.readLine();
			if (line == null) {
				return null;
			}
			line = line.trim();
		} while (line.length() == 0 || line.startsWith("#"));

		return decodeUnicode ? decodeUnicode(line) : line;
	}

	/**
	 * Matches a line against {@link #rdfLine} and returns all groups
	 * (index 0 = whole match), or {@code null} if the line does not match.
	 */
	private static String[] splitStatement(String line) {
		Matcher m = rdfLine.matcher(line);
		if (!m.find()) {
			return null;
		}
		String[] groups = new String[m.groupCount() + 1];
		for (int i = 0; i < groups.length; i++) {
			groups[i] = m.group(i);
		}
		return groups;
	}

	/**
	 * Prints every statement of a file to standard output, each followed by a dot.
	 *
	 * @param args optional: {@code args[0]} is the file to parse; without
	 *             arguments the original hard-coded sample path is used
	 */
	public static void main(String[] args) {
		String path = args.length > 0
				? args[0]
				: "/media/TANA/corpora/dbpedia/properties/it_properties-20121102.nt";
		RDFParser p = null;
		try {
			p = new RDFParser(path);
			HashMap line;
			while ((line = p.next3()) != null) {
				System.out.println(line);
				System.out.print(".");
			}

		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (p != null) {
				try {
					p.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy