// org.fbk.cit.hlt.thewikimachine.util.RDFParser Maven / Gradle / Ivy
package org.fbk.cit.hlt.thewikimachine.util;
import org.apache.commons.lang.StringEscapeUtils;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
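/**
 * Line-oriented reader for RDF statements stored one per line in the
 * {@code <subject> <predicate> object .} form, where the object is either an
 * IRI in angle brackets or a quoted literal followed by a language tag or a
 * datatype. Lines starting with {@code #} are treated as comments.
 *
 * Created by aprosio on 1/30/13, 9:02 AM.
 */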
public class RDFParser implements Closeable {

    /** Reader over the statement file; released by {@link #close()}. */
    BufferedReader support;

    /**
     * Matches one statement line. Capturing groups:
     * <ol>
     * <li>subject IRI</li>
     * <li>predicate IRI</li>
     * <li>whole object term (IRI in brackets, or literal with its suffix)</li>
     * <li>object IRI (only when the object is an IRI)</li>
     * <li>literal text between the quotes (only when the object is a literal)</li>
     * <li>literal suffix: {@code @lang} or {@code ^^<datatype>}</li>
     * <li>datatype IRI (only for typed literals)</li>
     * <li>optional fourth term including its leading whitespace</li>
     * <li>IRI of the optional fourth term</li>
     * </ol>
     * The terminating dot is escaped so that only a literal {@code .} ends a
     * statement, and language tags may carry subtags such as {@code @en-GB}.
     */
    Pattern rdfLine = Pattern.compile("^<([^>]+)>\\s+<([^>]+)>\\s+(<([^>]+)>|\"(.*)\"(@[a-zA-Z0-9]+(?:-[a-zA-Z0-9]+)*|\\^\\^<([^>]+)>))(\\s+<([^>]+)>)?\\s*\\.$");

    /** Matches a four-hex-digit {@code \}{@code uXXXX} escape sequence; group 1 is the hex code. */
    static Pattern unicodePatt = Pattern.compile("\\\\u([0-9a-fA-F]{4})");

    /**
     * Signals a line that does not match {@link #rdfLine}. It extends
     * {@link IOException} so that callers of {@link #rawNext3(boolean)} which
     * catch {@code IOException} keep working, while {@link #next3(boolean)}
     * can tell a bad line apart from a genuine read failure.
     */
    private static final class MalformedLineException extends IOException {
        private static final long serialVersionUID = 1L;

        MalformedLineException(String message) {
            super(message);
        }
    }

    /**
     * Opens the file with the given name.
     *
     * @param fileName path of the file to read
     * @throws IOException if the file does not exist or cannot be opened
     */
    public RDFParser(String fileName) throws IOException {
        this(new File(fileName));
    }

    /**
     * Opens the given file for reading, decoding it as UTF-8.
     *
     * @param file the file to read
     * @throws IOException if the file does not exist or cannot be opened
     */
    public RDFParser(File file) throws IOException {
        if (!file.exists()) {
            throw new IOException("RDF file not found: " + file);
        }
        // Explicit charset: FileReader would silently use the platform default.
        support = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
    }

    /**
     * Closes the underlying reader. Reading after this call fails with an
     * {@link IOException}.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        support.close();
    }

    /**
     * Replaces every {@code \}{@code uXXXX} escape in the input with the
     * character it denotes.
     *
     * @param myString text possibly containing escapes; must not be {@code null}
     * @return the text with all four-digit escapes decoded
     */
    static public String decodeUnicode(String myString) {
        Matcher m = unicodePatt.matcher(myString);
        // StringBuffer is kept because Matcher.appendReplacement only accepts
        // a StringBuilder from Java 9 onwards.
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            char decoded = (char) Integer.parseInt(m.group(1), 16);
            // Quote the replacement: a decoded '$' or '\' would otherwise be
            // interpreted as a group reference / escape and throw.
            m.appendReplacement(sb, Matcher.quoteReplacement(String.valueOf(decoded)));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Reads the next statement and returns the raw capturing groups of
     * {@link #rdfLine}. Blank lines and lines starting with {@code #} are
     * skipped.
     *
     * @param decodeUnicode whether to run {@link #decodeUnicode(String)} on
     *                      the line before matching it
     * @return an array whose index {@code i} holds group {@code i} of the
     *         match (index 0 is the whole line; unmatched groups are
     *         {@code null}), or {@code null} at end of input
     * @throws IOException if reading fails, or if the line is not a valid
     *                     statement (message {@code "ERR: "} plus the line)
     */
    public String[] rawNext3(boolean decodeUnicode) throws IOException {
        String line;
        do {
            line = support.readLine();
            if (line == null) {
                return null;
            }
            line = line.trim();
        } while (line.length() == 0 || line.startsWith("#"));

        if (decodeUnicode) {
            line = decodeUnicode(line);
        }

        Matcher m = rdfLine.matcher(line);
        if (m.find()) {
            String[] tempRow = new String[m.groupCount() + 1];
            for (int i = 0; i < tempRow.length; i++) {
                tempRow[i] = m.group(i);
            }
            return tempRow;
        }
        throw new MalformedLineException("ERR: " + line);
    }

    /**
     * Same as {@link #next3(boolean)} with unicode decoding disabled.
     *
     * @return the next statement as a map, or {@code null} at end of input
     * @throws IOException if reading from the file fails
     */
    public HashMap next3() throws IOException {
        return next3(false);
    }

    /**
     * Reads the next well-formed statement and returns it as a map. Malformed
     * lines are reported on standard output and skipped.
     *
     * <p>Keys always present: {@code domain}, {@code relation},
     * {@code simpleDomain}, {@code simpleRelation}, {@code type} and
     * {@code range}. For an IRI object {@code type} is {@code "resource"} and
     * {@code simpleRange} is added; for a language-tagged literal
     * {@code type} is {@code "string"} and {@code lang} holds the tag without
     * the leading {@code @}; for a typed literal {@code type} is derived from
     * the datatype IRI via {@code DBpediaOntology.cleanName}.
     *
     * <p>The raw {@code HashMap} return type is kept so existing callers
     * continue to compile unchanged.
     *
     * @param decodeUnicode whether to decode {@code \}{@code uXXXX} escapes in
     *                      the line before parsing it
     * @return the next statement as a map, or {@code null} at end of input
     * @throws IOException if reading from the file fails
     */
    public HashMap next3(boolean decodeUnicode) throws IOException {
        String[] tempRow;
        while (true) {
            try {
                tempRow = this.rawNext3(decodeUnicode);
                break;
            } catch (MalformedLineException e) {
                // Only bad lines are skipped. A genuine I/O failure propagates
                // instead of being retried forever.
                System.out.println(e.getMessage());
            }
        }
        if (tempRow == null) {
            return null;
        }

        HashMap ret = new HashMap();
        ret.put("domain", tempRow[1]);
        ret.put("relation", tempRow[2]);
        ret.put("simpleDomain", DBpediaOntology.cleanGenericName(ret.get("domain")));
        ret.put("simpleRelation", DBpediaOntology.cleanGenericName(ret.get("relation")));

        if (tempRow[6] == null) {
            // No literal suffix matched, so the object is an IRI.
            ret.put("type", "resource");
            String s = StringEscapeUtils.unescapeJava(tempRow[4]);
            ret.put("range", s);
            ret.put("simpleRange", DBpediaOntology.cleanGenericName(ret.get("range")));
        } else if (tempRow[7] == null) {
            // Suffix without a datatype IRI: a language-tagged string.
            ret.put("type", "string");
            String s = StringEscapeUtils.unescapeJava(tempRow[5]);
            ret.put("range", s);
            ret.put("lang", tempRow[6].substring(1));
        } else {
            // Typed literal: the type comes from the datatype IRI.
            ret.put("type", DBpediaOntology.cleanName(tempRow[7], "http://www.w3.org/2001/XMLSchema#"));
            String s = StringEscapeUtils.unescapeJava(tempRow[5]);
            ret.put("range", s);
        }
        return ret;
    }

    /**
     * Prints every statement of a file to standard output, each followed by a
     * dot.
     *
     * @param args optional; {@code args[0]} is the file to read. Without it
     *             the original hard-coded DBpedia dump path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0)
                ? args[0]
                : "/media/TANA/corpora/dbpedia/properties/it_properties-20121102.nt";
        RDFParser p = null;
        try {
            p = new RDFParser(path);
            HashMap line;
            while ((line = p.next3()) != null) {
                System.out.println(line);
                System.out.print(".");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (p != null) {
                try {
                    p.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails at exit.
                }
            }
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy