
// eu.fbk.dkm.pikes.resources.PropBank (pikes-resources)
// A collection of Java classes for accessing and querying a number of NLP resources.
package eu.fbk.dkm.pikes.resources;
import com.google.common.base.Charsets;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.*;
import com.google.common.io.Resources;
import eu.fbk.utils.core.CommandLine;
import eu.fbk.utils.core.StaxParser;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import javax.xml.stream.XMLStreamException;
import java.io.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
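// Provides access to PropBank rolesets loaded from the classpath resources PropBank.tsv
// and PropBank.coref, indexed both by roleset ID and by lemma. The main() entry point
// regenerates the TSV index from a directory of PropBank frame files and a PredicateMatrix
// file (see the command line options below).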
public final class PropBank {
private static final List<Roleset> ROLESETS;
private static final Map<String, Roleset> ID_INDEX;
private static final ListMultimap<String, Roleset> LEMMA_INDEX;
static {
try {
final Map<String, int[]> corefMap = Maps.newHashMap();
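// Each line of PropBank.coref is read as a roleset ID followed by two role numbers,
// interpreted further below as the entity role and the predicate role of that roleset
// (format inferred from this parsing code).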
for (final String line : Resources.readLines(
PropBank.class.getResource("PropBank.coref"), Charsets.UTF_8)) {
final String[] tokens = line.split("\\s+");
final int[] roles = new int[] { Integer.parseInt(tokens[1]),
Integer.parseInt(tokens[2]) };
corefMap.put(tokens[0], roles);
}
final Map<String, Roleset> idIndex = Maps.newLinkedHashMap();
final ListMultimap<String, Roleset> lemmaIndex = ArrayListMultimap.create();
final BufferedReader reader = Resources.asCharSource(
PropBank.class.getResource("PropBank.tsv"), Charsets.UTF_8)
.openBufferedStream();
String line;
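// Each PropBank.tsv row is read as six fixed columns (id, lemma, name, VerbNet frames,
// FrameNet frames, event types, with multiple values joined by '|'), followed by groups
// of four columns per role: role number, description, VerbNet roles, FrameNet roles.
// This layout is inferred from the parsing below and from the writer in Parser.parse().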
while ((line = reader.readLine()) != null) {
// Extract frame data
final String[] tokens = Iterables.toArray(Splitter.on('\t').split(line),
String.class);
final String id = tokens[0];
final String lemma = tokens[1];
final String name = tokens[2];
final List<String> vnFrames = Splitter.on('|').splitToList(tokens[3]);
final List<String> fnFrames = Splitter.on('|').splitToList(tokens[4]);
final List<String> eventTypes = Splitter.on('|').splitToList(tokens[5]);
// Extract role data
final List<String> argDescr = Lists.newArrayList();
final List<List<String>> argVNRoles = Lists.newArrayList();
final List<List<String>> argFNRoles = Lists.newArrayList();
for (int i = 0; i < 6; ++i) {
argDescr.add(null);
argVNRoles.add(null);
argFNRoles.add(null);
}
for (int i = 6; i + 3 < tokens.length; i += 4) {
final int num = Integer.parseInt(tokens[i]);
argDescr.set(num, tokens[i + 1]);
argVNRoles.set(num, Splitter.on('|').splitToList(tokens[i + 2]));
argFNRoles.set(num, Splitter.on('|').splitToList(tokens[i + 3]));
}
// Create and index the roleset
final int[] corefRoles = corefMap.get(id);
final int entityRole = corefRoles == null ? -1 : corefRoles[0];
final int predicateRole = corefRoles == null ? -1 : corefRoles[1];
final Roleset roleset = new Roleset(id, lemma, name, vnFrames, fnFrames,
eventTypes, argDescr, argVNRoles, argFNRoles, entityRole, predicateRole);
idIndex.put(id, roleset);
lemmaIndex.put(lemma, roleset);
}
reader.close();
ROLESETS = ImmutableList.copyOf(idIndex.values());
ID_INDEX = ImmutableMap.copyOf(idIndex);
LEMMA_INDEX = ImmutableListMultimap.copyOf(lemmaIndex);
} catch (final IOException ex) {
throw new Error("Cannot load PropBank data", ex);
}
}
@Nullable
public static Roleset getRoleset(@Nullable final String id) {
return ID_INDEX.get(id == null ? null : id.toLowerCase());
}
public static List<Roleset> getRolesets(@Nullable final String lemma) {
return LEMMA_INDEX.get(lemma == null ? null : lemma.toLowerCase());
}
public static List<Roleset> getRolesets() {
return ROLESETS;
}
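// Illustrative lookups (a sketch, not part of the original source; "buy.01" and "buy"
// are example inputs only):
//
//   Roleset rs = PropBank.getRoleset("buy.01");          // null if the ID is unknown
//   List<Roleset> senses = PropBank.getRolesets("buy");  // empty list for an unknown lemma
//
// The main() method below regenerates PropBank.tsv from PropBank frame files and a
// PredicateMatrix file. A possible invocation, assuming the class is packaged as
// pikes-resources.jar with its dependencies (jar name is illustrative):
//
//   java -cp pikes-resources.jar eu.fbk.dkm.pikes.resources.PropBank \
//       -f /path/to/propbank/frames -m /path/to/PredicateMatrix.txt -o PropBank.tsv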
public static void main(final String[] args) throws IOException, XMLStreamException {
try {
final CommandLine cmd = CommandLine
.parser()
.withName("PropBankBank")
.withHeader(
"Generate a TSV file with indexed eu.fbk.dkm.pikes.resources.PropBank data, "
+ "including mapping to eu.fbk.dkm.pikes.resources.VerbNet and eu.fbk.dkm.pikes.resources.FrameNet from the PredicateMatrix")
.withOption("f", "frames", "the directory containing frame definitions",
"DIR", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
.withOption("m", "matrix", "the file containing the predicate matrix", "FILE",
CommandLine.Type.FILE_EXISTING, true, false, true)
.withOption("o", "output", "output file", "FILE", CommandLine.Type.FILE, true, false, true)
.withLogger(LoggerFactory.getLogger("eu.fbk.nafview")).parse(args);
final File dir = cmd.getOptionValue("f", File.class);
final File pm = cmd.getOptionValue("m", File.class);
final File output = cmd.getOptionValue("o", File.class);
// Parse the predicate matrix
final Matrix matrix = new Matrix(pm);
final Writer writer = new OutputStreamWriter(new BufferedOutputStream(
new FileOutputStream(output)), Charsets.UTF_8);
final File[] files = dir.listFiles();
Arrays.sort(files);
for (final File file : files) {
if (file.getName().endsWith(".xml")) {
System.out.println("Processing " + file);
final Reader reader = new BufferedReader(new FileReader(file));
try {
new Parser(reader, matrix).parse(writer);
} finally {
reader.close();
}
}
}
} catch (final Throwable ex) {
CommandLine.fail(ex);
}
}
private static class Matrix {
final Multimap<String, String> vnFrames;
final Multimap<String, String> fnFrames;
final Multimap<String, String> eventTypes;
final Multimap<String, String> vnRoles;
final Multimap<String, String> fnRoles;
Matrix(final File file) throws IOException {
this.vnFrames = HashMultimap.create();
this.fnFrames = HashMultimap.create();
this.eventTypes = HashMultimap.create();
this.vnRoles = HashMultimap.create();
this.fnRoles = HashMultimap.create();
parseMatrix(file);
}
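// Column indices read from the predicate matrix, as used by this parser: 0 = VerbNet
// class, 2 = VerbNet subclass, 5 = VerbNet role, 8 = FrameNet frame, 10 = FrameNet role,
// 11 = PropBank frame, 12 = PropBank role, 17 = event type. Values may carry a prefix
// such as "pb:" and may be the literal string "NULL"; see parseMatrixValue() below.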
private void parseMatrix(final File matrixFile) throws IOException {
final BufferedReader in = new BufferedReader(new InputStreamReader(
new FileInputStream(matrixFile), Charsets.UTF_8));
try {
// Process the predicate matrix file one line at a time
String line;
while ((line = in.readLine()) != null) {
// Split the line in its cells. Skip line if there are not enough cells
final String[] tokens = line.split("\t");
if (tokens.length <= 18) {
continue;
}
// Extract the PropBank frame and role. Skip line if NULL
final String pbFrame = parseMatrixValue(tokens[11]);
if (pbFrame == null) {
continue;
}
final String pbRole = parseMatrixValue(tokens[12]);
final String pbFrameRole = pbFrame + pbRole;
// Extract and index VerbNet data: class, subclass, role
final String vnClass = parseMatrixValue(tokens[0]);
final String vnSubClass = parseMatrixValue(tokens[2]);
final String vnFrame = vnSubClass != null ? vnSubClass : vnClass;
final String vnRole = parseMatrixValue(tokens[5]);
if (vnSubClass != null && vnClass != null && !vnSubClass.startsWith(vnClass)) {
System.err.println("Unexpected VN class / subclass pair: " + vnClass
+ ", " + vnSubClass);
}
if (vnFrame != null) {
this.vnFrames.put(pbFrame, vnFrame);
if (vnRole != null) {
this.vnRoles.put(pbFrameRole, vnRole);
}
}
// Extract and index FrameNet data: frame and frame element
final String fnFrame = parseMatrixValue(tokens[8]);
final String fnRole = parseMatrixValue(tokens[10]);
if (fnFrame != null) {
this.fnFrames.put(pbFrame, fnFrame);
if (fnRole != null) {
this.fnRoles.put(pbFrameRole, fnRole);
}
}
// Extract and index event type
final String eventType = parseMatrixValue(tokens[17]);
if (eventType != null) {
this.eventTypes.put(pbFrame, eventType);
}
}
} finally {
in.close();
}
}
@Nullable
private static String parseMatrixValue(@Nullable String string) {
if (string != null) {
// Skip an optional prefix (e.g., pb:)
final int index = string.indexOf(':');
if (index > 0) {
string = string.substring(index + 1);
}
// Return the value only if not NULL
if (!"NULL".equalsIgnoreCase(string)) {
return string;
}
}
return null;
}
}
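// Streams a PropBank frame file (frameset > predicate > roleset > roles > role) and, for
// each roleset, writes one tab-separated line: id, lemma, name, VerbNet frames, FrameNet
// frames, event types, followed by (n, descr, VerbNet roles, FrameNet roles) for each
// numeric role, with multiple values joined by '|'. Frame and role mappings come from the
// supplied predicate matrix.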
private static class Parser extends StaxParser {
private final Matrix matrix;
Parser(final Reader reader, @Nullable final Matrix matrix) throws IOException {
super(reader);
this.matrix = matrix;
}
void parse(final Writer writer) throws IOException, XMLStreamException {
enter("frameset");
while (tryEnter("predicate")) {
// Extract the lemma (may be different from the one in the ID)
final String lemma = attribute("lemma").trim().replace('_', ' ').toLowerCase();
// Process rolesets for the current predicate lemma
while (tryEnter("roleset")) {
// Extract PropBank sense and associated description
final String id = attribute("id").trim();
final String name = attribute("name").trim();
// Retrieve frame data from the predicate matrix
final String vnFrames = Joiner.on('|').join(
Ordering.natural().sortedCopy(this.matrix.vnFrames.get(id)));
final String fnFrames = Joiner.on('|').join(
Ordering.natural().sortedCopy(this.matrix.fnFrames.get(id)));
final String eventTypes = Joiner.on('|').join(
Ordering.natural().sortedCopy(this.matrix.eventTypes.get(id)));
// Emit frame data
writer.write(id);
writer.write('\t');
writer.write(lemma);
writer.write('\t');
writer.write(name);
writer.write('\t');
writer.write(vnFrames);
writer.write('\t');
writer.write(fnFrames);
writer.write('\t');
writer.write(eventTypes);
// Process PropBank roles for current roleset
if (tryEnter("roles")) {
while (tryEnter("role")) {
try {
// Extract role number and associated description
final int n = Integer.parseInt(attribute("n"));
final String descr = attribute("descr").trim();
// Retrieve role data from the predicate matrix
final String roleId = id + n;
final String vnRoles = Joiner.on('|').join(
Ordering.natural().sortedCopy(
this.matrix.vnRoles.get(roleId)));
final String fnRoles = Joiner.on('|').join(
Ordering.natural().sortedCopy(
this.matrix.fnRoles.get(roleId)));
// Emit role data
writer.write('\t');
writer.write(Integer.toString(n));
writer.write('\t');
writer.write(Strings.nullToEmpty(descr));
writer.write('\t');
writer.write(vnRoles);
writer.write('\t');
writer.write(fnRoles);
} catch (final NumberFormatException ex) {
// ignore
}
leave();
}
leave();
}
// End and flush the line
writer.write('\n');
writer.flush();
leave();
}
leave();
}
leave();
}
}
public static final class Roleset {
private static final Interner