// opennlp.tools.parser.lang.en.HeadRules (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parser.lang.en;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.Stack;
import java.util.StringTokenizer;
import opennlp.tools.parser.Constituent;
import opennlp.tools.parser.GapLabeler;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.chunking.Parser;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.SerializableArtifact;
/**
* Class for storing the English head rules associated with parsing.
*/
public class HeadRules implements opennlp.tools.parser.HeadRules, GapLabeler, SerializableArtifact {
public static class HeadRulesSerializer implements ArtifactSerializer {
public HeadRules create(InputStream in) throws IOException {
return new HeadRules(new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)));
}
public void serialize(opennlp.tools.parser.lang.en.HeadRules artifact, OutputStream out)
throws IOException {
artifact.serialize(new OutputStreamWriter(out, StandardCharsets.UTF_8));
}
}
private static class HeadRule {
public boolean leftToRight;
public String[] tags;
public HeadRule(boolean l2r, String[] tags) {
leftToRight = l2r;
for (String tag : tags) {
Objects.requireNonNull(tag, "tags must not contain null values");
}
this.tags = tags;
}
@Override
public int hashCode() {
return Objects.hash(leftToRight, Arrays.hashCode(tags));
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (obj instanceof HeadRule) {
HeadRule rule = (HeadRule) obj;
return rule.leftToRight == leftToRight &&
Arrays.equals(rule.tags, tags);
}
return false;
}
}
private Map headRules;
private Set punctSet;
/**
* Creates a new set of head rules based on the specified head rules file.
*
* @param ruleFile the head rules file.
*
* @throws IOException if the head rules file can not be read.
*/
@Deprecated
public HeadRules(String ruleFile) throws IOException {
this(new BufferedReader(new FileReader(ruleFile)));
}
/**
* Creates a new set of head rules based on the specified reader.
*
* @param rulesReader the head rules reader.
*
* @throws IOException if the head rules reader can not be read.
*/
public HeadRules(Reader rulesReader) throws IOException {
BufferedReader in = new BufferedReader(rulesReader);
readHeadRules(in);
punctSet = new HashSet<>();
punctSet.add(".");
punctSet.add(",");
punctSet.add("``");
punctSet.add("''");
//punctSet.add(":");
}
public Set getPunctuationTags() {
return punctSet;
}
public Parse getHead(Parse[] constituents, String type) {
if (Parser.TOK_NODE.equals(constituents[0].getType())) {
return null;
}
HeadRule hr;
if (type.equals("NP") || type.equals("NX")) {
String[] tags1 = { "NN", "NNP", "NNPS", "NNS", "NX", "JJR", "POS" };
for (int ci = constituents.length - 1; ci >= 0; ci--) {
for (int ti = tags1.length - 1; ti >= 0; ti--) {
if (constituents[ci].getType().equals(tags1[ti])) {
return constituents[ci].getHead();
}
}
}
for (int ci = 0; ci < constituents.length; ci++) {
if (constituents[ci].getType().equals("NP")) {
return constituents[ci].getHead();
}
}
String[] tags2 = { "$", "ADJP", "PRN" };
for (int ci = constituents.length - 1; ci >= 0; ci--) {
for (int ti = tags2.length - 1; ti >= 0; ti--) {
if (constituents[ci].getType().equals(tags2[ti])) {
return constituents[ci].getHead();
}
}
}
String[] tags3 = { "JJ", "JJS", "RB", "QP" };
for (int ci = constituents.length - 1; ci >= 0; ci--) {
for (int ti = tags3.length - 1; ti >= 0; ti--) {
if (constituents[ci].getType().equals(tags3[ti])) {
return constituents[ci].getHead();
}
}
}
return constituents[constituents.length - 1].getHead();
}
else if ((hr = headRules.get(type)) != null) {
String[] tags = hr.tags;
int cl = constituents.length;
int tl = tags.length;
if (hr.leftToRight) {
for (int ti = 0; ti < tl; ti++) {
for (int ci = 0; ci < cl; ci++) {
if (constituents[ci].getType().equals(tags[ti])) {
return constituents[ci].getHead();
}
}
}
return constituents[0].getHead();
}
else {
for (int ti = 0; ti < tl; ti++) {
for (int ci = cl - 1; ci >= 0; ci--) {
if (constituents[ci].getType().equals(tags[ti])) {
return constituents[ci].getHead();
}
}
}
return constituents[cl - 1].getHead();
}
}
return constituents[constituents.length - 1].getHead();
}
private void readHeadRules(BufferedReader str) throws IOException {
String line;
headRules = new HashMap<>(30);
while ((line = str.readLine()) != null) {
StringTokenizer st = new StringTokenizer(line);
String num = st.nextToken();
String type = st.nextToken();
String dir = st.nextToken();
String[] tags = new String[Integer.parseInt(num) - 2];
int ti = 0;
while (st.hasMoreTokens()) {
tags[ti] = st.nextToken();
ti++;
}
headRules.put(type, new HeadRule(dir.equals("1"), tags));
}
}
public void labelGaps(Stack stack) {
if (stack.size() > 4) {
//Constituent con0 = (Constituent) stack.get(stack.size()-1);
Constituent con1 = stack.get(stack.size() - 2);
Constituent con2 = stack.get(stack.size() - 3);
Constituent con3 = stack.get(stack.size() - 4);
Constituent con4 = stack.get(stack.size() - 5);
// System.err.println("con0="+con0.label+" con1="+con1.label+" con2="
// +con2.label+" con3="+con3.label+" con4="+con4.label);
//subject extraction
if (con1.getLabel().equals("NP") && con2.getLabel().equals("S") && con3.getLabel().equals("SBAR")) {
con1.setLabel(con1.getLabel() + "-G");
con2.setLabel(con2.getLabel() + "-G");
con3.setLabel(con3.getLabel() + "-G");
}
//object extraction
else if (con1.getLabel().equals("NP") && con2.getLabel().equals("VP")
&& con3.getLabel().equals("S") && con4.getLabel().equals("SBAR")) {
con1.setLabel(con1.getLabel() + "-G");
con2.setLabel(con2.getLabel() + "-G");
con3.setLabel(con3.getLabel() + "-G");
con4.setLabel(con4.getLabel() + "-G");
}
}
}
/**
* Writes the head rules to the writer in a format suitable for loading
* the head rules again with the constructor. The encoding must be
* taken into account while working with the writer and reader.
*
* After the entries have been written, the writer is flushed.
* The writer remains open after this method returns.
*
* @param writer
* @throws IOException
*/
public void serialize(Writer writer) throws IOException {
for (Entry entry : headRules.entrySet()) {
String type = entry.getKey();
HeadRule headRule = entry.getValue();
// write num of tags
writer.write(Integer.toString(headRule.tags.length + 2));
writer.write(' ');
// write type
writer.write(type);
writer.write(' ');
// write l2r true == 1
if (headRule.leftToRight)
writer.write("1");
else
writer.write("0");
// write tags
for (String tag : headRule.tags) {
writer.write(' ');
writer.write(tag);
}
writer.write('\n');
}
writer.flush();
}
@Override
public int hashCode() {
return Objects.hash(headRules, punctSet);
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (obj instanceof HeadRules) {
HeadRules rules = (HeadRules) obj;
return rules.headRules.equals(headRules)
&& rules.punctSet.equals(punctSet);
}
return false;
}
@Override
public Class> getArtifactSerializerClass() {
return HeadRulesSerializer.class;
}
}