opennlp.tools.parser.lang.es.AncoraSpanishHeadRules Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.parser.lang.es;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.Stack;
import java.util.StringTokenizer;
import opennlp.tools.parser.Constituent;
import opennlp.tools.parser.GapLabeler;
import opennlp.tools.parser.HeadRules;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.chunking.Parser;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.SerializableArtifact;
/**
* Class for storing the Ancora Spanish head rules associated with parsing. In this class
* headrules for noun phrases are specified. The rest of the rules are
* in opennlp-tools/lang/es/parser/es-head-rules
*
* NOTE: This class has been adapted from opennlp.tools.parser.lang.en.HeadRules
*
* The main change is the constituents search direction in the first for loop.
*
* Note also the change in the return of the getHead() method:
* In the lang.en.HeadRules class: return constituents[ci].getHead();
* Now: return constituents[ci];
*
* Other changes include removal of deprecated methods.
*
*/
public class AncoraSpanishHeadRules implements HeadRules, GapLabeler, SerializableArtifact {
public static class HeadRulesSerializer implements ArtifactSerializer {
public AncoraSpanishHeadRules create(InputStream in) throws IOException {
return new AncoraSpanishHeadRules(new BufferedReader(
new InputStreamReader(in, StandardCharsets.UTF_8)));
}
public void serialize(opennlp.tools.parser.lang.es.AncoraSpanishHeadRules artifact, OutputStream out)
throws IOException {
artifact.serialize(new OutputStreamWriter(out, StandardCharsets.UTF_8));
}
}
private static class HeadRule {
public boolean leftToRight;
public String[] tags;
public HeadRule(boolean l2r, String[] tags) {
leftToRight = l2r;
for (String tag : tags) {
Objects.requireNonNull(tag, "tags must not contain null values!");
}
this.tags = tags;
}
@Override
public int hashCode() {
return Objects.hash(leftToRight, Arrays.hashCode(tags));
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (obj instanceof HeadRule) {
HeadRule rule = (HeadRule) obj;
return (rule.leftToRight == leftToRight) &&
Arrays.equals(rule.tags, tags);
}
return false;
}
}
private Map headRules;
private Set punctSet;
/**
* Creates a new set of head rules based on the specified reader.
*
* @param rulesReader the head rules reader.
*
* @throws IOException if the head rules reader can not be read.
*/
public AncoraSpanishHeadRules(Reader rulesReader) throws IOException {
BufferedReader in = new BufferedReader(rulesReader);
readHeadRules(in);
punctSet = new HashSet<>();
punctSet.add(".");
punctSet.add(",");
punctSet.add("``");
punctSet.add("''");
//punctSet.add(":");
}
public Set getPunctuationTags() {
return punctSet;
}
public Parse getHead(Parse[] constituents, String type) {
if (Parser.TOK_NODE.equals(constituents[0].getType())) {
return null;
}
HeadRule hr;
if (type.equals("SN") || type.equals("GRUP.NOM")) {
String[] tags1 = {"AQA.*","AQC.*","GRUP\\.A","S\\.A","NC.*S.*", "NP.*","NC.*P.*", "GRUP\\.NOM"};
for (int i = 0; i < constituents.length; i++) {
for (int t = tags1.length - 1; t >= 0; t--) {
if (constituents[i].getType().matches(tags1[t])) {
return constituents[i];
}
}
}
for (int ci = 0; ci < constituents.length; ci++) {
if (constituents[ci].getType().equals("SN") || constituents[ci].getType().equals("GRUP.NOM")) {
return constituents[ci];
}
}
String[] tags2 = {"\\$","GRUP\\.A","SA"};
for (int ci = constituents.length - 1; ci >= 0; ci--) {
for (int ti = tags2.length - 1; ti >= 0; ti--) {
if (constituents[ci].getType().matches(tags2[ti])) {
return constituents[ci];
}
}
}
String[] tags3 = {"AQ0.*", "AQ[AC].*","AO.*","GRUP\\.A","S\\.A","RG","RN","GRUP\\.NOM"};
for (int ci = constituents.length - 1; ci >= 0; ci--) {
for (int ti = tags3.length - 1; ti >= 0; ti--) {
if (constituents[ci].getType().matches(tags3[ti])) {
return constituents[ci];
}
}
}
return constituents[constituents.length - 1].getHead();
}
else if ((hr = headRules.get(type)) != null) {
String[] tags = hr.tags;
int cl = constituents.length;
int tl = tags.length;
if (hr.leftToRight) {
for (int ti = 0; ti < tl; ti++) {
for (int ci = 0; ci < cl; ci++) {
if (constituents[ci].getType().matches(tags[ti])) {
return constituents[ci];
}
}
}
return constituents[0].getHead();
}
else {
for (int ti = 0; ti < tl; ti++) {
for (int ci = cl - 1; ci >= 0; ci--) {
if (constituents[ci].getType().matches(tags[ti])) {
return constituents[ci];
}
}
}
return constituents[cl - 1].getHead();
}
}
return constituents[constituents.length - 1].getHead();
}
private void readHeadRules(BufferedReader str) throws IOException {
String line;
headRules = new HashMap<>(60);
while ((line = str.readLine()) != null) {
StringTokenizer st = new StringTokenizer(line);
String num = st.nextToken();
String type = st.nextToken();
String dir = st.nextToken();
String[] tags = new String[Integer.parseInt(num) - 2];
int ti = 0;
while (st.hasMoreTokens()) {
tags[ti] = st.nextToken();
ti++;
}
headRules.put(type, new HeadRule(dir.equals("1"), tags));
}
}
public void labelGaps(Stack stack) {
if (stack.size() > 4) {
//Constituent con0 = (Constituent) stack.get(stack.size()-1);
Constituent con1 = stack.get(stack.size() - 2);
Constituent con2 = stack.get(stack.size() - 3);
Constituent con3 = stack.get(stack.size() - 4);
Constituent con4 = stack.get(stack.size() - 5);
//subject extraction
if (con1.getLabel().equals("SN")
&& con2.getLabel().equals("S") && con3.getLabel().equals("GRUP.NOM")) {
con1.setLabel(con1.getLabel() + "-G");
con2.setLabel(con2.getLabel() + "-G");
con3.setLabel(con3.getLabel() + "-G");
}
//object extraction
else if (con1.getLabel().equals("SN") && con2.getLabel().equals("GRUP.VERB")
&& con3.getLabel().equals("S") && con4.getLabel().equals("GRUP.NOM")) {
con1.setLabel(con1.getLabel() + "-G");
con2.setLabel(con2.getLabel() + "-G");
con3.setLabel(con3.getLabel() + "-G");
con4.setLabel(con4.getLabel() + "-G");
}
}
}
/**
* Writes the head rules to the writer in a format suitable for loading
* the head rules again with the constructor. The encoding must be
* taken into account while working with the writer and reader.
*
* After the entries have been written, the writer is flushed.
* The writer remains open after this method returns.
*
* @param writer
* @throws IOException
*/
public void serialize(Writer writer) throws IOException {
for (Entry entry : headRules.entrySet()) {
String type = entry.getKey();
HeadRule headRule = entry.getValue();
// write num of tags
writer.write(Integer.toString(headRule.tags.length + 2));
writer.write(' ');
// write type
writer.write(type);
writer.write(' ');
// write l2r true == 1
if (headRule.leftToRight)
writer.write("1");
else
writer.write("0");
// write tags
for (String tag : headRule.tags) {
writer.write(' ');
writer.write(tag);
}
writer.write('\n');
}
writer.flush();
}
@Override
public int hashCode() {
return Objects.hash(headRules, punctSet);
}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (obj instanceof AncoraSpanishHeadRules) {
AncoraSpanishHeadRules rules = (AncoraSpanishHeadRules) obj;
return rules.headRules.equals(headRules)
&& rules.punctSet.equals(punctSet);
}
return false;
}
@Override
public Class> getArtifactSerializerClass() {
return HeadRulesSerializer.class;
}
}