// edu.emory.mathcs.nlp.common.constituent.CTLibEn (Maven / Gradle / Ivy)
// The newest version!
package edu.emory.mathcs.nlp.common.constituent;
/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import edu.emory.mathcs.nlp.common.constant.StringConst;
import edu.emory.mathcs.nlp.common.treebank.POSLibEn;
import edu.emory.mathcs.nlp.common.treebank.POSTagEn;
import edu.emory.mathcs.nlp.common.util.DSUtils;
import edu.emory.mathcs.nlp.common.util.ENUtils;
import edu.emory.mathcs.nlp.common.util.PatternUtils;
import edu.emory.mathcs.nlp.common.util.StringUtils;
/**
 * Static helpers for English constituent trees: repairing inconsistent function tags,
 * linking reduced passive nulls and relativizers to their antecedents, and answering
 * tag-based questions about coordination and phrase/clause types.
 * The class is never instantiated; every member is static.
 *
 * @author Jinho D. Choi ({@code [email protected]})
 */
public class CTLibEn extends CTLib implements CTTagEn, POSTagEn
{
    /** Word-form pattern for passive nulls, built from the alternatives {@code \*} and {@code \*-\d+}. */
    public static final Pattern P_PASSIVE_NULL = PatternUtils.createClosedORPattern("\\*","\\*-\\d+");

    // Predicates built by CTLib.matchC from a single constituent tag.
    public static final Predicate<CTNode> M_NP     = CTLib.matchC(C_NP);
    public static final Predicate<CTNode> M_VP     = CTLib.matchC(C_VP);
    public static final Predicate<CTNode> M_QP     = CTLib.matchC(C_QP);
    public static final Predicate<CTNode> M_ADVP   = CTLib.matchC(C_ADVP);
    public static final Predicate<CTNode> M_SBAR   = CTLib.matchC(C_SBAR);
    public static final Predicate<CTNode> M_EDITED = CTLib.matchC(C_EDITED);

    // Predicates built by CTLib.matchF from a single function tag.
    public static final Predicate<CTNode> M_SBJ = CTLib.matchF(F_SBJ);
    public static final Predicate<CTNode> M_NOM = CTLib.matchF(F_NOM);
    public static final Predicate<CTNode> M_PRD = CTLib.matchF(F_PRD);

    // Predicate built by CTLib.matchCF from a constituent tag plus a function tag.
    public static final Predicate<CTNode> M_NP_SBJ = CTLib.matchCF(C_NP, F_SBJ);

    // Predicates built by CTLib.matchCp (presumably a constituent-tag prefix match; see CTLib).
    public static final Predicate<CTNode> M_NNx   = CTLib.matchCp(POS_NN);
    public static final Predicate<CTNode> M_VBx   = CTLib.matchCp(POS_VB);
    public static final Predicate<CTNode> M_WHx   = CTLib.matchCp("WH");
    public static final Predicate<CTNode> M_Sx    = CTLib.matchCp(C_S);
    public static final Predicate<CTNode> M_SBARx = CTLib.matchCp(C_SBAR);

    // Predicates built by CTLib.matchCo from a set of constituent tags.
    public static final Predicate<CTNode> M_S_SBAR     = CTLib.matchCo(DSUtils.toHashSet(C_S, C_SBAR));
    public static final Predicate<CTNode> M_NP_NML     = CTLib.matchCo(DSUtils.toHashSet(C_NP, C_NML));
    public static final Predicate<CTNode> M_VBD_VBN    = CTLib.matchCo(DSUtils.toHashSet(POS_VBD, POS_VBN));
    public static final Predicate<CTNode> M_VP_RRC_UCP = CTLib.matchCo(DSUtils.toHashSet(C_VP, C_RRC, C_UCP));

    // Tag sets consulted through CTNode#isConstituentTagAny.
    private static final Set<String> S_LGS_PHRASE     = DSUtils.toHashSet(C_PP, C_SBAR);
    private static final Set<String> S_MAIN_CLAUSE    = DSUtils.toHashSet(C_S, C_SQ, C_SINV);
    private static final Set<String> S_EDITED_PHRASE  = DSUtils.toHashSet(C_EDITED, C_EMBED);
    private static final Set<String> S_NOMINAL_PHRASE = DSUtils.toHashSet(C_NP, C_NML, C_NX, C_NAC);
    private static final Set<String> S_WH_LINK        = DSUtils.toHashSet(C_WHNP, C_WHPP, C_WHADVP);
    private static final Set<String> S_SEPARATOR      = DSUtils.toHashSet(POS_COMMA, POS_COLON);
    private static final Set<String> S_CONJUNCTION    = DSUtils.toHashSet(POS_CC, C_CONJP);

    /** Not instantiable: this class only hosts static members. */
    private CTLibEn() {}

    /**
     * Fixes inconsistent function tags.
     * Links antecedents of reduced passive nulls ({@code *}) and relativizers.
     * The three passes run in that order on the same tree.
     * @param tree the tree to modify in place.
     * @see #fixFunctionTags(CTTree)
     * @see #linkReducedPassiveNulls(CTTree)
     * @see #linkRelativizers(CTTree)
     */
    public static void preprocess(CTTree tree)
    {
        fixFunctionTags(tree);
        linkReducedPassiveNulls(tree);
        linkRelativizers(tree);
    }

//  ======================== Fix function tags ========================

    /**
     * Fixes inconsistent function tags in the specific tree.
     * @param tree the tree to modify in place.
     * @see CTLibEn#fixSBJ(CTNode)
     * @see CTLibEn#fixLGS(CTNode)
     * @see CTLibEn#fixCLF(CTNode)
     */
    public static void fixFunctionTags(CTTree tree)
    {
        fixFunctionTagsAux(tree.getRoot());
    }

    /**
     * Called by {@link CTLibEn#fixFunctionTags(CTTree)}.
     * Applies at most one fix to the node (SBJ, then LGS, then CLF; the first one that
     * succeeds stops the others), then recurses into every child.
     */
    private static void fixFunctionTagsAux(CTNode node)
    {
        // Same short-circuit order as before, without the empty-statement 'if'.
        if (!fixSBJ(node) && !fixLGS(node))
            fixCLF(node);

        for (CTNode child : node.getChildrenList())
            fixFunctionTagsAux(child);
    }

    /**
     * If the specific node contains the function tag {@link CTTagEn#F_SBJ} and it is the only child of its parent,
     * moves the tag to its parent and copies the node's constituent tag onto the parent.
     * Nothing is changed when the parent is an edited/embedded phrase or already carries a function tag.
     * @return {@code true} if the tag was moved.
     */
    private static boolean fixSBJ(CTNode node)
    {
        if (!node.hasFunctionTag(F_SBJ))
            return false;

        CTNode parent = node.getParent();

        // A root node has no parent to receive the tag.
        if (parent == null)
            return false;

        if (parent.getChildrenSize() == 1 && !parent.isConstituentTagAny(S_EDITED_PHRASE) && parent.hasNoFunctionTag())
        {
            node.removeFunctionTag(F_SBJ);
            parent.addFunctionTag(F_SBJ);
            parent.setConstituentTag(node.getConstituentTag());
            return true;
        }

        return false;
    }

    /**
     * If the specific node contains the function tag {@link CTTagEn#F_LGS} and it is not a prepositional phrase,
     * moves the tag to its parent, provided the parent is a {@code PP} or {@code SBAR}.
     * @return {@code true} if the tag was moved.
     */
    private static boolean fixLGS(CTNode node)
    {
        if (!node.hasFunctionTag(F_LGS) || node.isConstituentTag(C_PP))
            return false;

        CTNode parent = node.getParent();

        // A root node has no parent to receive the tag.
        if (parent != null && parent.isConstituentTagAny(S_LGS_PHRASE))
        {
            node.removeFunctionTag(F_LGS);
            parent.addFunctionTag(F_LGS);
            return true;
        }

        return false;
    }

    /**
     * If the specific node contains the function tag {@link CTTagEn#F_CLF} and it is a main clause,
     * removes the tag from the node and adds it to the first descendant accepted by {@link #M_SBARx}.
     * The tag is removed from the node even when no such descendant exists.
     * @return {@code true} if the tag was added to a descendant.
     */
    private static boolean fixCLF(CTNode node)
    {
        if (!node.hasFunctionTag(F_CLF) || !isMainClause(node))
            return false;

        CTNode desc = node.getFirstDescendant(M_SBARx);
        node.removeFunctionTag(F_CLF);

        if (desc == null)
            return false;

        desc.addFunctionTag(F_CLF);
        return true;
    }

//  ======================== Passive nulls ========================

    /**
     * Finds reduced passive empty category ({@code *}) and links them to their antecedents in the specific tree.
     * This method links most but not all antecedents; especially ones related to parenthetical phrases and topicalization.
     * @param tree the tree to modify in place.
     * @see CTLibEn#isPassiveEmptyCategory(CTNode)
     */
    public static void linkReducedPassiveNulls(CTTree tree)
    {
        linkReducedPassiveNullsAux(tree, tree.getRoot());
    }

    /** Called by {@link #linkReducedPassiveNulls(CTTree)}; visits the node and then all of its descendants. */
    private static void linkReducedPassiveNullsAux(CTTree tree, CTNode curr)
    {
        if (isPassiveEmptyCategory(curr) && curr.isWordForm("*"))
            linkReducedPassiveNull(tree, curr);

        for (CTNode child : curr.getChildrenList())
            linkReducedPassiveNullsAux(tree, child);
    }

    /**
     * Sets the antecedent of one reduced passive null.
     * The caller guarantees (via {@link #isPassiveEmptyCategory(CTNode)}) that the node's parent
     * is an {@code NP} whose own parent is a {@code VP}.
     */
    private static void linkReducedPassiveNull(CTTree tree, CTNode curr)
    {
        CTNode start = curr.getParent();                          // NP
        int index = start.getParent().getEmptyCategoryIndex();    // taken from the VP

        if (index != -1)
        {
            // When the VP is co-indexed, continue from the first node registered under that index.
            List<CTNode> list = tree.getEmptyCategoryList(index);
            if (list != null && !list.isEmpty()) start = list.get(0);
        }

        CTNode vp = start.getHighestChainedAncestor(M_VP_RRC_UCP);

        // No chained ancestor, or that ancestor is the root: there are no siblings to search.
        if (vp == null || !vp.hasParent())
            return;

        CTNode top = vp.getParent();

        if (top.matches(M_NP_NML) || top.hasFunctionTag(F_NOM))
        {
            // Try progressively weaker candidates to the left of the verb phrase.
            curr.setAntecedent(vp.getLeftNearestSibling(M_NP_NML));

            if (!curr.hasAntecedent())
                curr.setAntecedent(vp.getLeftNearestSibling(M_NNx));

            if (!curr.hasAntecedent())
                curr.setAntecedent(vp.getLeftNearestSibling(M_QP));

            if (!curr.hasAntecedent())
                curr.setAntecedent(vp.getLeftNearestSibling(M_NOM));
        }
        else if (isClause(top))
        {
            curr.setAntecedent(vp.getLeftNearestSibling(M_NP_SBJ));

            if (!curr.hasAntecedent())    // VP-TPC
                curr.setAntecedent(vp.getRightNearestSibling(M_NP_SBJ));
        }
    }

    /**
     * @return {@code true} if the specific node represents a passive null: an empty category whose
     * word form matches {@link #P_PASSIVE_NULL}, whose parent is an {@code NP} without function tags
     * directly under a {@code VP}, and whose parent's left sibling is accepted by {@link #M_VBD_VBN}.
     */
    public static boolean isPassiveEmptyCategory(CTNode node)
    {
        if (!node.isEmptyCategory() || !node.matchesWordForm(P_PASSIVE_NULL) || !node.hasParent())
            return false;

        CTNode np = node.getParent();

        return np.isConstituentTag(C_NP) && np.hasNoFunctionTag()
            && np.hasParent() && np.getParent().isConstituentTag(C_VP)
            && np.hasLeftSibling() && np.getLeftSibling().matches(M_VBD_VBN);
    }

//  ======================== Complementizers ========================

    /**
     * Finds relativizers and links them to their antecedents in the specific tree.
     * This method links most but not all antecedents; especially when the relativizers are under {@code *-PRD} phrases.
     * @param tree the tree to modify in place.
     */
    public static void linkRelativizers(CTTree tree)
    {
        linkRelativizersAux(tree, tree.getRoot());
    }

    /**
     * Called by {@link #linkRelativizers(CTTree)}.
     * A {@code WHNP|WHPP|WHADVP} node is processed here and its subtree is not searched further;
     * any other node is only traversed.
     */
    private static void linkRelativizersAux(CTTree tree, CTNode curr)
    {
        if (!isWhPhraseLink(curr))
        {
            for (CTNode child : curr.getChildrenList())
                linkRelativizersAux(tree, child);

            return;
        }

        CTNode comp = getRelativizer(curr);
        CTNode sbar = curr.getHighestChainedAncestor(M_SBAR);

        if (comp == null || sbar == null || sbar.hasFunctionTag(F_NOM) || !ENUtils.isLinkingRelativizer(comp.getWordForm()))
            return;

        sbar = relocateRelativeClause(tree, sbar);

        if (!sbar.hasParent())
            return;

        CTNode ante = findRelativizerAntecedent(curr, sbar);

        if (ante != null)
            comp.setAntecedent(ante);

        // Follow chains of empty categories until an overt antecedent (or nothing) is reached.
        ante = comp.getAntecedent();

        while (ante != null && ante.isEmptyCategoryTerminal())
            ante = ante.getFirstTerminal().getAntecedent();

        comp.setAntecedent(ante);
    }

    /**
     * Picks the clause whose siblings are searched for the antecedent.
     * For a co-indexed clause this is the {@code SBAR} parent of the first co-indexed empty category
     * whose word form starts with {@link CTTagEn#E_ICH}; for a clause that is not co-indexed and sits
     * under a {@code UCP}, it is that {@code UCP}. Otherwise the clause itself is returned.
     */
    private static CTNode relocateRelativeClause(CTTree tree, CTNode sbar)
    {
        if (sbar.getEmptyCategoryIndex() != -1)
        {
            List<CTNode> ecs = tree.getEmptyCategoryList(sbar.getEmptyCategoryIndex());

            if (ecs != null)
            {
                for (CTNode ec : ecs)
                {
                    if (ec.getWordForm().startsWith(E_ICH) && ec.getParent().isConstituentTag(C_SBAR))
                        return ec.getParent();
                }
            }
        }
        else if (sbar.hasParent() && sbar.getParent().isConstituentTag(C_UCP))
        {
            return sbar.getParent();
        }

        return sbar;
    }

    /**
     * Looks to the left of the clause for an antecedent whose kind depends on the clause's parent:
     * an {@code NP} sibling under {@code NP}, an {@code ADVP} sibling under {@code ADVP}, and a
     * {@code -PRD} sibling under {@code VP}. The {@code -PRD} sibling is accepted only if the clause
     * is a cleft ({@code -CLF}) or the sibling's phrase type corresponds to the wh-phrase type.
     * The clause must have a parent.
     * @return the antecedent candidate, or {@code null} if none qualifies.
     */
    private static CTNode findRelativizerAntecedent(CTNode whPhrase, CTNode sbar)
    {
        CTNode parent = sbar.getParent();

        if (parent.isConstituentTag(C_NP))
            return sbar.getLeftNearestSibling(M_NP);

        if (parent.isConstituentTag(C_ADVP))
            return sbar.getLeftNearestSibling(M_ADVP);

        if (parent.isConstituentTag(C_VP))
        {
            CTNode prd = sbar.getLeftNearestSibling(M_PRD);

            if (prd != null &&
                (sbar.hasFunctionTag(F_CLF) ||
                 (whPhrase.isConstituentTag(C_WHNP)   && prd.isConstituentTag(C_NP)) ||
                 (whPhrase.isConstituentTag(C_WHPP)   && prd.isConstituentTag(C_PP)) ||
                 (whPhrase.isConstituentTag(C_WHADVP) && prd.isConstituentTag(C_ADVP))))
                return prd;
        }

        return null;
    }

    /**
     * @return the first relativizer under the specific node if exists; otherwise, {@code null}.
     * The specific node must be a wh-phrase.
     * Terminals whose tag is a relativizer tag take precedence over terminals whose word form is a relativizer;
     * an empty-category wh-phrase yields its first terminal.
     */
    public static CTNode getRelativizer(CTNode node)
    {
        if (!isWhPhrase(node))
            return null;

        List<CTNode> terminals = node.getTerminalList();

        if (node.isEmptyCategoryTerminal())
            return terminals.get(0);

        // First pass: decide by part-of-speech tag.
        for (CTNode term : terminals)
        {
            if (POSLibEn.isRelativizer(term.getConstituentTag()))
                return term;
        }

        // Second pass: fall back to the word form.
        for (CTNode term : terminals)
        {
            if (ENUtils.isRelativizer(term.getWordForm()))
                return term;
        }

        return null;
    }

    /**
     * @return the specific node if it is accepted by {@link #M_WHx}, or else the first such node reached by
     * descending through only-children; {@code null} if none is found.
     */
    public static CTNode getWhPhrase(CTNode node)
    {
        return getNode(node, M_WHx, true);
    }

//  ======================== Coordination ========================

    /** @return {@code true} if the specific node contains coordination. */
    public static boolean containsCoordination(CTNode node)
    {
        return containsCoordination(node, node.getChildrenList());
    }

    /**
     * @return {@code true} if the specific list of children contains coordination:
     * the parent is a {@code UCP}, or the parent is an {@code NP|NML} ending in et cetera,
     * or any of the siblings is a conjunction.
     */
    public static boolean containsCoordination(CTNode parent, List<CTNode> siblings)
    {
        if (parent.isConstituentTag(C_UCP))
            return true;

        if (parent.matches(M_NP_NML) && containsEtc(siblings))
            return true;

        for (CTNode child : siblings)
        {
            if (isConjunction(child))
                return true;
        }

        return false;
    }

    /**
     * Called by {@link CTLibEn#containsCoordination(CTNode, List)}.
     * Scans from the last child leftwards, skipping punctuation, and tests only the first
     * non-punctuation child found. The child at index 0 is never examined.
     */
    private static boolean containsEtc(List<CTNode> children)
    {
        for (int i = children.size() - 1; i > 0; i--)
        {
            CTNode child = children.get(i);

            if (!POSLibEn.isPunctuation(child.getConstituentTag()))
                return isEtc(child);
        }

        return false;
    }

    /**
     * @return {@code true} if the specific node is et cetera (e.g., etc): it carries {@link CTTagEn#F_ETC}
     * or its first terminal has the word form {@code etc.} (case-insensitive).
     */
    public static boolean isEtc(CTNode node)
    {
        return node.hasFunctionTag(F_ETC) || node.getFirstTerminal().isWordFormIgnoreCase("etc.");
    }

    /**
     * @return {@code true} if this node is a conjunction or a separator.
     * @see CTLibEn#isConjunction(CTNode)
     * @see CTLibEn#isSeparator(CTNode)
     */
    public static boolean isCoordinator(CTNode node)
    {
        return isConjunction(node) || isSeparator(node);
    }

    /** @return {@code true} if this node is a conjunction ({@code CC|CONJP}). */
    public static boolean isConjunction(CTNode node)
    {
        return node.isConstituentTagAny(S_CONJUNCTION);
    }

    /** @return {@code true} if this node is a separator (comma or colon tag). */
    public static boolean isSeparator(CTNode node)
    {
        return node.isConstituentTagAny(S_SEPARATOR);
    }

    /**
     * @return {@code true} if this node is a correlative conjunction: a {@code CC} whose word form
     * {@link ENUtils#isCorrelativeConjunction(String)} accepts, or a {@code CONJP} spelling "not only".
     */
    public static boolean isCorrelativeConjunction(CTNode node)
    {
        if (node.isConstituentTag(POS_CC))
            return ENUtils.isCorrelativeConjunction(node.getWordForm());

        if (node.isConstituentTag(C_CONJP))
        {
            String form = StringUtils.toLowerCase(node.toWordForms(false, StringConst.SPACE));
            return form.equals("not only");
        }

        return false;
    }

//  ======================== Constituent ========================

    /** @return {@code true} if the specific node is a main clause or a subordinate clause. */
    public static boolean isClause(CTNode node)
    {
        return isMainClause(node) || isSubordinateClause(node);
    }

    /** @return {@code true} if "S|SQ|SINV". */
    public static boolean isMainClause(CTNode node)
    {
        return node.isConstituentTagAny(S_MAIN_CLAUSE);
    }

    /** @return {@code true} if the constituent tag of the specific node starts with {@code SBAR}. */
    public static boolean isSubordinateClause(CTNode node)
    {
        return node.getConstituentTag().startsWith(C_SBAR);
    }

    /** @return {@code true} if "NP|NML|NX|NAC". */
    public static boolean isNominalPhrase(CTNode node)
    {
        return node.isConstituentTagAny(S_NOMINAL_PHRASE);
    }

    /** @return {@code true} if "WHNP|WHPP|WHADVP". */
    public static boolean isWhPhraseLink(CTNode node)
    {
        return node.isConstituentTagAny(S_WH_LINK);
    }

    /** @return {@code true} if the specific node is accepted by {@link #M_WHx}. */
    public static boolean isWhPhrase(CTNode node)
    {
        return M_WHx.test(node);
    }

    /**
     * @return {@code true} if the specific node, or a node reached from it by descending through
     * only-children, is accepted by {@link #M_EDITED}.
     */
    public static boolean isEditedPhrase(CTNode node)
    {
        return getNode(node, M_EDITED, true) != null;
    }

    /**
     * @return {@code true} if the word form of the specific node starts with {@link CTTagEn#E_ICH},
     * {@link CTTagEn#E_PPA}, or {@link CTTagEn#E_RNR}.
     */
    public static boolean isDiscontinuousConstituent(CTNode node)
    {
        String form = node.getWordForm();
        return form.startsWith(E_ICH) || form.startsWith(E_PPA) || isRNR(node);
    }

    /** @return {@code true} if the word form of the specific node starts with {@link CTTagEn#E_RNR}. */
    public static boolean isRNR(CTNode node)
    {
        return node.getWordForm().startsWith(E_RNR);
    }

    /**
     * Returns the specific node if the matcher accepts it. Otherwise, when {@code recursive} is set,
     * keeps descending for as long as the current node has exactly one child.
     * @param node the node to start from.
     * @param matcher the condition a node must satisfy.
     * @param recursive if {@code true}, only-child descendants are also tried.
     * @return the first accepted node on that path, or {@code null} if there is none.
     */
    public static CTNode getNode(CTNode node, Predicate<CTNode> matcher, boolean recursive)
    {
        CTNode curr = node;

        while (true)
        {
            if (matcher.test(curr))
                return curr;

            if (!recursive || curr.getChildrenSize() != 1)
                return null;

            curr = curr.getFirstChild();
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy