// org.cogroo.entities.tree.OldStyleModel Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of cogroo-gc Show documentation
// Annotators specialized in grammar checking.
/**
* Copyright (C) 2012 cogroo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.entities.tree;
import java.util.ArrayList;
import java.util.List;
import org.cogroo.entities.Sentence;
import org.cogroo.entities.Token;
import org.cogroo.entities.impl.ChunkTag;
import org.cogroo.entities.impl.SyntacticTag;
import org.cogroo.tools.checker.rules.model.TagMask.ChunkFunction;
import org.cogroo.tools.checker.rules.model.TagMask.SyntacticFunction;
public class OldStyleModel {
private final static ChunkTag BOUNDARY_NOUN_PHRASE;
private final static ChunkTag BOUNDARY_NOUN_PHRASE_MAIN;
private final static ChunkTag BOUNDARY_VERB_PHRASE_MAIN;
private final static ChunkTag INTERMEDIARY_NOUN_PHRASE;
private final static ChunkTag INTERMEDIARY_NOUN_PHRASE_MAIN;
private final static ChunkTag INTERMEDIARY_VERB_PHRASE;
private final static ChunkTag OTHER;
private final static SyntacticTag SYNT_NONE;
private final static SyntacticTag SYNT_SUBJECT;
private final static SyntacticTag SYNT_VERB;
static {
BOUNDARY_NOUN_PHRASE = new ChunkTag();
BOUNDARY_NOUN_PHRASE.setChunkFunction(ChunkFunction.BOUNDARY_NOUN_PHRASE);
BOUNDARY_NOUN_PHRASE_MAIN = new ChunkTag();
BOUNDARY_NOUN_PHRASE_MAIN
.setChunkFunction(ChunkFunction.BOUNDARY_NOUN_PHRASE_MAIN);
BOUNDARY_VERB_PHRASE_MAIN = new ChunkTag();
BOUNDARY_VERB_PHRASE_MAIN
.setChunkFunction(ChunkFunction.BOUNDARY_VERB_PHRASE_MAIN);
INTERMEDIARY_NOUN_PHRASE = new ChunkTag();
INTERMEDIARY_NOUN_PHRASE
.setChunkFunction(ChunkFunction.INTERMEDIARY_NOUN_PHRASE);
INTERMEDIARY_NOUN_PHRASE_MAIN = new ChunkTag();
INTERMEDIARY_NOUN_PHRASE_MAIN
.setChunkFunction(ChunkFunction.INTERMEDIARY_NOUN_PHRASE_MAIN);
INTERMEDIARY_VERB_PHRASE = new ChunkTag();
INTERMEDIARY_VERB_PHRASE
.setChunkFunction(ChunkFunction.INTERMEDIARY_VERB_PHRASE);
OTHER = new ChunkTag();
OTHER.setChunkFunction(ChunkFunction.OTHER);
SYNT_NONE = new SyntacticTag();
SYNT_NONE.setSyntacticFunction(SyntacticFunction.NONE);
SYNT_SUBJECT = new SyntacticTag();
SYNT_SUBJECT.setSyntacticFunction(SyntacticFunction.SUBJECT);
SYNT_VERB = new SyntacticTag();
SYNT_VERB.setSyntacticFunction(SyntacticFunction.VERB);
}
public static Node createTree(Sentence sent) {
List tokens = sent.getTokens();
List> tokenClusters = new ArrayList>();
for (Token token : tokens) {
if (isOtherPhrase(token) || isBoundary(token)) {
addNewCluster(tokenClusters, token);
} else if (isContinuation(tokenClusters, token)) {
merge(tokenClusters, token);
}
}
Node root = new Node();
root.setLevel(0);
root.setSyntacticTag("S");
for (List cluster : tokenClusters) {
String syntTag = syntactTagForCluster(cluster);
if (syntTag != null) {
addSyntNode(cluster, syntTag, root);
} else {
addPhraseNode(cluster, root);
}
}
return root;
}
private static void addPhraseNode(List cluster, Node parent) {
String tag = phraseTagForCluster(cluster);
if (tag != null) {
addPhraseNode(cluster, tag, parent);
} else {
addLeafs(cluster, parent);
}
}
private static void addLeafs(List cluster, Node parent) {
for (Token token : cluster) {
Leaf n = new Leaf();
n.setLevel(parent.getLevel() + 1);
n.setMorphologicalTag(token.getMorphologicalTag().getClazzE().toString());
n.setLexeme(token.getLexeme());
n.setLemma(token.getPrimitive());
parent.addElement(n);
}
}
private static void addPhraseNode(List cluster, String tag, Node parent) {
Node n = new Node();
n.setLevel(parent.getLevel() + 1);
n.setSyntacticTag(tag);
parent.addElement(n);
addLeafs(cluster, n);
}
private static String phraseTagForCluster(List cluster) {
for (Token token : cluster) {
if (isBoundaryOfNounPhrase(token) || isIntermediaryNounPhrase(token)) {
return "NP";
} else if (isBoundaryOfVerbPhrase(token)
|| isIntermediaryVerbPhrase(token)) {
return "VP";
}
}
return null;
}
private static void addSyntNode(List cluster, String syntTag,
Node parent) {
Node n = new Node();
n.setLevel(parent.getLevel() + 1);
n.setSyntacticTag(syntTag);
parent.addElement(n);
addPhraseNode(cluster, n);
}
private static String syntactTagForCluster(List cluster) {
for (Token token : cluster) {
if (token.getSyntacticTag().match(SYNT_SUBJECT)) {
return "SUBJ";
} else if (token.getSyntacticTag().match(SYNT_VERB)) {
return "VERB";
}
}
return null;
}
private static void merge(List> tokenClusters, Token token) {
tokenClusters.get(tokenClusters.size() - 1).add(token);
}
private static Token getLastTokenOfCluster(List> tokenClusters) {
if (tokenClusters.size() > 0) {
List tokenCluster = tokenClusters.get(tokenClusters.size() - 1);
return tokenCluster.get(tokenCluster.size() - 1); // never empty
}
return null;
}
private static boolean isContinuation(List> tokenClusters,
Token token) {
Token lastToken = getLastTokenOfCluster(tokenClusters);
if (lastToken == null) {
return false;
}
if (isBoundaryOfNounPhrase(lastToken)
|| isIntermediaryNounPhrase(lastToken)) {
if (isIntermediaryNounPhrase(token)) {
return true;
} else {
return false;
}
} else if (isBoundaryOfVerbPhrase(lastToken)
|| isIntermediaryVerbPhrase(lastToken)) {
if (isIntermediaryVerbPhrase(token)) {
return true;
} else {
return false;
}
}
return false;
}
private static void addNewCluster(List> tokenClusters, Token token) {
List other = new ArrayList();
other.add(token);
tokenClusters.add(other);
}
private static boolean isBoundary(Token token) {
return token.getChunkTag().match(BOUNDARY_NOUN_PHRASE)
|| token.getChunkTag().match(BOUNDARY_NOUN_PHRASE_MAIN)
|| token.getChunkTag().match(BOUNDARY_VERB_PHRASE_MAIN);
}
private static boolean isBoundaryOfNounPhrase(Token token) {
return token.getChunkTag().match(BOUNDARY_NOUN_PHRASE)
|| token.getChunkTag().match(BOUNDARY_NOUN_PHRASE_MAIN);
}
private static boolean isBoundaryOfVerbPhrase(Token token) {
return token.getChunkTag().match(BOUNDARY_VERB_PHRASE_MAIN);
}
private static boolean isIntermediaryNounPhrase(Token token) {
return token.getChunkTag().match(INTERMEDIARY_NOUN_PHRASE)
|| token.getChunkTag().match(INTERMEDIARY_NOUN_PHRASE_MAIN);
}
private static boolean isIntermediaryVerbPhrase(Token token) {
return token.getChunkTag().match(INTERMEDIARY_VERB_PHRASE);
}
private static boolean isOtherPhrase(Token token) {
return token.getChunkTag().match(OTHER);
}
}