org.cogroo.checker.GrammarChecker Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cogroo-gc Show documentation
Show all versions of cogroo-gc Show documentation
Annotators specialized in grammar checking.
/**
* Copyright (C) 2012 cogroo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.checker;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import org.apache.log4j.Logger;
import org.cogroo.analyzer.Analyzer;
import org.cogroo.analyzer.ComponentFactory;
import org.cogroo.entities.Mistake;
import org.cogroo.interpreters.FlorestaTagInterpreter;
import org.cogroo.text.Document;
import org.cogroo.text.Sentence;
import org.cogroo.tools.checker.Checker;
import org.cogroo.tools.checker.CheckerComposite;
import org.cogroo.tools.checker.RuleDefinition;
import org.cogroo.tools.checker.SentenceAdapter;
import org.cogroo.tools.checker.TypedChecker;
import org.cogroo.tools.checker.TypedCheckerComposite;
import org.cogroo.tools.checker.checkers.GovernmentChecker;
import org.cogroo.tools.checker.checkers.ParonymChecker;
import org.cogroo.tools.checker.checkers.PunctuationChecker;
import org.cogroo.tools.checker.checkers.RepetitionChecker;
import org.cogroo.tools.checker.checkers.SpaceChecker;
import org.cogroo.tools.checker.rules.applier.RulesApplier;
import org.cogroo.tools.checker.rules.applier.RulesProvider;
import org.cogroo.tools.checker.rules.applier.RulesTreesAccess;
import org.cogroo.tools.checker.rules.applier.RulesTreesBuilder;
import org.cogroo.tools.checker.rules.applier.RulesTreesFromScratchAccess;
import org.cogroo.tools.checker.rules.applier.RulesTreesProvider;
import org.cogroo.tools.checker.rules.applier.RulesXmlAccess;
import org.cogroo.tools.checker.rules.dictionary.FSALexicalDictionary;
import org.cogroo.tools.checker.rules.dictionary.TagDictionary;
import org.cogroo.tools.checker.rules.model.Example;
import org.cogroo.tools.checker.rules.util.MistakeComparator;
import org.cogroo.tools.checker.rules.validator.RulePostValidatorProvider;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.InvalidFormatException;
/**
 * {@link CheckAnalyzer} that runs an {@link Analyzer} pipe over a document,
 * applies a fixed set of checkers to every sentence and stores the resulting
 * {@link Mistake}s in the document.
 */
public class GrammarChecker implements CheckAnalyzer {

  private static final Logger LOGGER = Logger.getLogger(GrammarChecker.class);

  private static final MistakeComparator MISTAKE_COMPARATOR = new MistakeComparator();

  /** Classpath location of the abbreviation dictionary used by {@link SpaceChecker}. */
  private static final String ABBREVIATION_DICTIONARY = "/dictionaries/pt_br/abbr.xml";

  /** Checkers that work directly on {@link Sentence}. */
  private final CheckerComposite checkers;

  private final TagDictionary td;

  /** When {@code false}, mistakes that share text positions with an earlier mistake are dropped. */
  private final boolean allowOverlap;

  /** Converts a {@link Sentence} into the legacy {@code org.cogroo.entities.Sentence}. */
  private final SentenceAdapter sentenceAdapter;

  /** Checkers that work on the legacy typed sentence. */
  private final TypedCheckerComposite typedCheckers;

  private final Analyzer pipe;

  private final RulePostValidatorProvider validator = new RulePostValidatorProvider();

  /**
   * Creates an analyzer that will call the available checkers, with
   * overlapping mistakes filtered out and all XML rules active. Today the
   * checker set is hard coded, but it is in the TODO list that we should make
   * it more flexible, specially because of other languages.
   *
   * <p>
   * There are two kinds of checkers. {@link TypedChecker}s use the classes
   * generated from the XSD (package checker.rules.model) and should be
   * deprecated in the future. They are:
   * <ul>
   * <li>{@link RulesApplier} (rules from XML file)</li>
   * <li>{@link SpaceChecker}</li>
   * <li>{@link PunctuationChecker}</li>
   * <li>{@link RepetitionChecker}</li>
   * </ul>
   * The remaining checkers work on {@link Sentence} directly:
   * <ul>
   * <li>{@link GovernmentChecker} (beta)</li>
   * <li>{@link ParonymChecker}</li>
   * </ul>
   *
   * <p>
   * Also we should have checkers that deal with basic Document structure, but
   * today we don't have any yet.
   *
   * @param pipe
   *          the analyzer run on each document before the checkers
   * @throws IllegalArgumentException
   * @throws IOException
   */
  public GrammarChecker(Analyzer pipe) throws IllegalArgumentException,
      IOException {
    this(pipe, false, null);
  }

  /**
   * Creates an analyzer with the full checker set.
   *
   * @param pipe
   *          the analyzer run on each document before the checkers; it is
   *          also handed to the {@link ParonymChecker}
   * @param allowOverlap
   *          if {@code false}, overlapping mistakes are filtered in
   *          {@link #analyze(CheckDocument, boolean)}
   * @param activeXmlRules
   *          passed as-is to {@link RulesTreesBuilder}; the one-argument
   *          constructor passes {@code null}
   * @throws IllegalArgumentException
   * @throws IOException
   *           if a resource, such as the abbreviation dictionary, cannot be
   *           loaded
   */
  public GrammarChecker(Analyzer pipe, boolean allowOverlap,
      long[] activeXmlRules) throws IllegalArgumentException, IOException {
    this.pipe = pipe;
    this.allowOverlap = allowOverlap;

    // today we load the tag dictionary this way, but in the future it should
    // be shared between the rules and the models.
    this.td = createTagDictionary();
    this.sentenceAdapter = new SentenceAdapter(this.td);

    // typed checkers: the XML rules applier first, then the hard coded ones
    List<TypedChecker> typedCheckersList = new ArrayList<TypedChecker>();
    typedCheckersList.add(createRulesApplierChecker(activeXmlRules));
    // TODO: how to get the abbreviation dictionary?
    typedCheckersList.add(new SpaceChecker(loadAbbDict()));
    typedCheckersList.add(new PunctuationChecker());
    typedCheckersList.add(new RepetitionChecker());
    this.typedCheckers = new TypedCheckerComposite(typedCheckersList, false);

    // all non typed checkers are added to this list
    List<Checker> checkerList = new ArrayList<Checker>();
    checkerList.add(new GovernmentChecker());
    checkerList.add(new ParonymChecker(this.pipe));
    this.checkers = new CheckerComposite(checkerList, false);

    logCreatedRules();
  }

  /**
   * Creates an analyzer whose only checker is a {@link RulesApplier} built
   * from the given serialized rule. No other typed or untyped checker is
   * registered, and overlapping mistakes are filtered.
   *
   * @param pipe
   *          the analyzer run on each document before the checkers
   * @param serializedRule
   *          the rule, handed to {@code RulesXmlAccess.getInstance(String)}
   * @throws IllegalArgumentException
   * @throws IOException
   */
  public GrammarChecker(Analyzer pipe, String serializedRule)
      throws IllegalArgumentException, IOException {
    this.pipe = pipe;
    this.allowOverlap = false;

    // today we load the tag dictionary this way, but in the future it should
    // be shared between the rules and the models.
    this.td = createTagDictionary();
    this.sentenceAdapter = new SentenceAdapter(this.td);

    List<TypedChecker> typedCheckersList = new ArrayList<TypedChecker>();
    typedCheckersList.add(createSingletonRuleChecker(serializedRule));
    this.typedCheckers = new TypedCheckerComposite(typedCheckersList, false);

    // no untyped checker in this configuration
    this.checkers = new CheckerComposite(new ArrayList<Checker>(), false);

    logCreatedRules();
  }

  /** Builds the tag dictionary shared by the sentence adapter and the rules appliers. */
  private static TagDictionary createTagDictionary() throws IOException {
    return new TagDictionary(new FSALexicalDictionary(), false,
        new FlorestaTagInterpreter());
  }

  /** Writes the id of every registered rule to the debug log, typed checkers first. */
  private void logCreatedRules() {
    if (!LOGGER.isDebugEnabled()) {
      return;
    }
    LOGGER.debug("Created following rules:");
    int count = 0;
    for (RuleDefinition def : this.typedCheckers.getRulesDefinition()) {
      LOGGER.debug(count++ + ": " + def.getId());
    }
    for (RuleDefinition def : this.checkers.getRulesDefinition()) {
      LOGGER.debug(count++ + ": " + def.getId());
    }
  }

  /**
   * @return a new set holding the rule definitions of both the typed and the
   *         untyped checkers
   */
  public Set<RuleDefinition> getRuleDefinitions() {
    Set<RuleDefinition> ruleDefinitions = new HashSet<RuleDefinition>();
    ruleDefinitions.addAll(this.typedCheckers.getRulesDefinition());
    ruleDefinitions.addAll(this.checkers.getRulesDefinition());
    return ruleDefinitions;
  }

  /**
   * Loads the abbreviation dictionary from the classpath.
   *
   * @return the dictionary read from {@value #ABBREVIATION_DICTIONARY}
   * @throws IOException
   *           if the resource is missing or cannot be read
   */
  private Dictionary loadAbbDict() throws InvalidFormatException, IOException {
    java.io.InputStream in = this.getClass().getResourceAsStream(
        ABBREVIATION_DICTIONARY);
    if (in == null) {
      // getResourceAsStream signals a missing resource with null
      throw new IOException("Abbreviation dictionary not found in classpath: "
          + ABBREVIATION_DICTIONARY);
    }
    try {
      return new Dictionary(in);
    } finally {
      in.close();
    }
  }

  /** Creates the applier for the XML rules obtained from the default {@link RulesXmlAccess}. */
  private TypedChecker createRulesApplierChecker(long[] activeRules) {
    RulesProvider xmlProvider = new RulesProvider(
        RulesXmlAccess.getInstance(), false);
    RulesTreesBuilder rtb = new RulesTreesBuilder(xmlProvider, activeRules);
    RulesTreesAccess rta = new RulesTreesFromScratchAccess(rtb);
    RulesTreesProvider rtp = new RulesTreesProvider(rta, false);
    return new RulesApplier(rtp, td);
  }

  /** Creates the applier for a single rule given in its serialized form. */
  private TypedChecker createSingletonRuleChecker(String serializedRule) {
    RulesProvider xmlProvider = new RulesProvider(
        RulesXmlAccess.getInstance(serializedRule), true);
    RulesTreesBuilder rtb = new RulesTreesBuilder(xmlProvider, null);
    RulesTreesAccess rta = new RulesTreesFromScratchAccess(rtb);
    RulesTreesProvider rtp = new RulesTreesProvider(rta, false);
    return new RulesApplier(rtp, td);
  }

  /**
   * Analyzes the document with the pipe, runs every checker on every
   * sentence and stores the outcome in the document: the legacy typed
   * sentences via {@code setSentencesLegacy} and the mistakes via
   * {@code setMistakes}.
   *
   * <p>
   * The collected mistakes are sorted with {@link MistakeComparator}, then
   * the ones rejected by the post validator are removed, then (unless overlap
   * is allowed) the overlapping ones.
   *
   * @param document
   *          the document to check; it is modified
   * @param filterInvalidSuggestions
   *          if {@code true}, every suggestion is re-checked by applying it
   *          to the text and analyzing the result; this re-runs the whole
   *          analysis once per suggestion
   */
  public void analyze(CheckDocument document, boolean filterInvalidSuggestions) {
    pipe.analyze(document);

    List<Mistake> mistakes = new ArrayList<Mistake>();
    List<Sentence> sentences = document.getSentences();
    List<org.cogroo.entities.Sentence> typedSentences =
        new ArrayList<org.cogroo.entities.Sentence>(sentences.size());

    for (Sentence sentence : sentences) {
      mistakes.addAll(this.checkers.check(sentence));

      org.cogroo.entities.Sentence typedSentence = this.sentenceAdapter
          .asTypedSentence(sentence, document.getText());
      typedSentences.add(typedSentence);
      mistakes.addAll(this.typedCheckers.check(typedSentence));
    }
    document.setSentencesLegacy(typedSentences);

    // sort before filtering: filterOverlap keeps the first mistake in list order
    Collections.sort(mistakes, MISTAKE_COMPARATOR);
    mistakes = filterInvalid(document, mistakes);
    if (!this.allowOverlap) {
      mistakes = filterOverlap(document, mistakes);
    }
    if (filterInvalidSuggestions) {
      filterWrongSuggestions(document, mistakes);
    }
    document.setMistakes(mistakes);
  }

  /**
   * Analyzes the document with suggestion filtering enabled.
   *
   * @see #analyze(CheckDocument, boolean)
   */
  @Override
  public void analyze(CheckDocument document) {
    this.analyze(document, true);
  }

  /** Returns, in the same order, the mistakes accepted by the post validator. */
  private List<Mistake> filterInvalid(CheckDocument document,
      List<Mistake> mistakes) {
    List<Mistake> filtered = new ArrayList<Mistake>();
    for (Mistake mistake : mistakes) {
      if (validator.isValid(mistake, document)) {
        filtered.add(mistake);
      }
    }
    return filtered;
  }

  /**
   * Returns the mistakes that do not share any text position with a mistake
   * kept before them. Mistakes are visited in list order, so the earlier one
   * wins.
   */
  private List<Mistake> filterOverlap(Document doc, List<Mistake> mistakes) {
    // one flag per character of the text: true once a kept mistake covers it
    boolean[] occupied = new boolean[doc.getText().length()];
    List<Mistake> mistakesNoOverlap = new ArrayList<Mistake>();

    for (Mistake mistake : mistakes) {
      boolean overlap = false;
      for (int i = mistake.getStart(); i < mistake.getEnd(); i++) {
        if (occupied[i]) {
          overlap = true;
          break; // one occupied position is enough
        }
      }
      if (!overlap) {
        for (int i = mistake.getStart(); i < mistake.getEnd(); i++) {
          occupied[i] = true;
        }
        mistakesNoOverlap.add(mistake);
      }
    }
    return mistakesNoOverlap;
  }

  /**
   * Re-checks every suggestion of every mistake: the suggestion replaces the
   * mistake span in the document text and the result is analyzed again. If at
   * least one suggestion yields a text without mistakes, the mistake keeps
   * only such suggestions; if none does, its suggestions are left untouched.
   */
  private void filterWrongSuggestions(Document document,
      List<Mistake> mistakes) {
    String documentText = document.getText();

    for (Mistake mistake : mistakes) {
      String prefix = documentText.substring(0, mistake.getStart());
      String suffix = documentText.substring(mistake.getEnd());
      List<String> rightSuggestions = new ArrayList<String>();

      for (String suggestion : mistake.getSuggestions()) {
        String alternativeText = prefix + suggestion + suffix;
        CheckDocument alternative = new CheckDocument(alternativeText);
        // false: do not filter the suggestions of the alternative (no recursion)
        this.analyze(alternative, false);

        if (alternative.getMistakes().size() == 0) {
          if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("\n****** Filtering suggestions **********: "
                + alternativeText + " (OK!)\n");
          }
          rightSuggestions.add(suggestion);
        } else {
          if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("\n****** Filtering suggestions **********: "
                + alternativeText + " (WRONG!)\n");
          }
        }
      }

      // replace the suggestions once per mistake, after all were evaluated
      if (!rightSuggestions.isEmpty()) {
        mistake.setSuggestions(rightSuggestions
            .toArray(new String[rightSuggestions.size()]));
      }
    }
  }

  /**
   * Asks the untyped checkers to ignore the rule with the given identifier.
   * NOTE(review): the typed checkers (XML rules among them) are not informed
   * here - confirm this is intended.
   *
   * @param ruleIdentifier
   *          identifier of the rule to ignore
   */
  public void ignoreRule(String ruleIdentifier) {
    this.checkers.ignore(ruleIdentifier);
  }

  /** Clears the ignored rules of the untyped checkers. */
  public void resetIgnoredRules() {
    this.checkers.resetIgnored();
  }

  /** Prints the incorrect text of every example of every rule to standard output. */
  private static void printExamples(List<RuleDefinition> rulesDefinition) {
    for (RuleDefinition def : rulesDefinition) {
      for (Example ex : def.getExamples()) {
        System.out.println(ex.getIncorrect());
      }
    }
  }

  /**
   * Interactive console: reads sentences from standard input and prints the
   * checked document. {@code q} quits, {@code 0} checks a default sentence
   * and {@code 1} prints the examples of the registered rules. End of input
   * also quits.
   *
   * @param args
   *          exactly one argument is required. NOTE(review): its value is not
   *          read; the pipe is always created for the pt-BR locale.
   * @throws IOException
   * @throws IllegalArgumentException
   */
  public static void main(String[] args) throws IllegalArgumentException,
      IOException {
    long start = System.nanoTime();

    if (args.length != 1) {
      System.err.println("Language is missing! usage: CLI pt_br");
      return;
    }

    ComponentFactory factory = ComponentFactory.create(new Locale("pt",
        "BR"));
    GrammarChecker cogroo = new GrammarChecker(factory.createPipe());

    System.out.println("Loading time ["
        + ((System.nanoTime() - start) / 1000000) + "ms]");

    Scanner kb = new Scanner(System.in);
    try {
      System.out
          .print("Enter the sentence, q to quit, 0 for the default, or 1 to print the examples: ");
      // hasNextLine() ends the loop on EOF instead of failing in nextLine()
      while (kb.hasNextLine()) {
        String input = kb.nextLine();
        if (input.equals("q")) {
          break;
        }

        if (input.equals("1")) {
          // a command, not a sentence: print the examples and do not check "1"
          printExamples(new ArrayList<RuleDefinition>(
              cogroo.getRuleDefinitions()));
        } else {
          if (input.equals("0")) {
            input = "Foi ferido por uma balas perdidas.";
          }
          CheckDocument document = new CheckDocument();
          document.setText(input);
          cogroo.analyze(document);
          System.out.println(document);
        }

        System.out.print("Enter the sentence: ");
      }
    } finally {
      kb.close();
    }
  }
}