org.cogroo.analyzer.ContractionFinder Maven / Gradle / Ivy
An API with pipes and annotators for NLP
/**
* Copyright (C) 2012 cogroo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.analyzer;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.util.Span;
import org.apache.log4j.Logger;
import org.cogroo.ContractionUtility;
import org.cogroo.config.Analyzers;
import org.cogroo.text.Document;
import org.cogroo.text.Sentence;
import org.cogroo.text.Token;
import org.cogroo.text.impl.TokenImpl;
import org.cogroo.util.TextUtils;
/**
 * The ContractionFinder class searches for contractions in a given
 * sentence and then expands them to their primitive form.
 */
public class ContractionFinder implements Analyzer {

  private NameFinderME contractionFinder;

  protected static final Logger LOGGER = Logger.getLogger(ContractionFinder.class);

  public ContractionFinder(NameFinderME contractionFinder) {
    this.contractionFinder = contractionFinder;
  }
  public void analyze(Document document) {
    List<Sentence> sentences = document.getSentences();

    for (Sentence sentence : sentences) {
      Span[] contractionsSpan;

      // NameFinderME is not thread safe, so access to the shared model is synchronized.
      synchronized (this.contractionFinder) {
        contractionsSpan = contractionFinder.find(TextUtils
            .tokensToString(sentence.getTokens()));
      }

      List<Token> newTokens = sentence.getTokens();

      // Walk the spans backwards so that replacing a token does not shift
      // the indices of the spans still to be processed.
      for (int i = contractionsSpan.length - 1; i >= 0; i--) {
        int start = contractionsSpan[i].getStart();

        String lexeme = sentence.getTokens().get(start).getLexeme();
        String[] contractions = ContractionUtility.expand(lexeme);

        Token original = newTokens.remove(start);
        if (contractions != null) {
          // Replace the contraction with its expanded parts, tagging each new
          // token as Begin (B), Inside (I) or End (E) of the expansion.
          for (int j = contractions.length - 1; j >= 0; j--) {
            Token token = new TokenImpl(original.getStart(), original.getEnd(), contractions[j]);
            newTokens.add(start, token);

            String caze = null;
            if (j == 0)
              caze = "B";
            else if (j == contractions.length - 1)
              caze = "E";
            else
              caze = "I";
            token.addContext(Analyzers.CONTRACTION_FINDER, caze);
          }
        } else {
          LOGGER.debug("Missing contraction: " + lexeme);
        }
      }
      sentence.setTokens(newTokens);
    }
  }
}
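For context, this analyzer is normally exercised through the cogroo-ann pipeline rather than on its own, since analyze() expects sentences that have already been detected and tokenized. The sketch below is illustrative only: the ComponentFactory, createPipe and DocumentImpl names, the pt-BR locale, and the Maven coordinates in the comment are assumptions about the cogroo-ann distribution and may differ between versions.

// Illustrative usage sketch (assumed API; verify against your cogroo-ann version).
// Assumed Maven coordinates, version left as a placeholder: org.cogroo:cogroo-ann:<version>
import java.util.Locale;

import org.cogroo.analyzer.Analyzer;
import org.cogroo.analyzer.ComponentFactory;
import org.cogroo.text.Document;
import org.cogroo.text.Sentence;
import org.cogroo.text.Token;
import org.cogroo.text.impl.DocumentImpl;

public class ContractionFinderExample {

  public static void main(String[] args) throws Exception {
    // Assumed API: ComponentFactory builds the default pt-BR pipeline, which runs
    // a sentence detector and a tokenizer before the ContractionFinder step.
    ComponentFactory factory = ComponentFactory.create(new Locale("pt", "BR"));
    Analyzer pipe = factory.createPipe();

    Document document = new DocumentImpl();
    document.setText("Fomos ao mercado."); // "ao" is the contraction of "a" + "o"
    pipe.analyze(document);

    // After analysis the contraction token has been replaced by its expanded
    // parts, each carrying the B/I/E context set by ContractionFinder.
    for (Sentence sentence : document.getSentences()) {
      for (Token token : sentence.getTokens()) {
        System.out.println(token.getLexeme());
      }
    }
  }
}

If the factory API is not available in your setup, the class can also be constructed directly with a NameFinderME loaded from the corresponding contraction model and invoked via analyze(document), provided the document already carries tokenized sentences.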