eu.project.ttc.engines.RegexSpotter Maven / Gradle / Ivy
A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment
/*******************************************************************************
* Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.engines;
import java.util.Collection;
import java.util.Iterator;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import eu.project.ttc.resources.OccurrenceFilter;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.resources.TrueFilter;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.TermOccAnnotation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
import eu.project.ttc.utils.OccurrenceBuffer;
import eu.project.ttc.utils.TermSuiteConstants;
import eu.project.ttc.utils.TermSuiteUtils;
import eu.project.ttc.utils.TermUtils;
import fr.univnantes.lina.uima.tkregex.LabelledAnnotation;
import fr.univnantes.lina.uima.tkregex.RegexOccurrence;
import fr.univnantes.lina.uima.tkregex.ae.TokenRegexAE;
import uima.sandbox.filter.resources.FilterResource;
/**
*
* Adds all token regex occurrences to the CAS and to the term index.
*
* @author Damien Cram
*
*/
public class RegexSpotter extends TokenRegexAE {
private static final Logger LOGGER = LoggerFactory.getLogger(RegexSpotter.class);
public static final String POST_PROCESSING_STRATEGY = "PostProcessingStrategy";
@ConfigurationParameter(name = POST_PROCESSING_STRATEGY, mandatory = false, defaultValue = OccurrenceBuffer.NO_CLEANING)
private String postProcessingStrategy;
public static final String LOG_OVERLAPPING_RULES = "LogOverlappingRules";
@ConfigurationParameter(name = LOG_OVERLAPPING_RULES, mandatory = false, defaultValue = "false")
private boolean logOverlappingRules;
public static final String CONTEXTUALIZE = "Contextualize";
@ConfigurationParameter(name = CONTEXTUALIZE, mandatory = false, defaultValue = "false")
private boolean contextualize;
public static final String KEEP_OCCURRENCES_IN_TERM_INDEX = "KeepOccurrencesInTermIndex";
@ConfigurationParameter(name = KEEP_OCCURRENCES_IN_TERM_INDEX, mandatory = false, defaultValue = "true")
private boolean keepOccurrencesInTermIndex;
public static final String PARAM_ADD_TO_TERM_INDEX = "AddToTermIndex";
@ConfigurationParameter(name = PARAM_ADD_TO_TERM_INDEX, mandatory = false, defaultValue = "true")
private boolean addToTermIndex;
public static final String CHARACTER_FOOTPRINT_TERM_FILTER = "CharacterFootprintTermFilter";
@ExternalResource(key = CHARACTER_FOOTPRINT_TERM_FILTER, mandatory = false)
private OccurrenceFilter termFilter = TrueFilter.INSTANCE;
public static final String STOP_WORD_FILTER = "StopWordFilter";
@ExternalResource(key = STOP_WORD_FILTER, mandatory = true)
private FilterResource stopWordFilter;
@ExternalResource(key = TermIndexResource.TERM_INDEX, mandatory = false)
private TermIndexResource termIndexResource;
private String currentFileURI;
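	/**
	 * Checks at initialization time that a TermIndex resource is bound whenever
	 * the engine is configured to add the spotted terms to the term index.
	 */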
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
Preconditions.checkState(
!(addToTermIndex && termIndexResource == null),
"{} is configured to store terms in TermIndex but no TermIndex was passed",
this.getClass().getName());
}
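	/**
	 * Resolves the URI of the document being processed and resets the
	 * occurrence buffer before the token regex rules are applied.
	 */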
@Override
protected void beforeRuleProcessing(JCas jCas) {
Optional<SourceDocumentInformation> sdi = JCasUtils.getSourceDocumentAnnotation(jCas);
this.currentFileURI = sdi.isPresent() ? sdi.get().getUri() : "(no source uri given)";
this.occurrenceBuffer = new OccurrenceBuffer(this.postProcessingStrategy);
}
private OccurrenceBuffer occurrenceBuffer;
private int addedOccurrences = 0;
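	/**
	 * Called for each rule match. Discards occurrences rejected by the character
	 * footprint filter and single-word occurrences whose form or lemma is a stop
	 * word, then buffers the remaining occurrences for post-processing.
	 */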
@Override
public void ruleMatched(JCas jCas, RegexOccurrence occurrence) {
/*
* Do not keep the term if it has too many bad characters
*/
if(!termFilter.accept(occurrence))
return;
/*
* Do not keep the term if it is a stop word
*/
WordAnnotation wa = (WordAnnotation)occurrence.getLabelledAnnotations().get(0).getAnnotation();
if(occurrence.size() == 1 && stopWordFilter.getFilters().contains(wa.getCoveredText().toLowerCase()))
return;
if(occurrence.size() == 1 && wa.getLemma() != null && stopWordFilter.getFilters().contains(wa.getLemma().toLowerCase()))
return;
/*
* Add the occurrence to the buffer. It will be added to the CAS if it is not filtered out by the post-processing strategy.
*/
this.occurrenceBuffer.bufferize(occurrence);
}
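	/**
	 * Flushes the buffered occurrences to the CAS once the underlying token
	 * regex matcher reports that all rules have failed.
	 */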
@Override
protected void allRulesFailed(JCas jCas) {
flushOccurrenceBuffer(jCas);
}
/*
 * Reports overlapping rule matches if requested, applies the configured
 * post-processing strategy to the buffer, then adds the remaining
 * occurrences to the CAS and empties the buffer.
 */
private void flushOccurrenceBuffer(JCas jCas) {
/*
* Log a warning for each pair of rules that matched overlapping occurrences
*/
if(logOverlappingRules) {
for(Collection<RegexOccurrence> doublons:this.occurrenceBuffer.findDuplicates()) {
Iterator<RegexOccurrence> it = doublons.iterator();
RegexOccurrence base = it.next();
while(it.hasNext()) {
RegexOccurrence occ = it.next();
LOGGER.warn("Rules {} and {} overlap on occurrence [{},{}] \"{}\"",
base.getRule().getName(),
occ.getRule().getName(),
occ.getBegin(),
occ.getEnd(),
TermUtils.collapseText(jCas.getDocumentText().substring(occ.getBegin(), occ.getEnd()))
);
}
}
}
this.occurrenceBuffer.cleanBuffer();
for(RegexOccurrence occ:this.occurrenceBuffer)
addOccurrenceToCas(jCas, occ);
this.occurrenceBuffer.clear();
}
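	/**
	 * Turns a rule occurrence into a TermOccAnnotation: the matched labels are
	 * collected into a pattern array, the matched WordAnnotations into a word
	 * array, and their lemmas are concatenated into the term lemma.
	 */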
private void addOccurrenceToCas(JCas jCas, RegexOccurrence occurrence) {
TermOccAnnotation annotation = (TermOccAnnotation) jCas
.getCas().createAnnotation(
jCas.getCasType(TermOccAnnotation.type),
occurrence.getBegin(),
occurrence.getEnd());
StringArray patternFeature = new StringArray(jCas, occurrence.size());
FSArray innerWords = new FSArray(jCas, occurrence.size());
StringBuilder termLemma = new StringBuilder();
int i = 0;
for (LabelledAnnotation la:occurrence.getLabelledAnnotations()) {
patternFeature.set(i, la.getLabel());
WordAnnotation word = (WordAnnotation) la.getAnnotation();
termLemma.append(word.getLemma());
			if(i < occurrence.size() - 1) // reconstructed: the listing is truncated from this statement on
				termLemma.append(TermSuiteConstants.WHITESPACE);
			innerWords.set(i, word);
			i++;
		}
		// The truncated remainder sets the collected pattern, words and lemma on the
		// annotation, adds it to the CAS indexes and, when AddToTermIndex is enabled,
		// registers the occurrence in the shared term index.
	}
}
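For reference, here is a minimal uimaFIT sketch of how this engine might be configured. The class and method names in the sketch are illustrative, not part of termsuite-core. AddToTermIndex is disabled so that no TermIndex resource is needed; the mandatory StopWordFilter resource and the token regex rules expected by the TokenRegexAE base class would still have to be bound to the description before the engine can run.

import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.resource.ResourceInitializationException;
import eu.project.ttc.engines.RegexSpotter;
import eu.project.ttc.utils.OccurrenceBuffer;

public class RegexSpotterExample {

	// Builds a description for RegexSpotter with its declared configuration
	// parameters. AddToTermIndex is disabled here so that no TermIndex
	// resource is required; the StopWordFilter binding and the rule resource
	// of the TokenRegexAE base class are intentionally left out of this sketch.
	public static AnalysisEngineDescription spotterDescription()
			throws ResourceInitializationException {
		return AnalysisEngineFactory.createEngineDescription(
				RegexSpotter.class,
				RegexSpotter.POST_PROCESSING_STRATEGY, OccurrenceBuffer.NO_CLEANING,
				RegexSpotter.LOG_OVERLAPPING_RULES, true,
				RegexSpotter.KEEP_OCCURRENCES_IN_TERM_INDEX, false,
				RegexSpotter.PARAM_ADD_TO_TERM_INDEX, false);
	}
}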