eu.project.ttc.engines.CasStatCounter Maven / Gradle / Ivy
/*******************************************************************************
* Copyright 2015 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.engines;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.resources.TermIndexResource;
import eu.project.ttc.types.SourceDocumentInformation;
import eu.project.ttc.types.WordAnnotation;
import eu.project.ttc.utils.JCasUtils;
/**
* Compiles and logs CAS stats.
*
* @author Damien Cram
*
*/
public class CasStatCounter extends JCasAnnotator_ImplBase {
private static final Logger LOGGER = LoggerFactory.getLogger(CasStatCounter.class);
private Map counters = Maps.newHashMap();
public static final String STAT_NAME = "StatName";
@ConfigurationParameter(name=STAT_NAME, mandatory=false)
private String statName;
public static final String DOCUMENT_PERIOD = "DocumentPeriod";
@ConfigurationParameter(name=DOCUMENT_PERIOD, mandatory=false, defaultValue = "-1")
private int docPeriod;
private boolean periodicStatEnabled = false;
private int docIt;
private long cumulatedFileSize;
public static final String TO_TRACE_FILE = "ToTraceFile";
@ConfigurationParameter(name=TO_TRACE_FILE, mandatory=false)
private String traceFileName;
private Writer fileWriter;
private static final String TSV_LINE_FORMAT="%d\t%d\t%d\t%d\t%d\n";
@ExternalResource(key=TermIndexResource.TERM_INDEX, mandatory=true)
private TermIndexResource termIndexResource;
private Stopwatch sw;
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
this.sw = Stopwatch.createStarted();
if(traceFileName != null) {
File file = new File(traceFileName);
try {
this.fileWriter = new FileWriter(file);
} catch (IOException e) {
LOGGER.error("Could not create a writer to file {}", traceFileName);
throw new ResourceInitializationException(e);
}
this.periodicStatEnabled = docPeriod > 0;
LOGGER.info("Tracing time performance to file {}", file.getAbsolutePath());
}
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
this.docIt++;
Optional sourceDocumentAnnotation = JCasUtils.getSourceDocumentAnnotation(aJCas);
if(sourceDocumentAnnotation.isPresent())
this.cumulatedFileSize += sourceDocumentAnnotation.get().getDocumentSize();
FSIterator it = aJCas.getAnnotationIndex().iterator();
Annotation a;
MutableInt i;
while(it.hasNext()) {
a = it.next();
i = counters.get(a.getType().getShortName());
if(i == null)
counters.put(a.getType().getShortName(), new MutableInt(1));
else
i.increment();
}
if(periodicStatEnabled && this.docIt % this.docPeriod == 0)
try {
traceToFile();
} catch (IOException e) {
throw new AnalysisEngineProcessException(e);
}
}
private void traceToFile() throws IOException {
String line = String.format(TSV_LINE_FORMAT,
this.sw.elapsed(TimeUnit.MILLISECONDS),
this.docIt,
this.cumulatedFileSize,
this.termIndexResource.getTermIndex().getTerms().size(),
this.counters.get(WordAnnotation.class.getSimpleName()).intValue()
);
LOGGER.debug(line);
this.fileWriter.write(line);
this.fileWriter.flush();
}
@Override
protected void finalize() throws Throwable {
this.fileWriter.close();
super.finalize();
}
@Override
public void collectionProcessComplete()
throws AnalysisEngineProcessException {
if(statName != null)
logStats();
}
private void logStats() {
Ordering a = Ordering.natural().reverse().onResultOf(Functions.forMap(counters)).compound(Ordering.natural());
Map map = ImmutableSortedMap.copyOf(counters, a);
Iterator> it = map.entrySet().iterator();
if(it.hasNext()) {// it will be empty if pipeline is run on empty collection
Entry mostFrequentAnno = it.next();
LOGGER.info("[{}] {}: {} ", statName, mostFrequentAnno.getKey(), mostFrequentAnno.getValue().intValue());
}
int nbSyntacticVariants = 0;
int nbMorphologicalVariants = 0;
int nbGraphicalVariants = 0;
int nbOccurrences = 0;
int nbPrimaryOccOccurrences = 0;
TermIndex tIndex = termIndexResource.getTermIndex();
for(Term t:tIndex.getTerms()) {
nbMorphologicalVariants+=Iterables.size(t.getVariations(VariationType.MORPHOLOGICAL));
nbSyntacticVariants+=Iterables.size(t.getVariations(VariationType.SYNTACTICAL));
nbGraphicalVariants+=Iterables.size(t.getVariations(VariationType.GRAPHICAL));
nbOccurrences+=t.getOccurrences().size();
for(TermOccurrence o:t.getOccurrences()) {
if(o.isPrimaryOccurrence())
nbPrimaryOccOccurrences++;
}
}
// graphical variants are bidirectional
nbGraphicalVariants/=2;
LOGGER.info("[{}] Nb terms: {} [sw: {}, mw: {}]", statName,
tIndex.getTerms().size(),
Iterators.size(tIndex.singleWordTermIterator()),
Iterators.size(tIndex.multiWordTermIterator()));
LOGGER.info("[{}] Nb words: {} [compounds: {}]", statName,
tIndex.getWords().size(),
Iterators.size(tIndex.compoundWordTermIterator()));
LOGGER.info("[{}] Nb occurrences: {} [primary: {}]", statName,
nbOccurrences,
nbPrimaryOccOccurrences);
LOGGER.info("[{}] Nb variants: {} [morph: {}, syn: {}, graph: {}]", statName,
nbMorphologicalVariants + nbSyntacticVariants + nbGraphicalVariants,
nbMorphologicalVariants,
nbSyntacticVariants,
nbGraphicalVariants);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy