org.carrot2.text.preprocessing.pipeline.CompletePreprocessingPipeline Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation

Carrot2 search results clustering framework. Minimal functional subset (core algorithms and infrastructure, no document sources).

There is a newer version: 3.16.3

Show newest version


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2012, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing.pipeline;

import org.carrot2.text.preprocessing.*;
import org.carrot2.util.attribute.Bindable;

/**
 * A preprocessing pipeline that runs the full sequence of preprocessing steps on the
 * provided documents. The steps are executed in the following order:
 * <ol>
 * <li>{@link Tokenizer#tokenize(PreprocessingContext)}</li>
 * <li>{@link CaseNormalizer#normalize(PreprocessingContext)}</li>
 * <li>{@link LanguageModelStemmer#stem(PreprocessingContext)}</li>
 * <li>{@link StopListMarker#mark(PreprocessingContext)}</li>
 * <li>{@link PhraseExtractor#extractPhrases(PreprocessingContext)}</li>
 * <li>{@link LabelFilterProcessor#process(PreprocessingContext)}</li>
 * <li>{@link DocumentAssigner#assign(PreprocessingContext)}</li>
 * </ol>
 * Once all steps have run, the context is notified through
 * {@link PreprocessingContext#preprocessingFinished()}.
 */
@Bindable(prefix = "PreprocessingPipeline")
public class CompletePreprocessingPipeline extends BasicPreprocessingPipeline
{
    /**
     * Phrase extraction component of this pipeline; it contains bindable attributes.
     */
    public final PhraseExtractor phraseExtractor = new PhraseExtractor();

    /**
     * Label filtering component of this pipeline; it contains bindable attributes.
     */
    public final LabelFilterProcessor labelFilterProcessor = new LabelFilterProcessor();

    /**
     * Document assignment component of this pipeline; it contains bindable attributes.
     */
    public final DocumentAssigner documentAssigner = new DocumentAssigner();

    /**
     * Runs every preprocessing step, in order, on the given
     * {@link PreprocessingContext} and then marks the context as finished.
     *
     * @param context the preprocessing context each step operates on
     */
    @Override
    public void preprocess(PreprocessingContext context)
    {
        // The order of the two phases (and of the steps inside them) must not change:
        // each step is invoked on the same context the previous steps have worked on.
        applyInheritedComponentSteps(context);
        applyPhraseAndLabelSteps(context);

        context.preprocessingFinished();
    }

    /**
     * Runs the steps backed by components that are not declared in this class
     * (they are inherited): tokenization, case normalization, stemming and
     * stop list marking.
     */
    private void applyInheritedComponentSteps(PreprocessingContext context)
    {
        tokenizer.tokenize(context);
        caseNormalizer.normalize(context);
        languageModelStemmer.stem(context);
        stopListMarker.mark(context);
    }

    /**
     * Runs the steps backed by the components declared in this class: phrase
     * extraction, label filtering and document assignment.
     */
    private void applyPhraseAndLabelSteps(PreprocessingContext context)
    {
        phraseExtractor.extractPhrases(context);
        labelFilterProcessor.process(context);
        documentAssigner.assign(context);
    }
}