org.carrot2.text.preprocessing.pipeline.CompletePreprocessingPipeline Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing.pipeline;
import java.util.List;
import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.text.preprocessing.CaseNormalizer;
import org.carrot2.text.preprocessing.DocumentAssigner;
import org.carrot2.text.preprocessing.LabelFilterProcessor;
import org.carrot2.text.preprocessing.LanguageModelStemmer;
import org.carrot2.text.preprocessing.PhraseExtractor;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.StopListMarker;
import org.carrot2.text.preprocessing.Tokenizer;
import org.carrot2.util.attribute.Bindable;
/**
 * Performs a complete preprocessing on the provided documents. The preprocessing consists
 * of the following steps, executed in this order:
 * <ol>
 * <li>{@link Tokenizer#tokenize(PreprocessingContext)}</li>
 * <li>{@link CaseNormalizer#normalize(PreprocessingContext)}</li>
 * <li>{@link LanguageModelStemmer#stem(PreprocessingContext)}</li>
 * <li>{@link StopListMarker#mark(PreprocessingContext)}</li>
 * <li>{@link PhraseExtractor#extractPhrases(PreprocessingContext)}</li>
 * <li>{@link LabelFilterProcessor#process(PreprocessingContext)}</li>
 * <li>{@link DocumentAssigner#assign(PreprocessingContext)}</li>
 * </ol>
 * The components for the first four steps are not declared in this class; they are
 * presumably inherited from {@link BasicPreprocessingPipeline}.
 */
@Bindable(prefix = "PreprocessingPipeline")
public class CompletePreprocessingPipeline extends BasicPreprocessingPipeline
{
    /**
     * Phrase extractor used by the algorithm, contains bindable attributes.
     */
    public final PhraseExtractor phraseExtractor = new PhraseExtractor();

    /**
     * Label filter processor used by the algorithm, contains bindable attributes.
     */
    public final LabelFilterProcessor labelFilterProcessor = new LabelFilterProcessor();

    /**
     * Document assigner used by the algorithm, contains bindable attributes.
     */
    public final DocumentAssigner documentAssigner = new DocumentAssigner();

    /**
     * Creates a new {@link PreprocessingContext} for the given input and runs every
     * preprocessing step listed in the class description on it.
     *
     * @param documents documents to preprocess, handed over to the created context
     * @param query query handed over to the created context
     * @param language language code used to create the {@link LanguageModel} the
     *        context is built with
     * @return the context after all steps have been applied and
     *         {@link PreprocessingContext#preprocessingFinished()} has been called
     */
    // NOTE(review): the parameter type was a raw List; List<Document> assumes the
    // overridden method in BasicPreprocessingPipeline is declared the same way -- confirm.
    @Override
    public PreprocessingContext preprocess(List<Document> documents, String query,
        LanguageCode language)
    {
        final PreprocessingContext context = new PreprocessingContext(
            LanguageModel.create(language, stemmerFactory, tokenizerFactory,
                lexicalDataFactory), documents, query);

        // Basic steps; these components are not declared in this class.
        tokenizer.tokenize(context);
        caseNormalizer.normalize(context);
        languageModelStemmer.stem(context);
        stopListMarker.mark(context);

        // Additional steps performed by the components declared in this class.
        phraseExtractor.extractPhrases(context);
        labelFilterProcessor.process(context);
        documentAssigner.assign(context);

        context.preprocessingFinished();
        return context;
    }
}