All downloads are free. Search and download functionality uses the official Maven repository.

org.carrot2.text.preprocessing.StopListMarker Maven / Gradle / Ivy


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2015, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.attribute.Bindable;

/**
 * Marks stop words based on the current language model.
 * <p>
 * This class saves the following results to the {@link PreprocessingContext}:
 * <ul>
 * <li>{@link AllWords#type}</li>
 * </ul>
 * <p>
 * This class requires that {@link Tokenizer} and {@link CaseNormalizer} be invoked first.
 */
@Bindable(prefix = "StopListMarker")
public final class StopListMarker
{
    /** Initial capacity of the scratch buffer used for lowercased word images. */
    private static final int INITIAL_BUFFER_SIZE = 128;

    /**
     * Marks stop words and saves the results to the context.
     * <p>
     * Every entry of {@code context.allWords.image} is lowercased into a scratch
     * buffer and looked up in the lexical data of {@code context.language}. For each
     * word reported as common, the {@link ITokenizer#TF_COMMON_WORD} flag is OR-ed
     * into the corresponding entry of {@code context.allWords.type}; entries of
     * other words are left untouched.
     *
     * @param context the preprocessing context whose {@code allWords.type} array is
     *            updated in place
     */
    public void mark(PreprocessingContext context)
    {
        final AllWords allWords = context.allWords;
        final char [][] images = allWords.image;
        final short [] flags = allWords.type;

        // A single wrapper instance is re-pointed at the scratch buffer for each word.
        final MutableCharArray lookupKey = new MutableCharArray("");
        char [] scratch = new char [INITIAL_BUFFER_SIZE];

        final ILexicalData lexicalData = context.language.getLexicalData();

        for (int index = 0; index < images.length; index++)
        {
            final char [] image = images[index];
            final int length = image.length;

            // Enlarge the scratch buffer only when the current word does not fit.
            if (length > scratch.length)
            {
                scratch = new char [length];
            }

            CharArrayUtils.toLowerCase(image, scratch);
            lookupKey.reset(scratch, 0, length);

            if (!lexicalData.isCommonWord(lookupKey))
            {
                continue;
            }

            flags[index] |= ITokenizer.TF_COMMON_WORD;
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy