All downloads are free. Search and download functionality uses the official Maven repository.

net.clementlevallois.stopwords.StopWordsRemover Maven / Gradle / Ivy

Go to download

Stop words and related operations for essential text-mining functions in the umigon family of tools.

The newest version!
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package net.clementlevallois.stopwords;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 *
 * @author C. Levallois
 */
public final class StopWordsRemover {

    private int minWordLength;

    private final int maxAcceptedGarbage = 3;
    private int nbStopWords = 5000;
    private int nbStopWordsShort = 500;

    private Set setStopWordsFieldSpecificOrShort = new HashSet();
    private Set setStopWordsShort = new HashSet();
    private Set setStopwordsFieldSpecific = new HashSet();
    private Set setStopWords = new HashSet();
    private Set setKeepWords = new HashSet();
    private Set setRemoveWords = new HashSet();
    private List listGeneralStopwordsLarge = new ArrayList();
    private List listGeneralStopwordsShort = new ArrayList();
    private List stopwordsLong = new ArrayList();
    private Map> stopWordsLongAndShort;

    public static void main(String[] args) throws Exception {
        Set fieldSpecificTerms = new HashSet();
        fieldSpecificTerms.add("twitter");
        StopWordsRemover rem = new StopWordsRemover(3, "en");
        rem.addFieldSpecificStopWords(fieldSpecificTerms);
        Set scientificStopwordsInEnglish = Stopwords.getScientificStopwordsInEnglish();
        rem.addFieldSpecificStopWords(scientificStopwordsInEnglish);
        boolean shouldItBeRemoved = rem.shouldItBeRemoved("of textual");
        System.out.println(shouldItBeRemoved);
    }

    public StopWordsRemover(int minWordLength, String lang) {
        stopWordsLongAndShort = Stopwords.getStopWords(lang);
        stopwordsLong = new ArrayList((Set) stopWordsLongAndShort.get("long"));
        this.minWordLength = minWordLength;
        nbStopWordsShort = Math.min(nbStopWordsShort, Math.max(0, (stopwordsLong.size() - 1)));
        nbStopWords = Math.min(5000, Math.max(0, (stopwordsLong.size() - 1)));
        try {
            init();
        } catch (IOException ex) {
            System.out.println("ex: " + ex);
        }
    }

    public void addStopWordsToKeep(Set wordsToKeep) {
        if (wordsToKeep != null) {
            setKeepWords.addAll(wordsToKeep);
        }
    }

    public void addWordsToRemove(Set wordsToRemove) {
        if (wordsToRemove != null) {
            setRemoveWords.addAll(wordsToRemove);
        }
    }

    public void useUSerSuppliedStopwords(Set userSuppliedStopwords, boolean userStopwordsReplaceDefault) {
        if (userStopwordsReplaceDefault) {
            setStopWordsFieldSpecificOrShort = new HashSet(userSuppliedStopwords);
            setStopWordsShort = new HashSet(userSuppliedStopwords);
            setStopwordsFieldSpecific = new HashSet(userSuppliedStopwords);
            setStopWords = new HashSet(userSuppliedStopwords);
        } else {
            setStopWordsFieldSpecificOrShort.addAll(userSuppliedStopwords);
            setStopWordsShort.addAll(userSuppliedStopwords);
            setStopWords.addAll(userSuppliedStopwords);
        }
    }

    public void addFieldSpecificStopWords(Set fieldSpecificStopWordsToRemove) {
        if (fieldSpecificStopWordsToRemove != null) {
            setStopWordsFieldSpecificOrShort.addAll(fieldSpecificStopWordsToRemove);
            setStopWords.addAll(fieldSpecificStopWordsToRemove);
        }
    }

    private void init() throws IOException {
        setKeepWords = new HashSet();
        setStopWordsShort = new HashSet();

        listGeneralStopwordsLarge = stopwordsLong.subList(0, nbStopWords);
        listGeneralStopwordsShort = stopwordsLong.subList(0, nbStopWordsShort);

        setStopWords.addAll(listGeneralStopwordsLarge);
        setStopWords.addAll(Stopwords.getStopwordsValidForAllLanguages());
        if (stopWordsLongAndShort.get("short").isEmpty()) {
            setStopWordsShort.addAll(listGeneralStopwordsShort);
        } else {
            setStopWordsShort.addAll(stopWordsLongAndShort.get("short"));
        }
        setStopWordsFieldSpecificOrShort.addAll(setStopWordsShort);
    }

    public boolean shouldItBeRemoved(String term) {
        String entryWord;
        boolean multipleWord;

        boolean write = true;
        entryWord = term;
        multipleWord = entryWord.contains(" ");

        if (multipleWord) {
            String[] wordsNGrams = entryWord.split(" ");
            int wordsNGramsLength = wordsNGrams.length;

            for (String wordsNGram : wordsNGrams) {
                if (wordsNGram.length() < minWordLength) {
                    write = false;
                    break;
                }
            }

            if (wordsNGramsLength == 2
                    && ((setStopWordsFieldSpecificOrShort.contains(wordsNGrams[0].toLowerCase().trim())
                    || setStopWordsFieldSpecificOrShort.contains(wordsNGrams[1].toLowerCase().trim())))) {
                write = false;

            }

            if (wordsNGramsLength > 2) {
                int scoreGarbage = 0;

                for (int i = 0; i < wordsNGramsLength; i++) {

                    String currentTerm = wordsNGrams[i].toLowerCase().trim();

                    if ((i == 0 | i == (wordsNGramsLength - 1)) && setStopWordsFieldSpecificOrShort.contains(currentTerm)) {
                        scoreGarbage = maxAcceptedGarbage + 1;
                        continue;
                    }

                    if ((i == 0 | i == (wordsNGramsLength - 1)) && setStopWordsShort.contains(currentTerm)) {
                        write = false;
                        continue;
                    }

                    if (setStopWordsShort.contains(currentTerm)) {
                        scoreGarbage = scoreGarbage + 3;
                        continue;
                    }

                    if (setStopwordsFieldSpecific.contains(currentTerm)) {
                        scoreGarbage = scoreGarbage + 2;
                        continue;
                    }

                }

                if (setStopWords.contains(entryWord)) {
                    scoreGarbage = maxAcceptedGarbage + 1;
                }

                if (scoreGarbage > maxAcceptedGarbage) {

                    write = false;
                }
            }

        } else if (setStopWords.contains(entryWord) & !setKeepWords.contains(entryWord)) {
            write = false;
        }

        if (setKeepWords.contains(entryWord)) {
            write = true;
        }
        if (setRemoveWords.contains(entryWord)) {
            write = false;
        }

        return !write;

    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy