All downloads are free. The search and download features use the official Maven repository.

net.clementlevallois.stopwords.Stopwords Maven / Gradle / Ivy

Go to download

Stopwords and related operations for essential text-mining functions in the umigon family of tools.

The newest version!
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package net.clementlevallois.stopwords;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;

/*
 Copyright 2008-2013 Clement Levallois
 Authors : Clement Levallois 
 Website : http://www.clementlevallois.net


 DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.

 Copyright 2013 Clement Levallois. All rights reserved.

 The contents of this file are subject to the terms of either the GNU
 General Public License Version 3 only ("GPL") or the Common
 Development and Distribution License("CDDL") (collectively, the
 "License"). You may not use this file except in compliance with the
 License. You can obtain a copy of the License at
 http://gephi.org/about/legal/license-notice/
 or /cddl-1.0.txt and /gpl-3.0.txt. See the License for the
 specific language governing permissions and limitations under the
 License.  When distributing the software, include this License Header
 Notice in each file and include the License files at
 /cddl-1.0.txt and /gpl-3.0.txt. If applicable, add the following below the
 License Header, with the fields enclosed by brackets [] replaced by
 your own identifying information:
 "Portions Copyrighted [year] [name of copyright owner]"

 If you wish your version of this file to be governed by only the CDDL
 or only the GPL Version 3, indicate your decision by adding
 "[Contributor] elects to include this software in this distribution
 under the [CDDL or GPL Version 3] license." If you do not indicate a
 single choice of license, a recipient has the option to distribute
 your version of this file under either the CDDL, the GPL Version 3 or
 to extend the choice of license to its licensees as provided above.
 However, if you add GPL Version 3 code and therefore, elected the GPL
 Version 3 license, then the option applies only if the new code is
 made subject to such option by the copyright holder.

 Contributor(s): Clement Levallois

 */
/**
 * Static accessors for stop-word lists that are looked up as classpath resources relative to
 * this class (one word per line, every line is added verbatim).
 *
 * <p>A resource that cannot be found is silently skipped, so every accessor returns a non-null
 * (possibly empty) collection. Read failures are reported on standard output and the words read
 * so far are kept.
 *
 * <p>NOTE(review): resources are decoded as UTF-8; this assumes the bundled lists are stored in
 * UTF-8 - confirm against the resource files.
 */
public class Stopwords {

    /** Tokens treated as stop words on Twitter regardless of language. */
    private static final String[] twitterStopWords = {"rt", "w/"};

    /** Tokens treated as stop words regardless of language. */
    private static final String[] commonStopWords = {"and", "for", "nbsp", "http", "https", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "20", "25", "30", "40", "50", "100", "1000"};

    /** Result of {@link #getStopWords(String)} per language code. Not synchronized. */
    private static final Map<String, Map<String, Set<String>>> cache = new HashMap<>();

    /**
     * Manual smoke test: prints the first line of {@code twitter_en.txt} (when that resource
     * exists) followed by the stop words returned for {@code "en"}.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        String resourceName = "twitter_en.txt";
        InputStream in = Stopwords.class.getResourceAsStream(resourceName);
        if (in == null) {
            // A missing resource used to surface as a NullPointerException here.
            System.out.println("resource not found: " + resourceName);
        } else {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
                System.out.println("line: " + br.readLine());
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        Map<String, Set<String>> stopWords = Stopwords.getStopWords("en");
        System.out.println(stopWords.toString());
    }

    /**
     * Returns the stop words for a language as a two-entry map.
     *
     * <ul>
     *   <li>{@code "long"}: the built-in common and Twitter tokens plus the lines of
     *       {@code <lang>.txt}</li>
     *   <li>{@code "short"}: the built-in common and Twitter tokens plus the lines of
     *       {@code <lang>_short.txt}</li>
     * </ul>
     *
     * <p>The result is cached per {@code lang}; later calls return the very same map instance,
     * so any modification made by a caller is visible to all subsequent callers.
     *
     * @param lang prefix of the resource names to load (e.g. {@code "en"})
     * @return map with the keys {@code "short"} and {@code "long"}; never {@code null}
     */
    public static Map<String, Set<String>> getStopWords(String lang) {
        Map<String, Set<String>> cached = cache.get(lang);
        if (cached != null) {
            return cached;
        }

        Set<String> stopWords = newSetWithBuiltIns();
        Set<String> shortStopWords = newSetWithBuiltIns();
        loadResourceInto(lang + ".txt", stopWords);
        loadResourceInto(lang + "_short.txt", shortStopWords);

        Map<String, Set<String>> pair = new HashMap<>();
        pair.put("short", shortStopWords);
        pair.put("long", stopWords);
        cache.put(lang, pair);
        return pair;
    }

    /**
     * Returns the lines of {@code <lang>_stopword_sentiment.txt}.
     *
     * @param lang prefix of the resource name to load
     * @return a new mutable set; empty when the resource does not exist
     */
    public static Set<String> getStopWordsUsefulInSentimentAnalysis(String lang) {
        Set<String> stopWords = new HashSet<>();
        loadResourceInto(lang + "_stopword_sentiment.txt", stopWords);
        return stopWords;
    }

    /**
     * Returns the lines of {@code scientificstopwords_en.txt}.
     *
     * @return a new mutable set; empty when the resource does not exist
     */
    public static Set<String> getScientificStopwordsInEnglish() {
        Set<String> stopWords = new HashSet<>();
        loadResourceInto("scientificstopwords_en.txt", stopWords);
        return stopWords;
    }

    /**
     * Returns the lines of {@code scientificstopwords_fr.txt}.
     *
     * @return a new mutable set; empty when the resource does not exist
     */
    public static Set<String> getScientificStopwordsInFrench() {
        Set<String> stopWords = new HashSet<>();
        loadResourceInto("scientificstopwords_fr.txt", stopWords);
        return stopWords;
    }

    /**
     * Returns the union of the lines of {@code twitter_long.txt} and {@code twitter_short.txt}.
     *
     * <p>NOTE(review): {@code longList} is not consulted - both lists are always merged. The
     * name suggests it was meant to select one of them; behaviour is left untouched so existing
     * callers keep receiving the same words. Confirm the intent before changing it.
     *
     * @param longList currently ignored
     * @return a new mutable set; empty when neither resource exists
     */
    public static Set<String> getTwitterStopwords(boolean longList) {
        Set<String> words = new HashSet<>();
        loadResourceInto("twitter_long.txt", words);
        loadResourceInto("twitter_short.txt", words);
        return words;
    }

    /**
     * Returns the built-in common and Twitter tokens plus the lines of
     * {@code stopwords_all_languages.txt}.
     *
     * @return a new mutable set that always contains at least the built-in tokens
     */
    public static Set<String> getStopwordsValidForAllLanguages() {
        Set<String> words = newSetWithBuiltIns();
        loadResourceInto("stopwords_all_languages.txt", words);
        return words;
    }

    /** Creates a fresh mutable set pre-filled with the common and Twitter built-in tokens. */
    private static Set<String> newSetWithBuiltIns() {
        Set<String> words = new HashSet<>();
        Collections.addAll(words, commonStopWords);
        Collections.addAll(words, twitterStopWords);
        return words;
    }

    /**
     * Adds every line of the named classpath resource to {@code target}; does nothing when the
     * resource is absent. The reader (and the underlying stream) is always closed.
     */
    private static void loadResourceInto(String resourceName, Set<String> target) {
        InputStream inputStream = Stopwords.class.getResourceAsStream(resourceName);
        if (inputStream == null) {
            return;
        }
        try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            br.lines().forEach(target::add);
        } catch (IOException | UncheckedIOException e) {
            // Best effort, as before: report the failure and keep whatever was read so far.
            // BufferedReader.lines() wraps read errors in UncheckedIOException.
            System.out.println("exception is: " + e.toString());
            e.printStackTrace();
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy