
com.vader.sentiment.processor.TextProperties Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vader-sentiment-analyzer Show documentation
Show all versions of vader-sentiment-analyzer Show documentation
Java port of Python NLTK Vader Sentiment Analyzer. VADER (Valence Aware Dictionary and sEntiment Reasoner)
is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in
social media, and works well on texts from other domains.
The newest version!
/*
* MIT License
*
* Copyright (c) 2018 Animesh Pandey
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.vader.sentiment.processor;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import com.vader.sentiment.util.Utils;
/**
 * The TextProperties class implements the pre-processing steps of the input string for sentiment analysis.
 * It delegates tokenization to {@link InputAnalyzer} and derives three properties from the input:
 * the tokens with emoticons, the tokens without punctuation, and whether only some of the tokens are
 * written in upper case.
 *
 * @author Animesh Pandey
 */
public final class TextProperties {
    /**
     * String whose properties will be extracted.
     */
    private final String inputText;

    /**
     * List of tokens and emoticons extracted from the {@link TextProperties#inputText}.
     */
    private List<String> wordsAndEmoticons;

    /**
     * List of tokens extracted from the {@link TextProperties#inputText}.
     * Emoticons are removed here.
     */
    private List<String> wordsOnly;

    /**
     * Flag that specifies if the current string has yelling words.
     */
    private boolean isCapDiff;

    /**
     * Parameterized constructor accepting the input string that will be processed.
     * All derived properties are computed eagerly here.
     *
     * @param inputText the input string
     * @throws IOException if there is an issue with the lucene analyzers
     */
    public TextProperties(String inputText) throws IOException {
        this.inputText = inputText;
        setWordsAndEmoticons();
        setCapDiff(isAllCapDifferential());
    }

    /**
     * Tokenizes the input string with {@link InputAnalyzer#defaultSplit(String)} and then strips a single
     * leading or trailing punctuation mark from every token whose remainder is one of the
     * {@link TextProperties#wordsOnly} tokens. Tokens that do not match "word + punctuation" or
     * "punctuation + word" are left untouched.
     *
     * @throws IOException if something goes wrong in the Lucene analyzer.
     * @see InputAnalyzer#defaultSplit(String)
     */
    private void setWordsAndEmoticons() throws IOException {
        // wordsOnly must be populated first: it supplies the words searched for below.
        setWordsOnly();
        final List<String> tokens = new InputAnalyzer().defaultSplit(inputText);
        for (String currentWord : wordsOnly) {
            for (String currentPunc : Utils.PUNCTUATION_LIST) {
                // In-place replacement keeps every token at its original position.
                // Trailing punctuation first, then leading punctuation.
                Collections.replaceAll(tokens, currentWord + currentPunc, currentWord);
                Collections.replaceAll(tokens, currentPunc + currentWord, currentWord);
            }
        }
        this.wordsAndEmoticons = tokens;
    }

    /**
     * This method tokenizes the input string, removing the special characters as well.
     *
     * @throws IOException if there is an error while using Lucene analyzers.
     * @see InputAnalyzer#removePunctuation(String)
     */
    private void setWordsOnly() throws IOException {
        this.wordsOnly = new InputAnalyzer().removePunctuation(inputText);
    }

    /**
     * Stores the result of the upper-case differential check.
     *
     * @param capDiff value to store in {@link TextProperties#isCapDiff}
     */
    private void setCapDiff(boolean capDiff) {
        this.isCapDiff = capDiff;
    }

    /**
     * Returns the tokens and emoticons of the input. The internal list itself is returned, not a copy.
     *
     * @return list of tokens and emoticons
     */
    public List<String> getWordsAndEmoticons() {
        return wordsAndEmoticons;
    }

    /**
     * Returns the tokens of the input with punctuation removed. The internal list itself is returned,
     * not a copy.
     *
     * @return list of tokens without punctuation
     */
    public List<String> getWordsOnly() {
        return wordsOnly;
    }

    /**
     * Returns whether some, but not all, of the tokens are in upper case.
     *
     * @return the value computed by {@link TextProperties#isAllCapDifferential()} at construction time
     */
    public boolean isCapDiff() {
        return isCapDiff;
    }

    /**
     * Return true iff the input has yelling words i.e. all caps in the tokens, but all the token should not be
     * in upper case.
     * e.g. [GET, THE, HELL, OUT] returns false
     * [GET, the, HELL, OUT] returns true
     * [get, the, hell, out] returns false
     * An empty token list returns false.
     *
     * @return boolean value
     */
    private boolean isAllCapDifferential() {
        int countAllCaps = 0;
        for (String token : wordsAndEmoticons) {
            if (Utils.isUpper(token)) {
                countAllCaps++;
            }
        }
        // Number of tokens that are NOT upper case; must be strictly between 0 and the token count.
        final int capDifferential = wordsAndEmoticons.size() - countAllCaps;
        return (0 < capDifferential) && (capDifferential < wordsAndEmoticons.size());
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy