![JAR search and dependency download from the Maven repository](/logo.png)
com.gravity.goose.text.StopWords.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of goose Show documentation
Show all versions of goose Show documentation
Goose was originally an article extractor written in Java that has most recently (aug2011) converted to a scala project. It's mission is to take any news article or article type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate.
The newest version!
/**
* Licensed to Gravity.com under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Gravity.com licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.gravity.goose.text
/**
* Created by Jim Plush
* User: jim
* Date: 8/16/11
*/
import java.util._
import com.gravity.goose.utils.FileHelper
object StopWords {
// the confusing pattern below is basically just match any non-word character excluding white-space.
private val PUNCTUATION: StringReplacement = StringReplacement.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]", string.empty)
val STOP_WORDS = FileHelper.loadResourceFile("stopwords-en.txt", StopWords.getClass).split(sys.props("line.separator")).toSet
def removePunctuation(str: String): String = {
PUNCTUATION.replaceAll(str)
}
def getStopWordCount(content: String): WordStats = {
if (string.isNullOrEmpty(content)) return WordStats.EMPTY
val ws: WordStats = new WordStats
val strippedInput: String = removePunctuation(content)
val candidateWords: Array[String] = string.SPACE_SPLITTER.split(strippedInput)
val overlappingStopWords: List[String] = new ArrayList[String]
candidateWords.foreach(w => {
if (STOP_WORDS.contains(w.toLowerCase)) overlappingStopWords.add(w.toLowerCase)
})
ws.setWordCount(candidateWords.length)
ws.setStopWordCount(overlappingStopWords.size)
ws.setStopWords(overlappingStopWords)
ws
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy