ants.subatomic-search-shared_native0.4_2.13.0.0.8.source-code.Tokenizer.scala Maven / Gradle / Ivy
The newest version!
/*
* Copyright 2020 Anton Sviridov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package subatomic
package search
import scala.collection.mutable.ArrayBuffer
/** Default tokenizer: turns raw text into a sequence of normalised terms.
  *
  * The input is lower-cased and scanned character by character:
  *
  *  - letters and digits are accumulated into the current token;
  *  - an apostrophe is accumulated only when the character immediately before
  *    it ''in the current token'' is a letter, so contractions such as
  *    "don't" stay in one piece and can be matched against [[Stopwords.list]];
  *  - every other character (including an apostrophe that does not follow a
  *    letter) ends the current token.
  *
  * A finished token is dropped if it is contained in [[Stopwords.list]];
  * otherwise it is passed through `PorterStemmer.stem` and emitted.
  */
object DefaultTokenizer extends (String => Vector[String]) {
  private final val Apostrophe = '\''

  /** Tokenizes `content`.
    *
    * @param content text to tokenize
    * @return the stemmed, stopword-free tokens in order of appearance
    *         (possibly empty)
    */
  def apply(content: String): Vector[String] = {
    val tokens = ArrayBuffer.empty[String]
    val acc    = new StringBuilder

    // Emits the accumulated token (unless it is a stopword) and resets the
    // accumulator. Does nothing when no token is in progress.
    def flush(): Unit =
      if (acc.nonEmpty) {
        val word = acc.result()
        acc.clear()
        if (!Stopwords.list(word)) tokens.append(PorterStemmer.stem(word))
      }

    content.toLowerCase.foreach { char =>
      if (char.isLetterOrDigit) {
        acc.append(char)
      } else if (char == Apostrophe && acc.lastOption.exists(_.isLetter)) {
        // The "previous character" is read from the accumulator rather than
        // tracked in a separate variable: after a flush the accumulator is
        // empty, so an apostrophe that opens a new word (e.g. an opening
        // single quote) can never be glued onto the next token because of a
        // letter that belonged to the previous one.
        acc.append(char)
      } else {
        flush()
      }
    }

    // The input may end in the middle of a token.
    flush()
    tokens.toVector
  }
}
/** English stopwords, all lower-case, including common contractions written
  * with a plain ASCII apostrophe.
  */
object Stopwords {

  /** The stopword set; `list(word)` is `true` when `word` is a stopword. */
  val list: Set[String] = Set(
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are",
    "aren't", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "can't", "cannot", "could", "couldn't",
    "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
    "each", "few", "for", "from", "further",
    "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's",
    "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its",
    "itself",
    "let's", "me", "more", "most", "mustn't", "my", "myself",
    "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
    "ourselves", "out", "over", "own",
    "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some",
    "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there",
    "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those",
    "through", "to", "too",
    "under", "until", "up", "very",
    "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's",
    "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's",
    "with", "won't", "would", "wouldn't",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"
  )
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy