All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ants.subatomic-search-shared_native0.4_2.13.0.0.8.source-code.Tokenizer.scala Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Anton Sviridov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package subatomic
package search

import scala.collection.mutable.ArrayBuffer

/** Splits free text into normalised search terms.
  *
  * The input is lower-cased and scanned character by character. A token is a
  * run of letters and digits; an apostrophe is kept only when it directly
  * follows a letter inside a token (so contractions such as `don't` survive
  * as one token). Every other character acts as a separator. Each finished
  * token is dropped if it is contained in [[Stopwords.list]], otherwise it is
  * passed through `PorterStemmer.stem` and appended to the result.
  *
  * Apostrophes that merely quote a word are not part of the token: an opening
  * quote is never attached to the following word, and a closing quote (or a
  * possessive apostrophe at the very end, as in `dogs'`) is stripped before
  * the stop-word check and stemming.
  */
object DefaultTokenizer extends (String => Vector[String]) {

  private final val Apostrophe = '\''

  /** Tokenises `content`.
    *
    * @param content raw text; may be empty
    * @return stemmed, stop-word-free tokens in order of appearance
    */
  def apply(content: String): Vector[String] = {
    val tokens = ArrayBuffer.empty[String]
    val acc    = new StringBuilder

    // True only while the character just consumed was a letter belonging to
    // the token currently being accumulated. It gates whether an apostrophe
    // may be appended, and is cleared after an apostrophe so that two
    // apostrophes in a row never end up inside one token.
    var afterLetter = false

    // Emits the accumulated token (if any) and resets all scanner state.
    def flush(): Unit = {
      if (acc.nonEmpty) {
        // A token can end in at most one apostrophe (closing quote or plural
        // possessive); remove it so that e.g. "the'" is recognised as "the".
        val word = acc.result().stripSuffix(Apostrophe.toString)
        if (word.nonEmpty && !Stopwords.list(word)) tokens.append(PorterStemmer.stem(word))
        acc.clear()
      }
      // Resetting here is what prevents a quote that opens the *next* word
      // from being glued to it because of a letter seen in the previous word.
      afterLetter = false
    }

    // NOTE(review): the no-argument toLowerCase follows the platform's default
    // locale on the JVM — confirm that is acceptable for all deployments.
    content.toLowerCase.foreach { char =>
      if (char.isLetterOrDigit) {
        acc.append(char)
        afterLetter = char.isLetter
      } else if (char == Apostrophe && afterLetter) {
        acc.append(char)
        afterLetter = false
      } else flush()
    }

    // Emit whatever is still pending at end of input.
    flush()

    tokens.toVector
  }
}

/** English stop words that [[DefaultTokenizer]] filters out before stemming.
  *
  * All entries are lower-case; contractions are stored with their apostrophe
  * (for example `don't`, `they're`).
  */
object Stopwords {

  /** The stop-word set. Being a `Set[String]`, it can be applied directly as a
    * membership predicate: `Stopwords.list("the")` is `true`.
    */
  val list: Set[String] = {
    // The words are kept as comma-separated fragments purely for compactness;
    // every fragment except the last ends with a comma so they join cleanly.
    val fragments = Seq(
      "a,about,above,after,again,against,all,am,an,and,any,are,aren't,as,at,",
      "be,because,been,before,being,below,between,both,but,by,can't,cannot,could,couldn't,",
      "did,didn't,do,does,doesn't,doing,don't,down,during,each,few,for,from,further,had,hadn't,",
      "has,hasn't,have,haven't,having,he,he'd,he'll,he's,her,here,here's,hers,herself,him,",
      "himself,his,how,how's,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,itself,",
      "let's,me,more,most,mustn't,my,myself,no,nor,not,of,off,on,once,only,or,other,ought,",
      "our,ours,ourselves,out,over,own,same,shan't,she,she'd,she'll,she's,should,shouldn't,",
      "so,some,such,than,that,that's,the,their,theirs,them,themselves,then,there,there's,",
      "these,they,they'd,they'll,they're,they've,this,those,through,to,too,under,until,up,",
      "very,was,wasn't,we,we'd,we'll,we're,we've,were,weren't,what,what's,when,when's,where,",
      "where's,which,while,who,who's,whom,why,why's,with,won't,would,wouldn't,you,you'd,you'll,",
      "you're,you've,your,yours,yourself,yourselves"
    )

    val commaSeparated = fragments.mkString
    commaSeparated.split(",").toSet
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy