All downloads are free. Search and download functionality uses the official Maven repository.

com.etsy.conjecture.text.Text.scala Maven / Gradle / Ivy

There is a newer version: 0.2.3
Show newest version
package com.etsy.conjecture.text

/**
 * Immutable wrapper around a string offering chainable text-cleaning helpers.
 * Every transformation returns a new [[Text]]; the wrapped `input` is never
 * modified.
 *
 * Unless a method says otherwise, `replacement` arguments are passed straight
 * to `String.replaceAll`, so `$` and `\` in them keep their regex-replacement
 * meaning.
 *
 * @param input the raw string being wrapped
 */
case class Text(input: String) {

    /** Returns the wrapped string itself. */
    override def toString: String = input

    /**
     * Replaces every run of ASCII digits with `replacement`, then folds any run
     * of whitespace-separated replacement tokens into a single token
     * (e.g. "1 2 3" becomes "_num_").
     *
     * `replacement` is treated as literal text here: it is quoted both where it
     * is inserted and where it is matched, so regex metacharacters in it are safe.
     *
     * @param replacement literal token substituted for each number
     */
    def replaceNumbers(replacement: String = "_num_"): Text = {
        import java.util.regex.{ Matcher, Pattern }
        val tokenPattern = Pattern.quote(replacement)
        val tokenLiteral = Matcher.quoteReplacement(replacement)
        // One token followed by one or more "whitespace + token" groups, so
        // runs of any length collapse, not just adjacent pairs.
        val tokenRun = tokenPattern + "(?:\\s+" + tokenPattern + ")+"
        Text(input.replaceAll("[0-9]+", tokenLiteral).replaceAll(tokenRun, tokenLiteral))
    }

    /** Replaces HTML-escape-like spans (`&` up to the next `;`) with `replacement`. */
    def replaceHTMLEscapes(replacement: String = " "): Text =
        Text(input.replaceAll("&[^;]+;", replacement))

    /**
     * Replaces each `<...>` span with a single space. The reluctant `.*?` does not
     * cross line terminators, so a tag split over several lines is left in place.
     */
    def removeHTMLTags(): Text = Text(input.replaceAll("<.*?>", " "))

    /** Replaces each `<...>` span (at least one character between the brackets) with `replacement`. */
    def replaceHTMLTags(replacement: String = " "): Text =
        Text(input.replaceAll("<[^>]+>", replacement))

    /** Replaces runs of characters other than ASCII letters, digits, `.`, `-` and whitespace. */
    def replaceNonAlphaNumeric(replacement: String = " "): Text =
        Text(input.replaceAll("[^a-zA-Z0-9\\.\\s\\-]+", replacement))

    /** Same as [[replaceNonAlphaNumeric]] but underscores are kept as well. */
    def replaceNonAlphaNumericUnderscore(replacement: String = " "): Text =
        Text(input.replaceAll("[^a-zA-Z0-9\\.\\s\\-_]+", replacement))

    /** Replaces runs of anything that is not an ASCII letter. */
    def replaceNonAlpha(replacement: String = " "): Text =
        Text(input.replaceAll("[^a-zA-Z]+", replacement))

    /** Shrinks runs of two or more hyphens down to exactly two. */
    def collapseHyphens(): Text = Text(input.replaceAll("--+", "--"))

    /** Shrinks runs of two or more underscores down to exactly two. */
    def collapseUnderscores(): Text = Text(input.replaceAll("__+", "__"))

    /** Shrinks runs of two or more periods down to exactly two. */
    def collapsePeriods(): Text = Text(input.replaceAll("\\.\\.+", ".."))

    /** Lower-cases the text via `String.toLowerCase`. */
    def toLowerCase(): Text = Text(input.toLowerCase)

    /** Upper-cases the text via `String.toUpperCase`. */
    def toUpperCase(): Text = Text(input.toUpperCase)

    /**
     * Strips leading and trailing runs of characters that are not ASCII letters
     * or digits. Interior punctuation is untouched.
     */
    def stripPunctuation(): Text =
        Text(input.replaceAll("^[^A-Za-z0-9]+", "").replaceAll("[^A-Za-z0-9]+$", ""))

    /** Compacts every whitespace run into a single space. */
    def collapse(): Text = Text(input.replaceAll("\\s+", " "))

    /** Removes whitespace from the right end of the string. */
    def rstrip(): Text = Text(input.replaceAll("\\s+$", ""))

    /** Removes whitespace from the left end of the string. */
    def lstrip(): Text = Text(input.replaceAll("^\\s+", ""))

    /** Removes leading and trailing whitespace using `String.trim`. */
    def strip(): Text = Text(input.trim)

    /** Trims both ends, then compacts interior whitespace runs. */
    def wsclean(): Text = strip().collapse()

    /**
     * Deletes every character outside the printable ASCII range 0x20-0x7E; this
     * includes tabs, newlines and all non-ASCII characters.
     *
     * @param input string to clean; defaults to this instance's wrapped text
     */
    def removeUnprintables(input: String = this.input): Text =
        Text(input.replaceAll("[^\\x20-\\x7E]", ""))

    /** Compacts whitespace runs to one space, hyphen runs to one `-`, and period runs to one `.`. */
    def collapseWhitespaceAndPunc: Text =
        Text(input.replaceAll("\\s+", " ")
            .replaceAll("[\\-]+", "-")
            .replaceAll("[\\.]+", "."))

    /**
     * Default cleaning pipeline: drop tags, drop HTML escapes, tokenise numbers,
     * drop remaining symbols (keeping underscores so the number token survives),
     * collapse hyphen/underscore runs, and normalise whitespace.
     */
    def standardTextFilter: Text =
        removeHTMLTags()
            .replaceHTMLEscapes()
            .replaceNumbers()
            .replaceNonAlphaNumericUnderscore()
            .collapseHyphens()
            .collapseUnderscores()
            .wsclean()

    /**
     * Character shingles of the text: for each requested size, in the order given,
     * the windows produced by `sliding` over the wrapped string.
     *
     * @param n  first window size
     * @param ns further window sizes
     * @note sizes must be positive; `sliding` rejects non-positive window sizes.
     */
    def toListFromShingles(n: Int, ns: Int*): List[String] =
        (n +: ns).toList.flatMap(size => input.sliding(size))

    /** Wraps the result of [[toListFromShingles]] in a `TextSequence`. */
    def toSequenceFromShingles(n: Int, ns: Int*): TextSequence =
        new TextSequence(toListFromShingles(n, ns: _*))

    /**
     * Splits the text with `String.split`.
     *
     * @param sep separator; interpreted as a regular expression by `String.split`
     */
    def toList(sep: String = " "): List[String] = input.split(sep).toList

    /** Wraps the result of [[toList]] in a `TextSequence`. */
    def toSequence(sep: String = " "): TextSequence = new TextSequence(toList(sep))

    /** True when the wrapped string has length zero. */
    def isEmpty(): Boolean = input.isEmpty
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy