com.johnsnowlabs.reader.SparkNLPReader.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-nlp_2.12 Show documentation
spark-nlp
There is a newer version: 6.0.3
/*
 * Copyright 2017-2024 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.johnsnowlabs.reader

import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.reader.util.pdf.TextStripperType
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.DataFrame

import scala.collection.JavaConverters._

class SparkNLPReader(params: java.util.Map[String, String] = new java.util.HashMap()) {

  /** Instantiates class to read HTML files.
    *
    * Two types of input paths are supported,
    *
    * htmlPath: this is a path to a directory of HTML files or a path to an HTML file E.g.
    * "path/html/files"
    *
    * url: this is the URL or set of URLs of a website, e.g. "https://www.wikipedia.org"
    *
    * ==Example==
    * {{{
    * val url = "https://www.wikipedia.org"
    * val sparkNLPReader = new SparkNLPReader()
    * val htmlDf = sparkNLPReader.html(url)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val htmlDf = SparkNLP.read.html(url)
    * }}}
    * {{{
    * htmlDf.show(false)
    *
    * +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |url                 |html                                                                                                                                                                                                                                                                                                                            |
    * +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}]   |
    * +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    *
    * htmlDf.printSchema()
    * root
    *  |-- url: string (nullable = true)
    *  |-- html: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- elementType: string (nullable = true)
    *  |    |    |-- content: string (nullable = true)
    *  |    |    |-- metadata: map (nullable = true)
    *  |    |    |    |-- key: string
    *  |    |    |    |-- value: string (valueContainsNull = true)
    * }}}
    *
    * @param params
    *   Parameter with custom configuration
    */

  def html(htmlPath: String): DataFrame = {
    // The reader is configured from `params` (title font size and store-content flag);
    // the input string is handed to HTMLReader.read unchanged.
    new HTMLReader(getTitleFontSize, getStoreContent).read(htmlPath)
  }

  /** Overload of `html` that accepts several inputs at once.
    *
    * Builds an `HTMLReader` from the title font size and store-content flag resolved from
    * `params`, then passes the array to `HTMLReader.read` as-is.
    *
    * @param urls
    *   entries forwarded unchanged to the reader
    * @return
    *   whatever DataFrame `HTMLReader.read` produces for the given array
    */
  def html(urls: Array[String]): DataFrame = {
    val reader = new HTMLReader(getTitleFontSize, getStoreContent)
    val result = reader.read(urls)
    result
  }

  /** Overload of `html` that accepts a Java list.
    *
    * The list is converted to a Scala `Array[String]` and then read through an `HTMLReader`
    * configured from `params`, exactly like the array overload.
    *
    * @param urls
    *   Java list whose elements are forwarded to the reader
    * @return
    *   whatever DataFrame `HTMLReader.read` produces for the converted array
    */
  def html(urls: java.util.List[String]): DataFrame = {
    val reader = new HTMLReader(getTitleFontSize, getStoreContent)
    // Convert after the reader is built so the evaluation order stays as before.
    val urlArray: Array[String] = urls.asScala.toArray
    reader.read(urlArray)
  }

  /** Resolves the "titleFontSize" entry of `params` as an `Int`.
    *
    * @return
    *   the parsed value, or 16 when the key is absent or the conversion raises an
    *   `IllegalArgumentException` (which includes `NumberFormatException`)
    */
  private def getTitleFontSize: Int = {
    val fallbackSize = 16
    try params.asScala.get("titleFontSize").map(_.toInt).getOrElse(fallbackSize)
    catch {
      case _: IllegalArgumentException => fallbackSize
    }
  }

  /** Resolves the "storeContent" entry of `params` as a `Boolean`.
    *
    * @return
    *   the parsed flag, or `false` when the key is absent or `toBoolean` rejects the value
    *   with an `IllegalArgumentException`
    */
  private def getStoreContent: Boolean = {
    val whenUnset = false
    try params.asScala.get("storeContent").fold(whenUnset)(_.toBoolean)
    catch {
      case _: IllegalArgumentException => whenUnset
    }
  }

  /** Instantiates class to read email files.
    *
    * emailPath: this is a path to a directory of email files or a path to an email file E.g.
    * "path/email/files"
    *
    * ==Example==
    * {{{
    * val emailsPath = "home/user/emails-directory"
    * val sparkNLPReader = new SparkNLPReader()
    * val emailDf = sparkNLPReader.email(emailsPath)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val emailDf = SparkNLP.read.email(emailsPath)
    * }}}
    *
    * {{{
    * emailDf.select("email").show(false)
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |email                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
                                                                                                                                                                                                                                                                               |
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano }}, {NarrativeText, Email  test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/plain}}, {NarrativeText, \r\n\r\n\r\n\r\n\r\n\r\nEmail  test with two text attachments\r\n\r\n
\r\n
\r\n\r\nCheers,
\r\n\r\n
\r\n
\r\n\r\n\r\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/plain}}]|
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    *
    * emailDf.printSchema()
    * root
    *  |-- path: string (nullable = true)
    *  |-- content: binary (nullable = true)
    *  |-- email: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- elementType: string (nullable = true)
    *  |    |    |-- content: string (nullable = true)
    *  |    |    |-- metadata: map (nullable = true)
    *  |    |    |    |-- key: string
    *  |    |    |    |-- value: string (valueContainsNull = true)
    * }}}
    *
    * @param params
    *   Parameter with custom configuration
    */

  def email(emailPath: String): DataFrame = {
    // Both reader options (attachment-content flag, store-content flag) are resolved from
    // `params`; the path is handed to EmailReader.read unchanged.
    new EmailReader(getAddAttachmentContent, getStoreContent).read(emailPath)
  }

  /** Resolves the "addAttachmentContent" entry of `params` as a `Boolean`.
    *
    * @return
    *   the parsed flag, or `false` when the key is absent or `toBoolean` rejects the value
    *   with an `IllegalArgumentException`
    */
  private def getAddAttachmentContent: Boolean =
    try {
      params.asScala.get("addAttachmentContent") match {
        case Some(raw) => raw.toBoolean
        case None => false
      }
    } catch {
      case _: IllegalArgumentException => false
    }

  /** Instantiates class to read Word files.
    *
    * docPath: this is a path to a directory of Word files or a path to a Word file E.g.
    * "path/word/files"
    *
    * ==Example==
    * {{{
    * val docsPath = "home/user/word-directory"
    * val sparkNLPReader = new SparkNLPReader()
    * val docsDf = sparkNLPReader.doc(docsPath)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val docsDf = SparkNLP.read.doc(docsPath)
    * }}}
    *
    * {{{
    * docsDf.select("doc").show(false)
    * +----------------------------------------------------------------------------------------------------------------------------------------------------+
    * |doc                                                                                                                                                 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
                                                                                                                                                                                                                                                                            |
    * +----------------------------------------------------------------------------------------------------------------------------------------------------+
    * |[{Table, Header Col 1, {}}, {Table, Header Col 2, {}}, {Table, Lorem ipsum, {}}, {Table, A Link example, {}}, {NarrativeText, Dolor sit amet, {}}]  |
    * +----------------------------------------------------------------------------------------------------------------------------------------------------+
    *
    * docsDf.printSchema()
    * root
    *  |-- path: string (nullable = true)
    *  |-- content: binary (nullable = true)
    *  |-- doc: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- elementType: string (nullable = true)
    *  |    |    |-- content: string (nullable = true)
    *  |    |    |-- metadata: map (nullable = true)
    *  |    |    |    |-- key: string
    *  |    |    |    |-- value: string (valueContainsNull = true)
    * }}}
    *
    * @param params
    *   Parameter with custom configuration
    */

  def doc(docPath: String): DataFrame = {
    // The store-content flag comes from `params`; the path goes to WordReader.doc unchanged.
    new WordReader(getStoreContent).doc(docPath)
  }

  /** Instantiates class to read PDF files.
    *
    * pdfPath: this is a path to a directory of PDF files or a path to a PDF file E.g.
    * "path/pdfs/"
    *
    * ==Example==
    * {{{
    * val pdfsPath = "home/user/pdfs-directory"
    * val sparkNLPReader = new SparkNLPReader()
    * val pdfDf = sparkNLPReader.pdf(pdfsPath)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val pdfDf = SparkNLP.read.pdf(pdfsPath)
    * }}}
    *
    * {{{
    * pdfDf.show(false)
    * +--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
    * |                path|    modificationTime|length|                text|height_dimension|width_dimension|             content|exception|pagenum|
    * +--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
    * |file:/content/pdf...|2025-01-15 20:48:...| 25803|This is a Title \...|             842|            596|[25 50 44 46 2D 3...|     NULL|      0|
    * |file:/content/pdf...|2025-01-15 20:48:...|  9487|This is a page.\n...|             841|            595|[25 50 44 46 2D 3...|     NULL|      0|
    * +--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
    *
    * pdfDf.printSchema()
    * root
    *  |-- path: string (nullable = true)
    *  |-- modificationTime: timestamp (nullable = true)
    *  |-- length: long (nullable = true)
    *  |-- text: string (nullable = true)
    *  |-- height_dimension: integer (nullable = true)
    *  |-- width_dimension: integer (nullable = true)
    *  |-- content: binary (nullable = true)
    *  |-- exception: string (nullable = true)
    *  |-- pagenum: integer (nullable = true)
    * }}}
    *
    * @param params
    *   Parameter with custom configuration
    */
  def pdf(pdfPath: String): DataFrame = {
    val session = ResourceHelper.spark
    // Set on the session obtained from ResourceHelper before the PdfToText stage is built.
    session.conf.set("spark.sql.legacy.allowUntypedScalaUDF", "true")

    // Every stage option is resolved from `params` through the private getters.
    val extractionStage = new PdfToText()
      .setStoreSplittedPdf(getStoreSplittedPdf)
      .setSplitPage(getSplitPage)
      .setOnlyPageNum(getOnlyPageNum)
      .setTextStripper(getTextStripper)
      .setSort(getSort)

    // Load the input with Spark's "binaryFile" source, then fit and apply a one-stage
    // pipeline on that same DataFrame.
    val rawPdfs = session.read.format("binaryFile").load(pdfPath)
    new Pipeline()
      .setStages(Array(extractionStage))
      .fit(rawPdfs)
      .transform(rawPdfs)
  }

  /** Resolves the "storeSplittedPdf" entry of `params` as a `Boolean`.
    *
    * @return
    *   the parsed flag, or `false` when the key is absent or `toBoolean` rejects the value
    *   with an `IllegalArgumentException`
    */
  private def getStoreSplittedPdf: Boolean = {
    val configured = params.asScala.get("storeSplittedPdf")
    try configured.map(_.toBoolean).getOrElse(false)
    catch {
      case _: IllegalArgumentException => false
    }
  }

  /** Resolves the "splitPage" entry of `params` as a `Boolean`.
    *
    * Unlike the other boolean options in this class, this one defaults to `true`.
    *
    * @return
    *   the parsed flag, or `true` when the key is absent or `toBoolean` rejects the value
    *   with an `IllegalArgumentException`
    */
  private def getSplitPage: Boolean = {
    val enabledByDefault = true
    try params.asScala.get("splitPage").fold(enabledByDefault)(_.toBoolean)
    catch {
      case _: IllegalArgumentException => enabledByDefault
    }
  }

  /** Resolves the "onlyPageNum" entry of `params` as a `Boolean`.
    *
    * @return
    *   the parsed flag, or `false` when the key is absent or `toBoolean` rejects the value
    *   with an `IllegalArgumentException`
    */
  private def getOnlyPageNum: Boolean =
    try {
      params.asScala.get("onlyPageNum") match {
        case None => false
        case Some(raw) => raw.toBoolean
      }
    } catch {
      case _: IllegalArgumentException => false
    }

  /** Resolves the text-stripper name passed to `PdfToText.setTextStripper`.
    *
    * Reads the "textStripper" entry of `params`. No parsing is involved, so there is nothing
    * to catch here; the only degenerate input is a key that is missing or mapped to `null`,
    * and both cases fall back to `TextStripperType.PDF_TEXT_STRIPPER` instead of handing
    * `null` to the caller.
    *
    * @return
    *   the configured value, or `TextStripperType.PDF_TEXT_STRIPPER` when unset or null
    */
  private def getTextStripper: String =
    // Option(...) turns both "key absent" and "key -> null" into None.
    Option(params.get("textStripper")).getOrElse(TextStripperType.PDF_TEXT_STRIPPER)

  /** Resolves the "sort" entry of `params` as a `Boolean`.
    *
    * @return
    *   the parsed flag, or `false` when the key is absent or `toBoolean` rejects the value
    *   with an `IllegalArgumentException`
    */
  private def getSort: Boolean =
    // `exists` yields false for a missing key, matching the original "false" default.
    try params.asScala.get("sort").exists(_.toBoolean)
    catch {
      case _: IllegalArgumentException => false
    }

  /** Instantiates class to read Excel files.
    *
    * docPath: this is a path to a directory of Excel files or a path to an Excel file E.g.
    * "path/excel/files"
    *
    * ==Example==
    * {{{
    * val docsPath = "home/user/excel-directory"
    * val sparkNLPReader = new SparkNLPReader()
    * val xlsDf = sparkNLPReader.xls(docsPath)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val xlsDf = SparkNLP.read.xls(docsPath)
    * }}}
    *
    * {{{
    * xlsDf.select("xls").show(false)
    * +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |xls                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
    * +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |[{Title, Financial performance, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Quarterly revenue\tNine quarters to 30 June 2023\t\t\t1.0, {SheetName -> Index}}, {NarrativeText, Group financial performance\tFY 22\tFY 23\t\t2.0, {SheetName -> Index}}, {NarrativeText, Segmental results\tFY 22\tFY 23\t\t3.0, {SheetName -> Index}}, {NarrativeText, Segmental analysis\tFY 22\tFY 23\t\t4.0, {SheetName -> Index}}, {NarrativeText, Cash flow\tFY 22\tFY 23\t\t5.0, {SheetName -> Index}}, {Title, Operational metrics, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Mobile customers\tNine quarters to 30 June 2023\t\t\t6.0, {SheetName -> Index}}, {NarrativeText, Fixed broadband customers\tNine quarters to 30 June 2023\t\t\t7.0, {SheetName -> Index}}, {NarrativeText, Marketable homes passed\tNine quarters to 30 June 2023\t\t\t8.0, {SheetName -> Index}}, {NarrativeText, TV customers\tNine quarters to 30 June 2023\t\t\t9.0, {SheetName -> Index}}, {NarrativeText, Converged customers\tNine quarters to 30 June 2023\t\t\t10.0, {SheetName -> Index}}, {NarrativeText, Mobile churn\tNine quarters to 30 June 2023\t\t\t11.0, {SheetName -> Index}}, {NarrativeText, Mobile data usage\tNine quarters to 30 June 2023\t\t\t12.0, {SheetName -> Index}}, {NarrativeText, Mobile ARPU\tNine quarters to 30 June 2023\t\t\t13.0, {SheetName -> Index}}, {Title, Other, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Average foreign exchange rates\tNine quarters to 30 June 2023\t\t\t14.0, {SheetName -> Index}}, {NarrativeText, Guidance rates\tFY 23/24\t\t\t14.0, {SheetName -> Index}}]|
    * +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    *
    * xlsDf.printSchema()
    * root
    *  |-- path: string (nullable = true)
    *  |-- content: binary (nullable = true)
    *  |-- xls: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- elementType: string (nullable = true)
    *  |    |    |-- content: string (nullable = true)
    *  |    |    |-- metadata: map (nullable = true)
    *  |    |    |    |-- key: string
    *  |    |    |    |-- value: string (valueContainsNull = true)
    * }}}
    *
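    * @param docPath
    *   Path to an Excel file or to a directory of Excel files. Custom configuration is read from
    *   the `params` map given to the constructor.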
    */

  def xls(docPath: String): DataFrame = {
    // Resolve the reader options up front, in the same left-to-right order in which they are
    // passed to the ExcelReader constructor.
    val titleFontSize = getTitleFontSize
    val cellSeparator = getCellSeparator
    val storeContent = getStoreContent
    new ExcelReader(titleFontSize, cellSeparator, storeContent).xls(docPath)
  }

  /** Returns the value stored under "cellSeparator" in `params`, or a tab character when the key
    * is absent.
    */
  private def getCellSeparator: String =
    params.asScala.get("cellSeparator") match {
      case Some(separator) => separator
      case None => "\t"
    }

  /** Instantiates class to read PowerPoint files.
    *
    * docPath: this is a path to a directory of PowerPoint files or a path to a PowerPoint file
    * E.g. "path/power-point/files"
    *
    * ==Example==
    * {{{
    * val docsPath = "home/user/power-point-directory"
    * val sparkNLPReader = new SparkNLPReader()
    * val pptDf = sparkNLPReader.ppt(docsPath)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val pptDf = SparkNLP.read.ppt(docsPath)
    * }}}
    *
    * {{{
    * pptDf.select("ppt").show(false)
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |ppt                                                                                                                                                                                                                                                                                                                      |
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    *
    * pptDf.printSchema()
    * root
    *  |-- path: string (nullable = true)
    *  |-- content: binary (nullable = true)
    *  |-- ppt: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- elementType: string (nullable = true)
    *  |    |    |-- content: string (nullable = true)
    *  |    |    |-- metadata: map (nullable = true)
    *  |    |    |    |-- key: string
    *  |    |    |    |-- value: string (valueContainsNull = true)
    * }}}
    *
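    * @param docPath
    *   Path to a PowerPoint file or to a directory of PowerPoint files. Custom configuration is
    *   supplied through the `params` map given to the constructor.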
    */

  def ppt(docPath: String): DataFrame = {
    // The PowerPoint reader is configured with the store-content option only.
    val storeContent = getStoreContent
    new PowerPointReader(storeContent).ppt(docPath)
  }

  /** Instantiates class to read txt files.
    *
    * filePath: this is a path to a directory of TXT files or a path to a TXT file E.g.
    * "path/txt/files"
    *
    * ==Example==
    * {{{
    * val filePath = "home/user/txt/files"
    * val sparkNLPReader = new SparkNLPReader()
    * val txtDf = sparkNLPReader.txt(filePath)
    * }}}
    *
    * ==Example 2==
    * You can use SparkNLP for one line of code
    * {{{
    * val txtDf = SparkNLP.read.txt(filePath)
    * }}}
    *
    * {{{
    * txtDf.select("txt").show(false)
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |txt                                                                                                                                                                                                                                                                                                                                                                                                                                        |
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    * |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {Title, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|
    * +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
    *
    * txtDf.printSchema()
    * root
    *  |-- path: string (nullable = true)
    *  |-- content: binary (nullable = true)
    *  |-- txt: array (nullable = true)
    *  |    |-- element: struct (containsNull = true)
    *  |    |    |-- elementType: string (nullable = true)
    *  |    |    |-- content: string (nullable = true)
    *  |    |    |-- metadata: map (nullable = true)
    *  |    |    |    |-- key: string
    *  |    |    |    |-- value: string (valueContainsNull = true)
    * }}}
    *
    * @param filePath
    *   Path to a TXT file or to a directory of TXT files. Custom configuration is read from the
    *   `params` map given to the constructor.
    */
  def txt(filePath: String): DataFrame = {
    // Resolve the options in the same order in which the TextReader constructor receives them.
    val titleLengthSize = getTitleLengthSize
    val storeContent = getStoreContent
    new TextReader(titleLengthSize, storeContent).txt(filePath)
  }

  /** Reads the "titleLengthSize" entry of `params` as an Int.
    *
    * @return
    *   the parsed entry; 50 when the key is absent or when reading/parsing raises an
    *   IllegalArgumentException (NumberFormatException is a subclass of it)
    */
  private def getTitleLengthSize: Int =
    try {
      val configured = params.asScala.getOrElse("titleLengthSize", "50")
      configured.toInt
    } catch {
      case _: IllegalArgumentException => 50
    }

}