
// com.johnsnowlabs.reader.SparkNLPReader.scala Maven / Gradle / Ivy
/*
* Copyright 2017-2024 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.johnsnowlabs.reader
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.reader.util.pdf.TextStripperType
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.DataFrame
import scala.collection.JavaConverters._
class SparkNLPReader(params: java.util.Map[String, String] = new java.util.HashMap()) {
/** Instantiates class to read HTML files.
*
* Two types of input paths are supported,
*
* htmlPath: this is a path to a directory of HTML files or a path to an HTML file E.g.
* "path/html/files"
*
* url: this is the URL or set of URLs of a website. E.g., "https://www.wikipedia.org"
*
* ==Example==
* {{{
* val url = "https://www.wikipedia.org"
* val sparkNLPReader = new SparkNLPReader()
* val htmlDf = sparkNLPReader.html(url)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val htmlDf = SparkNLP.read.html(url)
* }}}
* {{{
* htmlDf.show(false)
*
* +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |url |html |
* +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
* +--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*
* htmlDf.printSchema()
* root
* |-- url: string (nullable = true)
* |-- html: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*
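* @param htmlPath
* Path to a directory of HTML files, a single HTML file, or a URL. Custom configuration is
* taken from the `params` map given to the class constructor.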
*/
def html(htmlPath: String): DataFrame = {
  // The reader is configured from the constructor-level `params`
  // (title font size and store-content flag), then asked to read the given location.
  new HTMLReader(getTitleFontSize, getStoreContent).read(htmlPath)
}
/** Reads several HTML sources in a single call.
 *
 * @param urls
 *   entries handed unchanged to `HTMLReader.read`
 * @return
 *   the DataFrame produced by `HTMLReader.read`
 */
def html(urls: Array[String]): DataFrame =
  new HTMLReader(getTitleFontSize, getStoreContent).read(urls)
/** Overload accepting a Java list; the list is converted to an array before reading.
 *
 * @param urls
 *   entries to read, as a `java.util.List`
 * @return
 *   the DataFrame produced by `HTMLReader.read`
 */
def html(urls: java.util.List[String]): DataFrame = {
  val reader = new HTMLReader(getTitleFontSize, getStoreContent)
  val urlArray: Array[String] = urls.asScala.toArray
  reader.read(urlArray)
}
/** Reads the `titleFontSize` entry of `params` as an `Int`.
 *
 * @return
 *   the parsed value, or 16 when the key is absent or its value cannot be parsed
 */
private def getTitleFontSize: Int =
  try params.asScala.get("titleFontSize").fold(16)(_.toInt)
  catch {
    // NumberFormatException is an IllegalArgumentException, so bad numbers land here.
    case _: IllegalArgumentException => 16
  }
/** Reads the `storeContent` entry of `params` as a `Boolean`.
 *
 * @return
 *   the parsed value, or `false` when the key is absent or its value is not a valid boolean
 */
private def getStoreContent: Boolean =
  try params.asScala.get("storeContent").fold(false)(_.toBoolean)
  catch {
    case _: IllegalArgumentException => false
  }
/** Instantiates class to read email files.
*
* emailPath: this is a path to a directory of email files or a path to an email file E.g.
* "path/email/files"
*
* ==Example==
* {{{
* val emailsPath = "home/user/emails-directory"
* val sparkNLPReader = new SparkNLPReader()
* val emailDf = sparkNLPReader.email(emailsPath)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val emailDf = SparkNLP.read.email(emailsPath)
* }}}
*
* {{{
* emailDf.select("email").show(false)
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |email |
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |[{Title, Email Text Attachments, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano }}, {NarrativeText, Email test with two text attachments\r\n\r\nCheers,\r\n\r\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/plain}}, {NarrativeText, \r\n\r\n\r\n\r\n\r\n\r\nEmail test with two text attachments\r\n\r\n
\r\n\r\n\r\nCheers,\r\n\r\n
\r\n\r\n\r\n\r\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/html}}, {Attachment, filename.txt, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , contentType -> text/plain; name="filename.txt"}}, {NarrativeText, This is the content of the file.\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/plain}}, {Attachment, filename2.txt, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , contentType -> text/plain; name="filename2.txt"}}, {NarrativeText, This is an additional content file.\n, {sent_to -> Danilo Burbano , sent_from -> Danilo Burbano , mimeType -> text/plain}}]|
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*
* emailDf.printSchema()
* root
* |-- path: string (nullable = true)
* |-- content: binary (nullable = true)
* |-- email: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*
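* @param emailPath
* Path to a directory of email files or to a single email file. Custom configuration is
* taken from the `params` map given to the class constructor.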
*/
def email(emailPath: String): DataFrame = {
  // Attachment handling and the store-content flag both come from the constructor-level `params`.
  new EmailReader(getAddAttachmentContent, getStoreContent).read(emailPath)
}
/** Reads the `addAttachmentContent` entry of `params` as a `Boolean`.
 *
 * @return
 *   the parsed value, or `false` when the key is absent or its value is not a valid boolean
 */
private def getAddAttachmentContent: Boolean = {
  val fallback = false
  try {
    params.asScala.get("addAttachmentContent") match {
      case Some(raw) => raw.toBoolean
      case None => fallback
    }
  } catch {
    case _: IllegalArgumentException => fallback
  }
}
/** Instantiates class to read Word files.
*
* docPath: this is a path to a directory of Word files or a path to a Word file E.g.
* "path/word/files"
*
* ==Example==
* {{{
* val docsPath = "home/user/word-directory"
* val sparkNLPReader = new SparkNLPReader()
* val docsDf = sparkNLPReader.doc(docsPath)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val docsDf = SparkNLP.read.doc(docsPath)
* }}}
*
* {{{
* docsDf.select("doc").show(false)
* +----------------------------------------------------------------------------------------------------------------------------------------------------+
* |doc | |
* +----------------------------------------------------------------------------------------------------------------------------------------------------+
* |[{Table, Header Col 1, {}}, {Table, Header Col 2, {}}, {Table, Lorem ipsum, {}}, {Table, A Link example, {}}, {NarrativeText, Dolor sit amet, {}}] |
* +----------------------------------------------------------------------------------------------------------------------------------------------------+
*
* docsDf.printSchema()
* root
* |-- path: string (nullable = true)
* |-- content: binary (nullable = true)
* |-- doc: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*
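* @param docPath
* Path to a directory of Word files or to a single Word file. Custom configuration is
* taken from the `params` map given to the class constructor.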
*/
def doc(docPath: String): DataFrame = {
  // Only the store-content flag from the constructor-level `params` is passed to the Word reader.
  new WordReader(getStoreContent).doc(docPath)
}
/** Instantiates class to read PDF files.
*
* pdfPath: this is a path to a directory of PDF files or a path to a PDF file E.g.
* "path/pdfs/"
*
* ==Example==
* {{{
* val pdfsPath = "home/user/pdfs-directory"
* val sparkNLPReader = new SparkNLPReader()
* val pdfDf = sparkNLPReader.pdf(pdfsPath)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val pdfDf = SparkNLP.read.pdf(pdfsPath)
* }}}
*
* {{{
* pdfDf.show(false)
* +--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
* | path| modificationTime|length| text|height_dimension|width_dimension| content|exception|pagenum|
* +--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
* |file:/content/pdf...|2025-01-15 20:48:...| 25803|This is a Title \...| 842| 596|[25 50 44 46 2D 3...| NULL| 0|
* |file:/content/pdf...|2025-01-15 20:48:...| 9487|This is a page.\n...| 841| 595|[25 50 44 46 2D 3...| NULL| 0|
* +--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
*
* pdfDf.printSchema()
* root
* |-- path: string (nullable = true)
* |-- modificationTime: timestamp (nullable = true)
* |-- length: long (nullable = true)
* |-- text: string (nullable = true)
* |-- height_dimension: integer (nullable = true)
* |-- width_dimension: integer (nullable = true)
* |-- content: binary (nullable = true)
* |-- exception: string (nullable = true)
* |-- pagenum: integer (nullable = true)
* }}}
*
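* @param pdfPath
* Path to a directory of PDF files or to a single PDF file. Custom configuration is
* taken from the `params` map given to the class constructor.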
*/
def pdf(pdfPath: String): DataFrame = {
  val session = ResourceHelper.spark
  // Session-wide switch, set before the PdfToText stage is built and never reverted here.
  // NOTE(review): presumably needed by an untyped Scala UDF inside PdfToText — confirm.
  session.conf.set("spark.sql.legacy.allowUntypedScalaUDF", "true")

  // Every stage option is resolved from the constructor-level `params` via its getter.
  val extractionStage = new PdfToText()
    .setStoreSplittedPdf(getStoreSplittedPdf)
    .setSplitPage(getSplitPage)
    .setOnlyPageNum(getOnlyPageNum)
    .setTextStripper(getTextStripper)
    .setSort(getSort)

  // Files are loaded through Spark's "binaryFile" source and pushed through a one-stage pipeline.
  val rawFiles = session.read.format("binaryFile").load(pdfPath)
  new Pipeline()
    .setStages(Array(extractionStage))
    .fit(rawFiles)
    .transform(rawFiles)
}
/** Reads the `storeSplittedPdf` entry of `params` as a `Boolean`.
 *
 * @return
 *   the parsed value, or `false` when the key is absent or its value is not a valid boolean
 */
private def getStoreSplittedPdf: Boolean =
  try params.asScala.get("storeSplittedPdf").fold(false)(_.toBoolean)
  catch {
    case _: IllegalArgumentException => false
  }
/** Reads the `splitPage` entry of `params` as a `Boolean`.
 *
 * @return
 *   the parsed value, or `true` when the key is absent or its value is not a valid boolean
 */
private def getSplitPage: Boolean = {
  val fallback = true
  try {
    params.asScala.get("splitPage") match {
      case Some(raw) => raw.toBoolean
      case None => fallback
    }
  } catch {
    case _: IllegalArgumentException => fallback
  }
}
/** Reads the `onlyPageNum` entry of `params` as a `Boolean`.
 *
 * @return
 *   the parsed value, or `false` when the key is absent or its value is not a valid boolean
 */
private def getOnlyPageNum: Boolean =
  try params.asScala.get("onlyPageNum").fold(false)(_.toBoolean)
  catch {
    case _: IllegalArgumentException => false
  }
/** Reads the `textStripper` entry of `params`.
 *
 * The value is used as-is (no parsing), so no exception handling is needed. A key that is
 * absent, or present with a `null` value, yields the default instead of leaking `null` to
 * the caller.
 *
 * @return
 *   the configured text stripper name, or `TextStripperType.PDF_TEXT_STRIPPER` by default
 */
private def getTextStripper: String =
  // java.util.Map#get returns null for a missing key and for a null value; Option folds both.
  Option(params.get("textStripper")).getOrElse(TextStripperType.PDF_TEXT_STRIPPER)
/** Reads the `sort` entry of `params` as a `Boolean`.
 *
 * @return
 *   the parsed value, or `false` when the key is absent or its value is not a valid boolean
 */
private def getSort: Boolean = {
  val fallback = false
  try {
    params.asScala.get("sort") match {
      case Some(raw) => raw.toBoolean
      case None => fallback
    }
  } catch {
    case _: IllegalArgumentException => fallback
  }
}
/** Instantiates class to read Excel files.
*
* docPath: this is a path to a directory of Excel files or a path to an Excel file E.g.
* "path/excel/files"
*
* ==Example==
* {{{
* val docsPath = "home/user/excel-directory"
* val sparkNLPReader = new SparkNLPReader()
* val xlsDf = sparkNLPReader.xls(docsPath)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val xlsDf = SparkNLP.read.xls(docsPath)
* }}}
*
* {{{
* xlsDf.select("xls").show(false)
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |xls |
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |[{Title, Financial performance, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Quarterly revenue\tNine quarters to 30 June 2023\t\t\t1.0, {SheetName -> Index}}, {NarrativeText, Group financial performance\tFY 22\tFY 23\t\t2.0, {SheetName -> Index}}, {NarrativeText, Segmental results\tFY 22\tFY 23\t\t3.0, {SheetName -> Index}}, {NarrativeText, Segmental analysis\tFY 22\tFY 23\t\t4.0, {SheetName -> Index}}, {NarrativeText, Cash flow\tFY 22\tFY 23\t\t5.0, {SheetName -> Index}}, {Title, Operational metrics, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Mobile customers\tNine quarters to 30 June 2023\t\t\t6.0, {SheetName -> Index}}, {NarrativeText, Fixed broadband customers\tNine quarters to 30 June 2023\t\t\t7.0, {SheetName -> Index}}, {NarrativeText, Marketable homes passed\tNine quarters to 30 June 2023\t\t\t8.0, {SheetName -> Index}}, {NarrativeText, TV customers\tNine quarters to 30 June 2023\t\t\t9.0, {SheetName -> Index}}, {NarrativeText, Converged customers\tNine quarters to 30 June 2023\t\t\t10.0, {SheetName -> Index}}, {NarrativeText, Mobile churn\tNine quarters to 30 June 2023\t\t\t11.0, {SheetName -> Index}}, {NarrativeText, Mobile data usage\tNine quarters to 30 June 2023\t\t\t12.0, {SheetName -> Index}}, {NarrativeText, Mobile ARPU\tNine quarters to 30 June 2023\t\t\t13.0, {SheetName -> Index}}, {Title, Other, {SheetName -> Index}}, {Title, Topic\tPeriod\t\t\tPage, {SheetName -> Index}}, {NarrativeText, Average foreign exchange rates\tNine quarters to 30 June 2023\t\t\t14.0, {SheetName -> Index}}, {NarrativeText, Guidance rates\tFY 23/24\t\t\t14.0, {SheetName -> Index}}]|
* +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*
* xlsDf.printSchema()
* root
* |-- path: string (nullable = true)
* |-- content: binary (nullable = true)
* |-- xls: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*
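* @param docPath
* Path to a directory of Excel files or to a single Excel file. Custom configuration is
* taken from the `params` map given to the class constructor.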
*/
def xls(docPath: String): DataFrame = {
  // Title font size, cell separator and store-content flag come from the constructor-level `params`.
  new ExcelReader(getTitleFontSize, getCellSeparator, getStoreContent).xls(docPath)
}
/** Reads the `cellSeparator` entry of `params`.
 *
 * A key that is absent, or present with a `null` value, yields the default tab separator
 * instead of leaking `null` to the caller.
 *
 * @return
 *   the configured separator, or `"\t"` by default
 */
private def getCellSeparator: String =
  // java.util.Map#get returns null for a missing key and for a null value; Option folds both.
  Option(params.get("cellSeparator")).getOrElse("\t")
/** Instantiates class to read PowerPoint files.
*
* docPath: this is a path to a directory of PowerPoint files or a path to a PowerPoint file E.g.
* "path/power-point/files"
*
* ==Example==
* {{{
* val docsPath = "home/user/power-point-directory"
* val sparkNLPReader = new SparkNLPReader()
* val pptDf = sparkNLPReader.ppt(docsPath)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val pptDf = SparkNLP.read.ppt(docsPath)
* }}}
*
* {{{
* pptDf.select("ppt").show(false)
* +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |ppt |
* +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|
* +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*
* pptDf.printSchema()
* root
* |-- path: string (nullable = true)
* |-- content: binary (nullable = true)
* |-- ppt: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*
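* @param docPath
* Path to a directory of PowerPoint files or to a single PowerPoint file. Custom
* configuration is taken from the `params` map given to the class constructor.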
*/
def ppt(docPath: String): DataFrame = {
  // Only the store-content flag from the constructor-level `params` is passed to the reader.
  new PowerPointReader(getStoreContent).ppt(docPath)
}
/** Instantiates class to read txt files.
*
* filePath: this is a path to a directory of TXT files or a path to a TXT file E.g.
* "path/txt/files"
*
* ==Example==
* {{{
* val filePath = "home/user/txt/files"
* val sparkNLPReader = new SparkNLPReader()
* val txtDf = sparkNLPReader.txt(filePath)
* }}}
*
* ==Example 2==
* You can use SparkNLP for one line of code
* {{{
* val txtDf = SparkNLP.read.txt(filePath)
* }}}
*
* {{{
* txtDf.select("txt").show(false)
* +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |txt |
* +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
* |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {Title, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|
* +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
*
* txtDf.printSchema()
* root
* |-- path: string (nullable = true)
* |-- content: binary (nullable = true)
* |-- txt: array (nullable = true)
* | |-- element: struct (containsNull = true)
* | | |-- elementType: string (nullable = true)
* | | |-- content: string (nullable = true)
* | | |-- metadata: map (nullable = true)
* | | | |-- key: string
* | | | |-- value: string (valueContainsNull = true)
* }}}
*
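* @param filePath
* Path to a directory of TXT files or to a single TXT file. Custom configuration is
* taken from the `params` map given to the class constructor.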
*/
def txt(filePath: String): DataFrame = {
  // Title length size and store-content flag come from the constructor-level `params`.
  new TextReader(getTitleLengthSize, getStoreContent).txt(filePath)
}
/** Reads the `titleLengthSize` entry of `params` as an `Int`.
 *
 * @return
 *   the parsed value, or 50 when the key is absent or its value cannot be parsed
 */
private def getTitleLengthSize: Int =
  try params.asScala.get("titleLengthSize").fold(50)(_.toInt)
  catch {
    // NumberFormatException is an IllegalArgumentException, so bad numbers land here.
    case _: IllegalArgumentException => 50
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy