All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.johnsnowlabs.reader.util.PptParser.scala Maven / Gradle / Ivy

There is a newer version: 6.0.3
Show newest version
/*
 * Copyright 2017-2024 John Snow Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.johnsnowlabs.reader.util

import com.johnsnowlabs.reader.{ElementType, HTMLElement}
import org.apache.poi.hslf.usermodel.{HSLFSlide, HSLFTable, HSLFTextShape}
import org.apache.poi.xslf.usermodel.{XSLFSlide, XSLFTable, XSLFTextShape}

import scala.collection.JavaConverters._
import scala.collection.mutable

/** Converts Apache POI PowerPoint slides into flat sequences of [[HTMLElement]]s.
  *
  * Each extractor emits an optional TITLE element first, followed by the slide's shapes in the
  * order POI returns them: one LIST_ITEM or NARRATIVE_TEXT element per text paragraph and one
  * TABLE element per table cell. Shapes of any other kind are ignored.
  */
object PptParser {

  /** Extracts content from a legacy PowerPoint slide (.ppt).
    *
    * Bulleted paragraphs always become LIST_ITEM elements; other paragraphs become
    * NARRATIVE_TEXT elements unless their text is empty, in which case they are skipped.
    *
    * NOTE(review): unlike [[extractXSLFSlideContent]], text shapes whose text equals the slide
    * title are not filtered out here, so the title may also show up as a paragraph element —
    * confirm whether that is intended.
    *
    * @param slide
    *   the HSLF slide to read
    * @return
    *   the title element (when the title is non-empty) followed by the content elements
    */
  def extractHSLFSlideContent(slide: HSLFSlide): Seq[HTMLElement] = {
    val title = Option(slide.getTitle).getOrElse("")

    val content: Seq[HTMLElement] = slide.getShapes.asScala.flatMap {
      case textShape: HSLFTextShape =>
        textShape.getTextParagraphs.asScala.flatMap { paragraph =>
          // A missing bullet character is rendered as an empty symbol.
          val bulletSymbol = Option(paragraph.getBulletChar).map(_.toString).getOrElse("")
          val paragraphText = paragraph.getTextRuns.asScala.map(_.getRawText).mkString("")
          paragraphElement(paragraph.isBullet, bulletSymbol, paragraphText)
        }

      case table: HSLFTable =>
        for {
          rowIndex <- 0 until table.getNumberOfRows
          colIndex <- 0 until table.getNumberOfColumns
        } yield {
          // Both the cell and its text are null-guarded so a missing cell yields empty content.
          val cellText = Option(table.getCell(rowIndex, colIndex)).flatMap(c => Option(c.getText))
          tableCellElement(rowIndex, colIndex, cellText)
        }

      case _ => Seq.empty[HTMLElement]
    }

    titleElements(title) ++ content
  }

  /** Extracts content from an OOXML PowerPoint slide (.pptx).
    *
    * Text shapes whose text is null or equal to the slide title are skipped, so the title is
    * not repeated as a paragraph. Paragraph handling matches [[extractHSLFSlideContent]]:
    * bulleted paragraphs always become LIST_ITEM elements, and non-bulleted paragraphs with
    * empty text are dropped instead of producing empty NARRATIVE_TEXT elements.
    *
    * @param slide
    *   the XSLF slide to read
    * @return
    *   the title element (when the title is non-empty) followed by the content elements
    */
  def extractXSLFSlideContent(slide: XSLFSlide): Seq[HTMLElement] = {
    val title = Option(slide.getTitle).getOrElse("")

    val content: Seq[HTMLElement] = slide.getShapes.asScala.flatMap {
      case textShape: XSLFTextShape =>
        // Read the shape text once; it decides whether the whole shape is skipped.
        val shapeText = textShape.getText
        if (shapeText == null || shapeText == title) Seq.empty[HTMLElement]
        else
          textShape.getTextParagraphs.asScala.flatMap { paragraph =>
            val bulletSymbol = Option(paragraph.getBulletCharacter).getOrElse("")
            val paragraphText = Option(paragraph.getText).getOrElse("")
            paragraphElement(paragraph.isBullet, bulletSymbol, paragraphText)
          }

      case table: XSLFTable =>
        table.getRows.asScala.zipWithIndex.flatMap { case (row, rowIndex) =>
          row.getCells.asScala.zipWithIndex.map { case (cell, colIndex) =>
            tableCellElement(rowIndex, colIndex, Option(cell.getText))
          }
        }

      case _ => Seq.empty[HTMLElement]
    }

    titleElements(title) ++ content
  }

  /** Wraps a non-empty slide title in a single TITLE element; an empty title yields nothing. */
  private def titleElements(title: String): Seq[HTMLElement] =
    if (title.nonEmpty) {
      Seq(HTMLElement(elementType = ElementType.TITLE, content = title, metadata = mutable.Map()))
    } else Seq.empty[HTMLElement]

  /** Builds the element for one text paragraph.
    *
    * A bulleted paragraph becomes a LIST_ITEM whose content is the bullet symbol, a space and
    * the text. A non-bulleted paragraph becomes NARRATIVE_TEXT, or `None` when its text is empty.
    */
  private def paragraphElement(
      isBullet: Boolean,
      bulletSymbol: String,
      text: String): Option[HTMLElement] =
    if (isBullet) {
      Some(
        HTMLElement(
          elementType = ElementType.LIST_ITEM,
          content = s"$bulletSymbol $text",
          metadata = mutable.Map()))
    } else if (text.nonEmpty) {
      Some(
        HTMLElement(
          elementType = ElementType.NARRATIVE_TEXT,
          content = text,
          metadata = mutable.Map()))
    } else None

  /** Builds the TABLE element for one cell: trimmed text (empty when absent) plus a
    * "tableLocation" metadata entry holding the zero-based "(row, column)" position.
    */
  private def tableCellElement(
      rowIndex: Int,
      colIndex: Int,
      cellText: Option[String]): HTMLElement =
    HTMLElement(
      elementType = ElementType.TABLE,
      content = cellText.getOrElse("").trim,
      metadata = mutable.Map("tableLocation" -> s"($rowIndex, $colIndex)"))

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy