All downloads are free. Search and download functionality uses the official Maven repository.

guru.nidi.text.transform.parse.html.TableParser.scala Maven / Gradle / Ivy

/**
 * Copyright (C) 2013 Stefan Niederhauser ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package guru.nidi.text.transform.parse.html

import guru.nidi.text.transform.Attribute._
import guru.nidi.text.transform.AttributeValue._
import guru.nidi.text.transform.Name._
import guru.nidi.text.transform.parse.TagCustomizerParser
import guru.nidi.text.transform.{Attribute, Segment}

import scala.xml.{Node, NodeSeq}

/**
 * Builds a `TABLE` segment from the XML nodes of an HTML table.
 *
 * The parser is stateful: `table`, `rowIndex`, `colIndex` and `maxColumns` are
 * updated while parsing, and a second call to [[parse]] on the same instance
 * keeps accumulating into the same `table`.
 *
 * Cells are stored on the table under attributes named `"row,column"`
 * (both 1-based). Column widths and alignments are stored per column index.
 *
 * @param parser the HTML parser used to parse caption and cell contents
 */
class TableParser(parser: HtmlParser) {
  // NOTE: these names must start with an upper-case letter. They are used as
  // stable-identifier patterns in `match` expressions below; a lower-case name
  // would be treated as a fresh variable pattern that matches everything.
  private val CUSTOMIZER_COLSPAN = "colspan"
  private val CUSTOMIZER_WIDTH = "width"
  private val CUSTOMIZER_ALIGN = "align"
  private val CUSTOMIZER_ALIGN_CELL = "align-cell"

  /** The table segment being filled. */
  val table = TABLE()
  /** 1-based index of the row currently being parsed; ends at (number of rows + 1). */
  var rowIndex = 1
  /** 1-based index of the next column to fill in the current row. */
  var colIndex = 1
  /** One more than the widest row seen so far (colspans included). */
  var maxColumns = 1

  /**
   * Parses the table nodes and returns the filled table segment.
   *
   * @param style     CSS style text of the table; a `width` entry is read as a
   *                  comma separated list of column widths
   * @param ns        the node(s) whose direct `tr` children are the rows and
   *                  whose first `caption` descendant, if any, is the caption
   * @param listLevel passed through unchanged to the content parser
   * @return the `table` segment with cells, `ROWS`, `COLUMNS` and widths set
   */
  def parse(style: String, ns: NodeSeq, listLevel: Int): Segment = {
    (ns \\ "caption").headOption.foreach { caption =>
      table(CAPTION -> ROOT(parser.parse(caption.child, listLevel): _*))
    }
    (ns \ "tr").foreach(row => parseRow(row, listLevel))

    setWidths(style)
    table(ROWS -> (rowIndex - 1), COLUMNS -> (maxColumns - 1))
    table
  }

  /** Parses all children of one `tr` node and advances the row counter. */
  private def parseRow(row: Node, listLevel: Int): Unit = {
    colIndex = 1
    row.child.foreach(col => parseCell(col, listLevel))
    maxColumns = math.max(maxColumns, colIndex)
    rowIndex += 1
  }

  /** Stores a `td`/`th` node as a cell at the current position; other nodes are ignored. */
  private def parseCell(col: Node, listLevel: Int): Unit = {
    if (col.label == "td" || col.label == "th") {
      // The attribute name is built before `cell` runs, so it uses the column
      // index from before a colspan is applied.
      table(Attribute(s"$rowIndex,$colIndex") -> cell(col, colIndex, listLevel))
      colIndex += 1
    }
  }

  /**
   * Creates the cell segment for a `td`/`th` node.
   *
   * A cell is marked as header when it is a `th` or has the class `highlight`.
   */
  private def cell(col: Node, index: Int, listLevel: Int) = {
    val segment = TABLE_CELL()
    val content = parser.parse(col.child, listLevel)
    // Tag customizers are only looked for in a leading plain-text segment.
    if (!content.isEmpty && content(0).name == PLAIN) handleTagCustomizer(content(0), index, segment)
    val styleAttr = col \ "@style"
    if (styleAttr.nonEmpty) handleStyle(styleAttr.text, index)
    handleColspan(col, segment)
    if (col.label == "th" || (col \ "@class").text == "highlight") segment(HEADER -> true)
    parser.trimNewlines(segment(content: _*))
  }

  /**
   * Applies the `colspan` attribute of a cell node.
   *
   * Only a span greater than 1 has an effect: it is stored as `SPAN` on the
   * cell and the column cursor is advanced by the extra columns. A missing,
   * non-numeric, zero or negative value is ignored, so the column cursor
   * never moves backwards.
   *
   * @param col  the `td`/`th` node
   * @param cell the cell segment to annotate
   */
  def handleColspan(col: Node, cell: Segment): Unit = {
    val span =
      try Integer.parseInt((col \ "@colspan").text.trim)
      catch {
        case _: NumberFormatException => 1
      }
    if (span > 1) {
      cell(SPAN -> span)
      colIndex += span - 1
    }
  }

  /**
   * Runs the tag customizer parser over the text of `content` and replaces
   * the text with the (trimmed) string the parser returns. Recognized
   * customizers set the column width, the column alignment or the alignment
   * of this single cell.
   */
  private def handleTagCustomizer(content: Segment, index: Int, cell: Segment): Unit = {
    content(TEXT -> TagCustomizerParser(content(TEXT).get, (name, value) => name match {
      case CUSTOMIZER_WIDTH => table(WIDTH(index) -> value)
      case CUSTOMIZER_ALIGN => table(ALIGN(index) -> leftOrRight(value))
      case CUSTOMIZER_ALIGN_CELL => cell(ALIGN -> leftOrRight(value))
      case _ =>
    }).trim)
  }

  /** Takes the `width` entry of a cell's CSS style as the width of its column. */
  private def handleStyle(style: String, index: Int): Unit = {
    CssParser(style, (name, value) => name match {
      case "width" => table(WIDTH(index) -> value)
      case _ =>
    })
  }

  /** Widths given in the table style win; otherwise the widths collected from the cells are used. */
  private def setWidths(style: String): Unit = {
    if (!setWidthsFromStyle(style)) setWidthsFromCells()
  }

  /**
   * Reads a comma separated `width` list from the table style and assigns
   * the entries to columns 1, 2, ... in order.
   *
   * @return true if the style contained a `width` entry
   */
  private def setWidthsFromStyle(style: String): Boolean = {
    var hasWidthStyle = false

    def applyWidths(value: String): Unit = {
      hasWidthStyle = true
      value.split(",").zipWithIndex.foreach {
        case (width, index) => table(WIDTH(index + 1) -> width)
      }
    }

    CssParser(style, (name, value) => name match {
      case CUSTOMIZER_WIDTH => applyWidths(value)
      case _ =>
    })

    hasWidthStyle
  }

  /**
   * Normalizes widths that were collected from the cells.
   *
   * If every column has a width ending in `px` and the pixel values add up
   * to a positive total, each width is replaced by its percentage of that
   * total. In every other case the `px` widths are cleared (set to null),
   * since they cannot be expressed as percentages; this includes a total of
   * zero, which would otherwise produce `NaN%`.
   */
  private def setWidthsFromCells(): Unit = {
    val columns = 1 until maxColumns

    def isPx(col: Int) = table(WIDTH(col)).getOrElse("").endsWith("px")

    // Numeric part of a "...px" width; 0 if it is not an integer.
    def valuePx(col: Int) = {
      val s = table(WIDTH(col)).get
      try {
        Integer.parseInt(s.substring(0, s.length - 2))
      } catch {
        case _: NumberFormatException => 0
      }
    }

    val total = if (columns.forall(isPx)) columns.map(valuePx).sum else 0
    if (total > 0) {
      columns.foreach(col => table(WIDTH(col) -> s"${100.0 * valuePx(col) / total}%"))
    } else {
      columns.foreach(col => if (isPx(col)) table(WIDTH(col) -> null))
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy