All Downloads are FREE. The search and download functionality uses the official Maven repository.

guru.nidi.text.transform.parse.html.HtmlParser.scala Maven / Gradle / Ivy

/**
 * Copyright (C) 2013 Stefan Niederhauser ([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package guru.nidi.text.transform.parse.html

import java.io.StringReader
import java.util.regex.Pattern

import guru.nidi.text.transform.Attribute._
import guru.nidi.text.transform.AttributeValue._
import guru.nidi.text.transform.Name._
import guru.nidi.text.transform.Segment._
import guru.nidi.text.transform.parse.AbstractParser
import guru.nidi.text.transform.{AttributeValue, Segment, TransformContext}
import org.apache.commons.lang3.text.translate._
import org.xml.sax.InputSource

import scala.collection.mutable.ListBuffer
import scala.xml.factory.XMLLoader
import scala.xml.parsing.{FactoryAdapter, NoBindingFactoryAdapter}
import scala.xml.{Elem, Node, NodeSeq, Text}

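/**
 * Parser that reads HTML markup and converts it into a tree of
 * [[guru.nidi.text.transform.Segment]]s.
 */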
class HtmlParser(context: TransformContext) extends AbstractParser(context) {
  /**
   * Recognizes the inline tokens that are turned into dedicated segments when plain text is
   * parsed: bare `http`/`https` URLs (group 1) and the arrow symbols
   * `<->`, `<=>`, `->`, `<-`, `=>`, `<=` (groups 2 to 7).
   *
   * A URL ends at a space, a parenthesis or a quote. The punctuation characters `,.?!:;` are
   * accepted inside a URL (host names, ports and query strings need them) but never as its
   * last character, so sentence punctuation directly following a URL is not swallowed.
   * Previously these characters terminated the URL wherever they occurred, which cut
   * `http://www.example.com` down to `http://www`.
   */
  val PATTERN = Pattern.compile(
    "(https?://[^\\Q (,.?!:;\"')\\E]*(?:[\\Q,.?!:;\\E]+[^\\Q (,.?!:;\"')\\E]+)*)" +
      "|(<->)|(<=>)|(->)|(<-)|(=>)|(<=)")
  /**
   * Translator applied to the raw input before it is handed to the XML parser.
   * It combines, in this order, lookups for the ISO-8859-1 and the extended HTML 4.0
   * named entities with an unescaper for numeric entities.
   */
  val UNESCAPE_HTML4 = {
    val namedEntities = List(
      EntityArrays.ISO8859_1_UNESCAPE(),
      EntityArrays.HTML40_EXTENDED_UNESCAPE()
    ).map(translator)
    new AggregateTranslator(namedEntities :+ new NumericEntityUnescaper(): _*)
  }

  /**
   * Wraps a table of string pairs in a lookup based translator.
   *
   * @param a the lookup table, as delivered by `EntityArrays`
   * @return a translator backed by the given table
   */
  def translator(a: Array[Array[String]]): CharSequenceTranslator = {
    // A String[][] can be viewed as CharSequence[][] on the JVM, so no copy is needed.
    val widened = a.asInstanceOf[Array[Array[CharSequence]]]
    lookupTranslator(widened)
  }

  /**
   * Creates a `LookupTranslator` from the given lookup table by passing its rows
   * as the constructor's variable arguments.
   *
   * @param a the lookup table rows
   */
  def lookupTranslator(a: Array[Array[CharSequence]]) = {
    val rows = a
    new LookupTranslator(rows: _*)
  }

  /**
   * XML loader whose adapter answers every external entity lookup with an empty document
   * instead of resolving the given public/system id.
   */
  object EntityIgnoringXml extends XMLLoader[Elem] {
    /** Adapter that resolves any entity to an empty input source. */
    private class EmptyEntityAdapter extends NoBindingFactoryAdapter {
      override def resolveEntity(publicId: String, systemId: String): InputSource = {
        val nothing = new StringReader("")
        new InputSource(nothing)
      }
    }

    // A fresh adapter is handed out on every access, as before.
    override def adapter: FactoryAdapter = new EmptyEntityAdapter
  }

  /**
   * Parses `input` and returns the resulting segment tree.
   *
   * The input is first run through [[UNESCAPE_HTML4]], then wrapped in a single synthetic
   * `root` element (carrying the namespace declarations from [[nsDefs]]) so that input made
   * of several top level nodes is still one well formed XML document. The XML is converted
   * into segments and finally superfluous newlines are removed.
   *
   * @return the root segment of the parsed input
   */
  override def parseImpl(): Segment = {
    val unescaped = UNESCAPE_HTML4.translate(input)
    // The wrapper element had been lost from this string; without it nsDefs is never used
    // and fragments with more than one top level node cannot be loaded.
    val xml = EntityIgnoringXml.loadString(s"""<root $nsDefs>$unescaped</root>""")
    cleanNewlines(parse(xml, 1)(0))
  }

  /**
   * Removes all leading and all trailing NEWLINE children of the given segment.
   * The segment's child list is modified in place.
   *
   * @param seg the segment whose children are trimmed
   * @return the same segment instance
   */
  def trimNewlines(seg: Segment) = {
    val kids = seg.children

    def newlineAt(idx: Int) = kids(idx).name == NEWLINE

    while (kids.nonEmpty && newlineAt(0)) {
      kids.remove(0)
    }
    while (kids.nonEmpty && newlineAt(kids.length - 1)) {
      kids.remove(kids.length - 1)
    }
    seg
  }

  /**
   * Recursively removes superfluous NEWLINE segments from the tree below `seg`:
   * leading and trailing newlines of every segment, and newlines directly before or after
   * a HEADING, TABLE, IMAGE or LIST child. The tree is modified in place.
   *
   * @param seg the segment to clean
   * @return the same segment instance
   */
  def cleanNewlines(seg: Segment): Segment = {
    trimNewlines(seg)

    val kids = seg.children
    val blockNames = List(HEADING, TABLE, IMAGE, LIST)
    var pos = 0
    while (pos < kids.length) {
      cleanNewlines(kids(pos))
      val isBlock = blockNames contains kids(pos).name
      if (isBlock && pos > 0 && kids(pos - 1).name == NEWLINE) {
        // The block element moves one slot to the left; follow it and look at it again.
        kids.remove(pos - 1)
        pos -= 1
      } else if (isBlock && pos < kids.length - 1 && kids(pos + 1).name == NEWLINE) {
        // Stay on the block element so that further following newlines are removed too.
        kids.remove(pos + 1)
      } else {
        pos += 1
      }
    }
    seg
  }


  /**
   * The XML namespace declarations for all [[namespaces]], concatenated into one string.
   * Every namespace `ns` contributes `xmlns:ns="ns" ` (including the trailing blank).
   */
  def nsDefs = {
    val declarations = for (ns <- namespaces) yield s"""xmlns:$ns="$ns" """
    declarations.mkString
  }

  /** The namespace prefixes to declare when parsing; none by default. */
  def namespaces: Seq[String] = List.empty

  /**
   * Converts a sequence of XML nodes into segments by parsing every node in order
   * and concatenating the results.
   *
   * @param xml       the nodes to convert
   * @param listLevel the list nesting level passed on to the parsing of every node
   * @return the segments of all nodes, in document order
   */
  def parse(xml: Seq[Node], listLevel: Int): Seq[Segment] = {
    def segmentsOf(child: Node): Seq[Segment] = parse(child, listLevel)

    xml.flatMap(segmentsOf)
  }

  /**
   * Converts a single XML node into the segments it represents.
   *
   * Formatting, heading, list and item elements without children produce no segments,
   * as do all nodes that are not handled explicitly.
   *
   * NOTE(review): the element names inside the patterns had been stripped from this file and
   * were reconstructed from the segment each case creates. The three pass-through cases
   * (`span`, `div`, `font`) could not be derived that way and are assumptions; confirm them
   * against the original source.
   *
   * @param node      the node to convert
   * @param listLevel the nesting level assigned to a list found at this position
   * @return the segments for the node, possibly empty
   */
  def parse(node: Node, listLevel: Int): Seq[Segment] = {
    def heading(ns: NodeSeq, headingLevel: Int) =
      HEADING(parse(ns, listLevel): _*)(LEVEL -> (headingLevel + context.headingLevel))

    node match {
      case <root>{ns@_*}</root> => List(ROOT(parse(ns, listLevel): _*))
      case <b>{ns@_*}</b> if !ns.isEmpty => List(BOLD(parse(ns, listLevel): _*))
      case <strong>{ns@_*}</strong> if !ns.isEmpty => List(BOLD(parse(ns, listLevel): _*))
      case <i>{ns@_*}</i> if !ns.isEmpty => List(ITALICS(parse(ns, listLevel): _*))
      case <em>{ns@_*}</em> if !ns.isEmpty => List(ITALICS(parse(ns, listLevel): _*))
      case <u>{ns@_*}</u> if !ns.isEmpty => List(UNDERLINED(parse(ns, listLevel): _*))
      case <p>{ns@_*}</p> if !ns.isEmpty => parse(ns, listLevel) ++ List(NEWLINE())
      case <span>{ns@_*}</span> if !ns.isEmpty => parse(ns, listLevel)
      case <div>{ns@_*}</div> if !ns.isEmpty => parse(ns, listLevel)
      case <font>{ns@_*}</font> if !ns.isEmpty => parse(ns, listLevel)
      case <br/> => List(NEWLINE())
      case <hr/> => List(LINE())
      case <h1>{ns@_*}</h1> if !ns.isEmpty => List(heading(ns, 1))
      case <h2>{ns@_*}</h2> if !ns.isEmpty => List(heading(ns, 2))
      case <h3>{ns@_*}</h3> if !ns.isEmpty => List(heading(ns, 3))
      case <h4>{ns@_*}</h4> if !ns.isEmpty => List(heading(ns, 4))
      case <h5>{ns@_*}</h5> if !ns.isEmpty => List(heading(ns, 5))
      case <h6>{ns@_*}</h6> if !ns.isEmpty => List(heading(ns, 6))
      case <ol>{ns@_*}</ol> if !ns.isEmpty =>
        List(LIST(parse(ns, listLevel + 1): _*)(TYPE -> AttributeValue.ORDERED, LEVEL -> listLevel))
      case <ul>{ns@_*}</ul> if !ns.isEmpty =>
        List(LIST(parse(ns, listLevel + 1): _*)(TYPE -> AttributeValue.UNORDERED, LEVEL -> listLevel))
      case <li>{ns@_*}</li> if !ns.isEmpty => List(ITEM(parse(ns, listLevel): _*))
      case n @ <table>{ns@_*}</table> =>
        List(new TableParser(this).parse((n \ "@class").text, ns, listLevel))
      case n @ <img>{ns@_*}</img> =>
        List(image((n \ "@src").text, (n \ "@alt").text, (n \ "@id").text))
      case n @ <a>{ns@_*}</a> => List(link((n \ "@href").text, ns, listLevel))
      case Text(t) => text(t)
      case _ => Nil
    }
  }

  /**
   * Creates a LINK segment for an anchor.
   *
   * @param href      the link target; a leading `#` marks a reference and is dropped from
   *                  the target, everything else is treated as a URL
   * @param ns        the anchor's children, used as caption; if empty, `href` is the caption
   * @param listLevel the list nesting level used when parsing the caption
   */
  private def link(href: String, ns: NodeSeq, listLevel: Int) = {
    val desc = if (ns.isEmpty) List(plain(href)) else parse(ns, listLevel)
    val link = LINK(CAPTION -> ROOT(desc: _*))
    if (href.startsWith("#")) link(TARGET -> href.substring(1), TYPE -> REF)
    else link(TARGET -> href, TYPE -> URL)
  }

  /**
   * Creates an IMAGE segment.
   *
   * @param src the image target
   * @param alt CSS like text handed to `CssParser`; its `width` and `caption` entries are
   *            copied to the segment when they have a value, other entries are ignored
   * @param id  the image id, set on the segment unless it is empty
   */
  private def image(src: String, alt: String, id: String) = {
    val image = IMAGE(TARGET -> src)
    if (!id.isEmpty) image(ID -> id)
    CssParser(alt, (name, value) => name match {
      case "width" if value != null => image(WIDTH -> value)
      case "caption" if value != null => image(CAPTION -> ROOT(plain(value)))
      case _ =>
    })
    image
  }

  /**
   * Splits a text into plain segments, arrow symbols and automatically detected URL links,
   * using [[PATTERN]] to find the symbols and URLs.
   *
   * @param t the text to split
   * @return the segments in the order they occur in the text
   */
  private def text(t: String) = {
    val list = new ListBuffer[Segment]
    val s = new StringBuffer
    val m = PATTERN.matcher(t)

    // Emits the text collected so far (if any) as a plain segment.
    def addPlain(): Unit = {
      if (s.length() > 0) list += plain(s.toString)
    }

    while (m.find) {
      // Copies the text between the previous and the current match into s, dropping the match.
      m.appendReplacement(s, "")
      addPlain()
      val matchedGroup = m.group(0)
      matchedGroup match {
        case "->" => list += symbol("->", ARROW_RIGHT)
        case "=>" => list += symbol("=>", DOUBLE_ARROW_RIGHT)
        case "<-" => list += symbol("<-", ARROW_LEFT)
        case "<=" => list += symbol("<=", DOUBLE_ARROW_LEFT)
        case "<->" => list += symbol("<->", ARROW_BOTH)
        case "<=>" => list += symbol("<=>", DOUBLE_ARROW_BOTH)
        // Everything else PATTERN matches is a URL.
        case _ => list += LINK(CAPTION -> ROOT(plain(matchedGroup)), TARGET -> matchedGroup, TYPE -> URL)
      }
      s.setLength(0)
    }
    m.appendTail(s)
    addPlain()
    list
  }
}




    © 2015 - 2025 Weber Informatics LLC | Privacy Policy