br.gov.lexml.parser.pl.xhtml.XHTML.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lexml-parser-projeto-lei Show documentation
Brazilian legal document parsing library.
The newest version!
package br.gov.lexml.parser.pl.xhtml

import org.apache.commons.io.IOUtils

import java.io.InputStream
import java.io.File
import scala.xml._
import scala.util.matching.Regex
import grizzled.slf4j.Logging
import org.apache.commons.io.FileUtils

import java.io.ByteArrayInputStream
import scala.xml.parsing.NoBindingFactoryAdapter
import org.apache.commons.io.filefilter.PrefixFileFilter

import java.io.FileFilter
import scala.xml.parsing.XhtmlParser
import scala.io.BufferedSource
import br.gov.lexml.parser.pl.docx.DOCXReader

import scala.annotation.{tailrec, unused}
import scala.io.Codec

/** Outcome of running the XHTML pipeline over a source document. */
abstract class XHTMLProcessorResult
/** The pipeline produced no result for the given source. */
case object Failure extends XHTMLProcessorResult
/** The pipeline succeeded; `result` holds the nodes it produced. */
case class Success(result: List[Node]) extends XHTMLProcessorResult

object TextUtils {
  /**
   * Sanitises a UTF-8 encoded document before it is parsed.
   *
   * The BEL (U+0007) and unit-separator (U+001F) control characters are removed,
   * U+0092 and the right single quote are turned into a backtick, the
   * directional-formatting characters U+202D / U+202C become plain spaces and the
   * en dash becomes a hyphen.
   *
   * @param data document bytes, decoded as UTF-8
   * @return the sanitised text, re-encoded as UTF-8
   */
  def fixXHTML(data: Array[Byte]) : Array[Byte] = {
    // Characters that are dropped altogether.
    val dropped = Set('\u0007', '\u001f')
    val decoded = new String(data, "utf-8")
    // None of the replacement characters is itself a replaced character, so one
    // pass over the text yields the same result as successive replacements.
    val sanitized = decoded.filterNot(dropped).map {
      case '\u0092' | '’' => '`'
      case '\u202d' | '\u202c' => ' '
      case '–' => '-'
      case other => other
    }
    sanitized.getBytes("utf-8")
  }
}

/** Converts documents between file formats identified by their file extension. */
trait Converter {
  /**
   * Converts `srcData` from one format to another.
   *
   * @param srcExtension extension identifying the format of `srcData` (e.g. "rtf")
   * @param srcData      the document to convert
   * @param dstExtension extension identifying the wanted format (e.g. "xhtml")
   * @return the converted document
   */
  def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String): Array[Byte]

  /**
   * Quietly deletes every entry of `dir` whose name starts with `prefix`.
   *
   * `File.listFiles` returns `null` when `dir` is not a directory or cannot be
   * read; that case is treated as "nothing to delete" instead of failing with a
   * `NullPointerException`.
   */
  def deleteByPrefix(dir : File, prefix : String) : Unit =
    Option(dir.listFiles(new PrefixFileFilter(prefix) : FileFilter))
      .foreach(_.foreach(f => FileUtils.deleteQuietly(f)))
}

/**
 * Converter that handles the docx-to-xhtml conversion itself through `DOCXReader`
 * and forwards every other request to `otherConverter`.
 */
final class DOCXConverter(otherConverter : Converter) extends Converter {
  override def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String): Array[Byte] =
    if (srcExtension == "docx" && dstExtension == "xhtml") {
      val docxStream = new ByteArrayInputStream(srcData)
      // `.get` fails when the reader yields no document
      val document = DOCXReader.readDOCX(docxStream).get
      document.toString.getBytes("utf-8")
    } else {
      otherConverter.convert(srcExtension, srcData, dstExtension)
    }
}

/**
 * Converter that shells out to `/usr/bin/abiword`.
 *
 * The source bytes are written to a temporary file, abiword is run on it in the
 * temporary directory, and the file it produces (same base name, target
 * extension) is read back.
 *
 * @param removeTemporaryFiles when true, every file sharing the temporary base
 *                             name is deleted once the conversion finishes
 */
final class AbiwordConverter(val removeTemporaryFiles: Boolean = true) extends Converter with Logging {

  private def noPostProc(data: Array[Byte]) = data

  /**
   * Converts `srcData` to `dstExtension`, which must be "xhtml" or "pdf".
   *
   * XHTML output is additionally passed through `TextUtils.fixXHTML`.
   *
   * @throws RuntimeException if `dstExtension` is not supported
   */
  override def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String) : Array[Byte] = {
    logger.info("abiword.convert: starting: srcExtension = " + srcExtension + ", dstExtension = " + dstExtension)
    // Resolve (and validate) the target format BEFORE creating the temporary file, so
    // that an unsupported extension does not leave a stray temp file behind.
    val (params, postProc): (List[String], Array[Byte] => Array[Byte]) = dstExtension match {
      case "xhtml" => (List("--to=xhtml", "--exp-props=html4: no; declare-xml: yes; use-awml:no; embed-css: yes; embed-images: yes"), TextUtils.fixXHTML(_ : Array[Byte]))
      case "pdf" => (List("--to=pdf"), noPostProc _)
      case _ => throw new RuntimeException("Abiword Converter does not support extension: " + dstExtension)
    }
    logger.info("abiword.convert: params = " + params)
    val srcFile = File.createTempFile("lexml-parser-pl", "." + srcExtension)
    logger.info("abiword.convert: srcFile = " + srcFile)
    // temp file name without the trailing "." + srcExtension
    val baseName = srcFile.getName.substring(0, srcFile.getName.length - srcExtension.length - 1)
    val baseDir = srcFile.getParentFile
    val destFile = new File(baseDir, baseName + "." + dstExtension)
    logger.info("abiword.convert: destFile = " + destFile)
    try {
      FileUtils.writeByteArrayToFile(srcFile, srcData)
      val cmd: Array[String] = (("/usr/bin/abiword" :: params) :+ srcFile.getPath).toArray
      logger.info("running " + cmd.mkString(" "))
      // the process is started with an empty environment, in the temp directory
      val p = Runtime.getRuntime.exec(cmd, Array[String](), srcFile.getParentFile)
      val exitCode = p.waitFor()
      logger.info("returned from abiword: exit code = " + exitCode)
      postProc(FileUtils.readFileToByteArray(destFile))
    } finally {
      if (removeTemporaryFiles) {
        deleteByPrefix(baseDir,baseName)
      }
    }
  }
}

/**
 * Converter that shells out to the `docconverter` script.
 *
 * The source bytes are written to a temporary file, the script is invoked with
 * the source and destination paths, and the destination file is read back.
 * A request for "xhtml" is produced as "html" and then re-serialised through
 * TagSoup.
 *
 * @param removeTemporaryFiles when true, every file sharing the temporary base
 *                             name is deleted once the conversion finishes
 */
@unused
final class OpenOfficeConverter(val removeTemporaryFiles: Boolean = true) extends Converter with Logging {

  private val pyodconverter = "/usr/local/bin/docconverter"

  import TextUtils._

  // Cleans the HTML bytes, parses them with TagSoup and serialises the resulting tree.
  private def htmlPostProc(data: Array[Byte]) = {
    val cleaned = fixXHTML(data)
    val saxParser = (new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl).newSAXParser()
    val input = new InputSource(new ByteArrayInputStream(cleaned))
    val root = (new NoBindingFactoryAdapter).loadXML(input, saxParser)
    root.toString.getBytes("utf-8")
  }

  private def noPostProc(data: Array[Byte]) = data

  override def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String): Array[Byte] = {
    logger.info("oo.convert: starting: srcExtension = " + srcExtension + ", dstExtension = " + dstExtension)
    val wantsXhtml = dstExtension == "xhtml"
    // the external tool is asked for "html" whenever "xhtml" is requested
    val toolExtension = if (wantsXhtml) "html" else dstExtension
    val srcFile = File.createTempFile("lexml-parser-pl", "." + srcExtension)
    logger.info("oo.convert: srcFile = " + srcFile)
    val tempName = srcFile.getName
    val baseName = tempName.substring(0, tempName.length - srcExtension.length - 1)
    val baseDir = srcFile.getParentFile
    val destFile = new File(baseDir, baseName + "." + toolExtension)
    logger.info("oo.convert: destFile = " + destFile)
    val postProc = if (wantsXhtml) htmlPostProc _ else noPostProc _

    try {
      FileUtils.writeByteArrayToFile(srcFile, srcData)
      val command: Array[String] = Array(pyodconverter, srcFile.getPath, destFile.getPath)
      val process = Runtime.getRuntime.exec(command, Array[String](), srcFile.getParentFile)
      process.waitFor()
      postProc(FileUtils.readFileToByteArray(destFile))
    } finally {
      if (removeTemporaryFiles) {
        deleteByPrefix(baseDir, baseName)
      }
    }
  }
}

object XHTMLProcessor extends Logging {

  // MIME type strings advertised as accepted input.
  // NOTE(review): "application/xhtml+xml" is a key of sourceProcessorMap but is not
  // listed here — confirm whether that omission is intentional.
  val accept : Set[String] = Set(
      "text/plain",
      "text/html",
      "application/rtf",
      "text/rtf",
      "application/msword",
      "application/vnd.oasis.opendocument.text", 
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
      )
  
  import TextUtils._

/*  lazy val dtdFile: File = {
    val f = File.createTempFile("xhtml11-", ".dtd")
    f.deleteOnExit()
    f
  }
  lazy val dtdUrl: String = {
    val is = getClass.getClassLoader().getResourceAsStream("xhtml11.dtd")
    val os = new BufferedOutputStream(new FileOutputStream(dtdFile))
    IOUtils.copy(is, os)
    IOUtils.closeQuietly(is)
    IOUtils.closeQuietly(os)
    dtdFile.getCanonicalFile.toURI.toURL.toString
  }*/

  //val converter : Converter = new AbiwordConverter
  // Converter used when the caller does not supply one: docx-to-xhtml is handled by
  // DOCXConverter, every other conversion is delegated to AbiwordConverter.
  private val defaultConverter: Converter = new DOCXConverter(new AbiwordConverter)

  /**
   * Builds a function that rewrites the children of an element with `f`.
   *
   * When the argument matches the `Elem` extractor it is rebuilt with the same
   * prefix, label, attributes and scope, `minimizeEmpty = true`, and `f(children)`
   * as its children; anything else is returned unchanged.
   */
  private def changeChildren[T <: Seq[Node]](f: Seq[Node] => Seq[Node]) = { (target: T) =>
    target match {
      case Elem(prefix, label, attributes, scope, kids @ _*) =>
        val rebuilt = Elem(prefix, label, attributes, scope, true, f(kids): _*)
        rebuilt.asInstanceOf[T]
      case _ => target
    }
  }

  //  var removeTemporaryFiles = true
  //  def fixXHTML(xhtml: String) = xhtml
  //    .replaceFirst("]*>", "")
  //    .replaceAll("\007", "")
  //    .replaceAll("\037", "")
  //    .replace(0x92: Char, '`')
  //    .replaceAll("’", "`")
  //    .replace(0x202d: Char, ' ')
  //    .replace(0x202c: Char, ' ')
  //
  //
  //  
  //  def abiwordConversion(extension : String) =
  //	  	(extension,convertUsingAbiword2(extension)(_))
  //	  	
  //  def convertUsingAbiword2(extension : String)(source : Array[Byte]) =
  //	  convertUsingAbiword(source,extension)
  //    
  //  def convertUsingAbiword(source: Array[Byte], extension: String): Elem = {
  //    val srcFile = File.createTempFile("lexml-parser-pl", "." + extension)
  //    val xhtmlFile = new File(srcFile.getCanonicalPath.replaceFirst(extension + "$", "xhtml"))
  //    try {
  //      //val srcPath = srcFile.getCanonicalPath
  //      FileUtils.writeByteArrayToFile(srcFile, source)
  //      val cmd: Array[String] = Array(
  //        "/usr/bin/abiword", "--to=xhtml", srcFile.getName, "--exp-props=html4: no; declare-xml: yes; use-awml:no; embed-css: no; embed-images: no")
  //      val p = Runtime.getRuntime.exec(cmd, Array[String](), srcFile.getParentFile)
  //      p.waitFor
  //
  //      val xhtmlData = fixXHTML(FileUtils.readFileToString(xhtmlFile))
  //
  //      val reader = new StringReader(xhtmlData)
  //
  //      try {
  //        if (!xhtmlFile.exists || xhtmlFile.length < 300) {
  //          throw new RuntimeException("HTML de saida do Abiword vazio")
  //        }
  //        else { XML.load(reader) }
  //      } finally {
  //        IOUtils.closeQuietly(reader)
  //      }    
  //    } finally {
  //      if (removeTemporaryFiles) {
  //        srcFile.delete
  //        xhtmlFile.delete
  //        val f = new File(xhtmlFile.getPath() + "_files")
  //        if (f.exists() && f.isDirectory()) {
  //          f.listFiles().foreach(_.delete)
  //          f.delete
  //        }
  //      }
  //    }
  //  }

  // Turns raw source bytes into an XHTML root element, optionally using the given Converter.
  private type SourceProcessor = (Array[Byte], Converter) => Elem

  /** Parses an in-memory XHTML document; see the `InputStream` overload. */
  private def xhtmlLoader(in : Array[Byte]) : Elem = xhtmlLoader(new ByteArrayInputStream(in))

  /**
   * Parses the stream as UTF-8 XHTML and returns the first element among the parsed nodes.
   *
   * @throws NoSuchElementException if the parsed document contains no element
   */
  private def xhtmlLoader(is : InputStream) : Elem =
    XhtmlParser(new BufferedSource(is)(Codec.UTF8))
      .collectFirst({ case e : Elem => e })
      .getOrElse(throw new NoSuchElementException("xhtmlLoader: parsed document contains no element"))
	      
  /**
   * Builds the `(extension, processor)` pair for a format that has to be converted
   * to XHTML by a `Converter`.
   *
   * The processor sets the `file.encoding` system property to "utf-8", converts the
   * data to "xhtml", parses the result and returns the first `html` element found
   * (it fails if there is none).
   */
  private def externalConvertToXhtml(extension: String) = {
    val processor = (data: Array[Byte], converter : Converter) => {
      System.setProperty("file.encoding", "utf-8")
      val xhtmlBytes = converter.convert(extension, data, "xhtml")
      val parsed = xhtmlLoader(xhtmlBytes)
      val htmlElems = (parsed \\ "html").collect { case e : Elem => e }
      htmlElems.head
    }
    (extension, processor)
  }
  
  // Maps a MIME type to the pair (source file extension, processor producing the XHTML root).
  // Plain text, XHTML and HTML are handled in-process; the remaining formats are
  // delegated to the Converter through externalConvertToXhtml.
  private val sourceProcessorMap: Map[String, (String, SourceProcessor)] = Map(
    ("text/plain", ("txt", (source: Array[Byte], _ : Converter) => {
      val text = fixXHTML(source)

      val lines = scala.io.Source.fromBytes(text,"utf-8").getLines().to(List)
      // Groups the lines into paragraphs separated by blank lines.
      //   l: lines still to read, r: finished paragraphs (most recent first),
      //   s: lines of the paragraph being built (most recent first).
      // `s` is built by prepending, so it is reversed before joining; otherwise the
      // lines of a multi-line paragraph would be emitted in reverse order.
      @tailrec
      def toPars(l: List[String], r: List[String] = Nil, s: List[String] = Nil): List[String] = l match {
        case Nil => s match { case Nil => r; case _ => s.reverse.mkString("", " ", "") :: r }
        case x :: xs if x.trim.isEmpty => toPars(xs, s.reverse.mkString("", " ", "") :: r)
        case x :: xs => toPars(xs, r, x :: s)
      }
      val pars = toPars(lines).reverse
      // NOTE(review): a SourceProcessor must return an Elem, but the expression below
      // yields the paragraph strings themselves; it looks as if inline XML literals
      // wrapping it were lost — confirm against the upstream file. Kept verbatim.
      { pars.map(p => { p }
) }
    })),
    ("application/xhtml+xml", ("xhtml", (source: Array[Byte],_ : Converter) => {
      val text = fixXHTML(source)
      xhtmlLoader(text)
    })),
    ("text/html", ("html", (source: Array[Byte],_ : Converter) => {
      val text = fixXHTML(source)
      xhtmlLoader(text)
    })),
    ("application/rtf", externalConvertToXhtml("rtf")),
    ("text/rtf", externalConvertToXhtml("rtf")),
    ("application/msword", externalConvertToXhtml("doc")),
    ("application/vnd.oasis.opendocument.text", externalConvertToXhtml("odt")),
    ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", externalConvertToXhtml("docx")))

  /**
   * Converts `source` to an XHTML root element using the processor registered for
   * `mimeType`.
   *
   * @return `None` when no processor is registered for `mimeType`
   */
  def convertSrcToXHTML(source: Array[Byte], mimeType: String,converter : Converter): Option[Elem] =
    sourceProcessorMap.get(mimeType).map {
      case (_, process) => process(source, converter)
    }

  /** Reads the whole stream and converts it as a "text/rtf" source. */
  def convertRTFtoXHTML(rtfSource: InputStream, converter : Converter): Option[Elem] = {
    val rtfBytes = IOUtils.toByteArray(rtfSource)
    convertSrcToXHTML(rtfBytes, "text/rtf", converter)
  }

  /**
   * Selects the top-level elements of the document that are processed further.
   *
   * Fails with an exception when `root` has no `body` descendant.
   */
  private def selectBaseElems(root: Elem): List[Elem] = {
    // `body` is not used below, but `.head` makes the method fail when no body exists.
    // NOTE(review): the selection below starts from root.child rather than from
    // body.child — confirm this is intended.
    val body = (root \\ "body").head.asInstanceOf[Elem]
   
    // children of the root, starting at (and including) the first `body` element
    val belems = root.child.to(List).dropWhile ((n : Node) => n match {
      case e : Elem => e.label != "body" 
      case _ => true 
    })
      
    
    // lower-cased value of an attribute, if the node is an element that has it
    def getAttr(n: Node, attr: String) = n match {
      case e: Elem => e.attributes.get(attr).map(_.text.toLowerCase)
      case x => None
    }
    // the `id` attribute, else the `type` attribute, else ""
    def getIdOrType(n: Node) = getAttr(n, "id").orElse(getAttr(n, "type")).getOrElse("")
    val childs = trim(belems)
    // drop nodes identified as header or footer
    val childs1 = childs.filter((n: Node) => { val x = getIdOrType(n) ; x != "header" && x != "footer" })
    // cl1: leading run of table elements; cl2: everything after it
    val (cl1, cl2) = childs1.span({ case e: Elem => e.label == "table"; case _ => false })
    // from the leading tables keep only their p/h1/h2/h3 elements, then append the rest
    val childs3 = (cl1 \\ "*").filter { 
      case e : Elem => e.label == "p" || e.label == "h1" || e.label == "h2" || e.label == "h3"
      case _ => false
    } ++ cl2
    // group loose inline content, then keep elements only
    val r = wrapText(childs3.toList).collect { case e : Elem => e }
    r
  }

  // Currently the identity: every candidate element is kept.
  private def chooseDivs(divs: List[Elem]): List[Elem] = divs

  // Labels that isValidElem rewrites to a plain `p` element.
  private val parLabels: Set[String] = Set("p", "h1", "h2", "h3", "h4", "blockquote")
  /**
   * Accepts (and normalises) the block elements kept by the pipeline:
   *  - `table` is kept untouched;
   *  - `ol` loses its attributes and keeps only children accepted by this function;
   *  - `li` loses its attributes and keeps children accepted by this function or by `isContent`;
   *  - any label in `parLabels` is relabelled `p`, keeping attributes and children.
   * It is undefined for every other node.
   */
  private val isValidElem: PartialFunction[Node, Node] = {
    case table @ Elem(_, "table", _, _, _*) => table
    case Elem(prefix, "ol", _, scope, items @ _*) =>
      val keptItems = items.collect(isValidElem)
      Elem(prefix, "ol", Null, scope, true, keptItems: _*)
    case Elem(prefix, "li", _, scope, content @ _*) =>
      val keptContent = content.collect(isValidElem orElse isContent)
      Elem(prefix, "li", Null, scope, true, keptContent: _*)
    case Elem(prefix, label, attributes, scope, content @ _*) if parLabels.contains(label) =>
      Elem(prefix, "p", attributes, scope, true, content: _*)
  }
  /** Accepts text nodes and `span` elements as they are, and otherwise defers to `isValidElem`. */
  private val isContent: PartialFunction[Node, Node] = {
    val inlineContent: PartialFunction[Node, Node] = {
      case text: Text => text
      case span @ Elem(_, "span", _, _, _*) => span
    }
    inlineContent.orElse(isValidElem)
  }

  /**
   * Groups runs of inline content (text nodes and elements whose label is not in
   * `blockElems`) that appear between block elements; block elements are passed
   * through in their original order. Runs made only of blank text are dropped.
   *
   * NOTE(review): the grouped runs are built with `NodeSeq fromSeq ...`, which is a
   * NodeSeq and not a Node; it looks as if an inline XML literal wrapping each run
   * was lost — confirm against the upstream file.
   */
  private def wrapText(nl: List[Node]): List[Node] = {
    val blockElems = List("table","thead","tbody","th","tr","td","p","blockquote","center","div","img")
    // accum: current inline run (most recent first); accum2: output so far (most recent first)
    @tailrec
    def docollect(nl: List[Node], accum: List[Node] = Nil, accum2 : List[Node] = Nil): List[Node] = nl match {
      case Nil if trim(accum).isEmpty => accum2.reverse
      case Nil => ({ NodeSeq fromSeq accum.reverse } :: accum2).reverse
      case (t: Text) :: r => docollect(r, t :: accum, accum2)
      case (e: Elem) :: r if !blockElems.contains(e.label) => docollect(r, e :: accum, accum2)
      // block element: flush the pending run (if it is not blank) before emitting it
      case (e : Elem) :: r if trim(accum).isEmpty => docollect(r,Nil,e :: accum2)
      case (e : Elem) :: r  => docollect(r,Nil,e :: { NodeSeq fromSeq accum.reverse } :: accum2)
      // any node that is neither a Text nor an Elem aborts the processing
      case b :: _ => sys.error(s"unexpected block at wrapText.docollect: $b")
    }
    docollect(nl, Nil)
  }
  
  // Drops the leading text nodes that contain only whitespace.
  private def trimLeft(nl: List[Node]) = nl.dropWhile {
    case text: Text => text.text.trim.isEmpty
    case _ => false
  }
  /** Drops whitespace-only text nodes from both ends of `nl`. */
  def trim(nl: List[Node]): List[Node] = {
    val withoutLeading = trimLeft(nl)
    // trailing blanks are removed by trimming the reversed list
    val reversedWithoutTrailing = trimLeft(withoutLeading.reverse)
    reversedWithoutTrailing.reverse
  }
  // Block containers that explodeDivs replaces by their (wrapped) contents.
  private val explodedBlockElements = Set("div", "center")
  // Inline containers that explodeDivs replaces by their contents.
  private val explodedInlineElements = Set("font")
  /**
   * Recursively replaces `div`/`center` and `font` elements by their contents and
   * keeps, at the top level, only the nodes accepted by `isValidElem`.
   *
   * A `td` whose trimmed content is a single `p` gets the children of that `p` as
   * its own children.
   */
  private def explodeDivs(divs: List[Elem]) = {
    def explode(n: Node): List[Node] = {
      def explodeAll(nodes: Seq[Node]): List[Node] = nodes.toList.flatMap(explode)
      n match {
        case block: Elem if explodedBlockElements.contains(block.label) =>
          wrapText(block.child.toList).flatMap(explode)
        case inline: Elem if explodedInlineElements.contains(inline.label) =>
          explodeAll(inline.child)
        case cell: Elem if cell.label == "td" =>
          val content = trim(cell.child.toList) match {
            case List(par: Elem) if par.label == "p" => par.child
            case _ => cell.child
          }
          List(cell.copy(child = explodeAll(content)))
        case other: Elem =>
          List(other.copy(child = explodeAll(other.child)))
        case leaf => List(leaf)
      }
    }
    divs.flatMap(explode).collect(isValidElem)
  }

  // Builds an unprefixed attribute chain holding the entries of `m`.
  private def mapToAttributes(m: Map[String, String]) = {
    val noAttributes: MetaData = Null
    m.foldRight(noAttributes) { case ((key, value), rest) =>
      new UnprefixedAttribute(key, value, rest)
    }
  }

  /**
   * Bottom-up attribute clean-up:
   *  - `span` keeps only its `style` attribute, reduced to the declarations in `allowedStyles`;
   *  - `table` keeps only `rows` and `cols`;
   *  - `td` keeps only `colspan` / `rowspan`, and only when their value is not "1";
   *  - every other element loses all of its attributes.
   */
  private def cleanAttributes: Seq[Node] => Seq[Node] = bottomUp((n: Node) => {

    // Rebuilds an element with its attribute map transformed by `f`.
    // The match has no fallback: it is only applied to elements below.
    val changeAttrs = (f: Map[String, String] => Map[String, String]) =>
      (e: Node) => e match {
        case Elem(pref, name, attrs, scope, children @ _*) =>
          Elem(pref, name, mapToAttributes(f(attrs.asAttrMap)), scope, true, children: _*)
      }

    // Style declarations are compared as exact strings (no whitespace tolerated).
    val allowedStyles = Set[String](
      "font-weight:bold", "font-weight:bolder", "font-weight:normal", "font-style:italic", "font-style:oblique", "font-style:normal",
      "text-decoration:underline","vertical-align:super", "vertical-align:sub" )

    def cleanStyle(v: String) = {
      v.split(";").filter(allowedStyles.contains).mkString(";")
    }

    val filterStyle: PartialFunction[(String, String), (String, String)] = {
      case ("style", v : String) => ("style", cleanStyle(v))
    }

    val cleanSpanAttrs: Elem => Elem = changeAttrs(_.collect(filterStyle).toMap)

    val filterRedundantTDAttrs = (m: Map[String, String]) => m.filter({
      case ("rowspan", "1") => false
      case ("colspan", "1") => false
      case _ => true
    })

    def keepOnly(keyNames: String*) = (m: Map[String, String]) => m.filter(x => keyNames.contains(x._1))

    val emptyAttributes = changeAttrs((_: Map[String, String]) => Map())

    // NOTE(review): saveIndentation is not called anywhere in this block.
    // It replaces the attributes of an element by `indentation` (text-indent plus
    // margin-left, normalised to millimetres) and `centered`.
    def saveIndentation(e: Elem) = e match {
      case Elem(pref, name, attrs, scope, children @ _*) =>
        val styleMap = getStyleMap(attrs).withDefault(_ => "")

        // Parses "<number><unit>"; yields None for no match or a unit other than mm/cm/in.
        def parseAndNormalize(s: String): Option[Double] = {
          val re = new Regex("(-?[0-9.]+)([a-z]+)")
          val ratios = Map[String, Double](
            "mm" -> 1, "cm" -> 10, "in" -> 25.4)
          val m = re.findAllIn(s)
          if (m.nonEmpty) {
            val num = m.group(1).toDouble
            val unit = m.group(2)
            val ratio = ratios.get(unit)
            ratio.map(_ * num)
          } else {
            None
          }
        }

        val textIndent = parseAndNormalize(styleMap("text-indent"))
        val marginLeft = parseAndNormalize(styleMap("margin-left"))
        val centered = styleMap("text-align") == "center"
        val indentation = List(textIndent, marginLeft).collect({ case Some(x) => x }).sum
        Elem(pref, name, new UnprefixedAttribute("indentation", Text(indentation.toString), new UnprefixedAttribute("centered", centered.toString, Null)), scope, true, children: _*)
    }

    // Dispatch on the element label; non-elements are returned unchanged.
    n match {
      case Elem(_, label, _, _, _*) =>
        val e = n.asInstanceOf[Elem]
        label match {
          case "span" => cleanSpanAttrs(e)
          case "table" => changeAttrs(keepOnly("rows", "cols"))(e)
          case "td" => changeAttrs(filterRedundantTDAttrs.andThen(keepOnly("colspan", "rowspan")))(e)
          case _ => emptyAttributes(e)
        }
      case _ => n
    }

  })

  /**
   * Recursively rewrites `span` elements through `makeSpanOrIandB`; every other
   * element is rebuilt around its rewritten children and non-elements are kept.
   */
  private def fixSpans(nl: List[Node]): List[Node] =
    nl.flatMap {
      case Elem(prefix, label, attributes, scope, kids @ _*) =>
        val fixedKids = fixSpans(kids.toList)
        if (label == "span") makeSpanOrIandB(prefix, scope, attributes, fixedKids)
        else List(Elem(prefix, label, attributes, scope, true, fixedKids: _*))
      case other => List(other)
    }

  /**
   * Rewrites the content of a `span` according to its `style` attribute.
   *
   * `font-style`, `font-weight` and `vertical-align` declarations are turned into
   * `i`, `b`, `sup` and `sub` wrappers (innermost to outermost, in that order).
   * `text-decoration:underline` only produces a warning. If other attributes or
   * style declarations remain, the content stays inside a `span` carrying them;
   * otherwise the `span` itself disappears.
   *
   * @param prefix namespace prefix used for every element that is created
   * @param scope  namespace binding used for every element that is created
   * @param attrs  attributes of the original span
   * @param child  (already processed) children of the original span
   * @return the nodes replacing the original span
   */
  private def makeSpanOrIandB(prefix: String, scope: NamespaceBinding, attrs: MetaData, child: Seq[Node]): Seq[Node] = {

    // Splits a "name:value" declaration at its first colon.
    def makePair(s: String): (String, String) = s.span(c => c != ':') match {
      case (k, "") => (k.trim, "")
      case (k, v) => (k.trim, v.substring(1).trim)
    }
    val attrMap = attrs.asAttrMap

    val styleString = attrMap.getOrElse("style", "")
    val otherAttrs: Map[String, String] = attrMap - "style"
    // Blank declarations are skipped: an absent or empty style used to yield the
    // entry "" -> "", which then came back as a bogus `style=":"` attribute.
    val styles = styleString.split(";").iterator.filter(_.trim.nonEmpty).map(makePair).toMap

    def hasStyle(name: String, accepted: String*): Boolean =
      styles.get(name).exists(accepted.contains)

    val italicPresent = hasStyle("font-style", "italic", "oblique")
    val boldPresent = hasStyle("font-weight", "bold", "bolder")
    val isSuperScript = hasStyle("vertical-align", "super")
    val isSubScript = hasStyle("vertical-align", "sub")
    val hasUnderline = hasStyle("text-decoration", "underline")

    val otherStyles = styles - "font-style" - "font-weight" - "text-decoration" - "vertical-align"
    val restMap: Map[String, String] =
      if (otherStyles.isEmpty) otherAttrs
      else otherAttrs + (("style", otherStyles.toList.map(x => x._1 + ":" + x._2).mkString("", ";", "")))

    val base: Seq[Node] =
      if (restMap.isEmpty) child
      else Elem(prefix, "span", mapToAttributes(restMap), scope, true, child: _*)

    // Underline is recognised but deliberately not rendered.
    if (hasUnderline) { logger.warn("text has underline!") }

    // Wrappers are applied in this order, so the last one requested ends up outermost.
    val wrappers = List("i" -> italicPresent, "b" -> boldPresent, "sup" -> isSuperScript, "sub" -> isSubScript)
    wrappers.foldLeft(base) { case (inner, (label, wanted)) =>
      if (wanted) Elem(prefix, label, Null, scope, true, inner: _*) else inner
    }
  }

  // Builds a node function that applies `elem` to elements and `others` to every other node.
  private def mapElements[T](others: Node => T, elem: Elem => T) = {
    val dispatch: Node => T = {
      case element: Elem => elem(element)
      case other => others(other)
    }
    dispatch
  }

  /** The identity function for `T`. */
  def id[T]: T => T = identity[T]

  // Labels that cleanSeqNodes keeps; elements with any other label are replaced by their children.
  private val validElements = Set("p", "span", "sup", "sub", "table", "tr", "td", "th", "b", "i", "ol", "li", "img", "blockquote", "u",
      "h1","h2","h3","h4")

  // Bottom-up pass replacing every element whose label is not in `validElements` by its children.
  private val cleanSeqNodes: List[Node] => List[Node] = {
    val unwrapUnknown: Elem => Seq[Node] = elem =>
      if (validElements.contains(elem.label)) elem else elem.child
    bottomUp(mapElements(id, unwrapUnknown))
  }

  // Heading labels that renameHeadings turns into `p`.
  private val headings = Set("h1", "h2", "h3", "h4")
    
  // Bottom-up pass relabelling every heading element (see `headings`) as `p`.
  private val renameHeadings: List[Node] => List[Node] = {
    val headingToParagraph: Elem => Elem = elem =>
      if (headings.contains(elem.label)) elem.copy(label = "p") else elem
    bottomUp(mapElements(id, headingToParagraph))
  }
    
  /**
   * Lifts `f` to a traversal that first rewrites the children of each node
   * (recursively, with this same traversal) and then applies `f` to the rebuilt
   * node, concatenating the results.
   */
  private def bottomUp(f: Node => Seq[Node]): Seq[Node] => List[Node] = { (ns: Seq[Node]) =>
    val rebuildChildren = (n: Node) => changeChildren(bottomUp(f))(n)
    ns.iterator.toList.flatMap(n => f(rebuildChildren(n)))
  }

  /**
   * Applies `f` to each node of `ns` and then rewrites the children of every
   * resulting node. The nodes returned by `f` are not passed through `f` again.
   *
   * NOTE(review): the children are rewritten with `bottomUp(f)`, not `topDown(f)`,
   * so only the outermost level is processed top-down — confirm this is intended
   * and not a copy-paste slip.
   */
  private def topDown(f: Node => Seq[Node]): Seq[Node] => List[Node] = (ns: Seq[Node]) => {
    val chChildren = (n: Node) => changeChildren(bottomUp(f))(n)
    val nl = ns.iterator.toList
    nl.flatMap(f).map(chChildren)
  }

  /**
   * Top-down traversal that stops descending wherever `f` is defined: such a node
   * is replaced by the result of `f`, while any other node keeps its place and has
   * its children traversed in the same way.
   */
  def topDownUntil(f: PartialFunction[Node, Seq[Node]]): Seq[Node] => List[Node] = { (ns: Seq[Node]) =>
    val descend = topDownUntil(f)
    val attempt = f.lift
    ns.toList.flatMap { (n: Node) =>
      attempt(n) match {
        case None => changeChildren(descend)(n)
        case Some(replacement) => replacement
      }
    }
  }

  /**
   * Rewrites the text of `ns` in document order while threading a state through it.
   *
   * For each atom, `f(state, text)` yields the replacement text and the next
   * state. The atom is replaced by a `Text` node holding the new text, or dropped
   * when that text is empty. Elements are rebuilt around their transformed
   * children; any other node is kept as is.
   *
   * @param f state-passing text transformation
   * @param a initial state
   */
  private def transformTextWith[A](f: (A, String) => (String, A))(a: A): Seq[Node] => List[Node] = (ns: Seq[Node]) => {
    // `l` accumulates the output in reverse order.
    def doit(bl: (A, List[Node]), n: Node): (A, List[Node]) = {
      val (b1, l) = bl
      n match {
        // The type argument of Atom is erased at runtime, so a wildcard states what is actually tested.
        case x : Atom[_] =>
          val t1 = x.text
          val (t2, b2) = f(b1, t1)
          if (t2.isEmpty) { (b2, l) }
          else { (b2, Text(t2) :: l) }
        case Elem(pref, name, attrs, scope, children @ _*) =>
          val (b2, rl) = children.foldLeft((b1, List[Node]()))(doit)
          (b2, Elem(pref, name, attrs, scope, true, rl.reverse: _*) :: l)
        case _ => (b1, n :: l)
      }
    }
    val (_, rl) = ns.foldLeft((a, List[Node]()))(doit)
    rl.reverse
  }

  /**
   * Rewrites the text of `ns` from the last atom towards the first while threading
   * a state through it.
   *
   * For each atom, `f(state, text)` either yields `Some((newText, newState))`
   * (the atom is replaced by a `Text` node, or dropped when the new text is empty)
   * or `None`, which keeps that atom and leaves every remaining (earlier) node
   * untouched.
   *
   * @param f state-passing text transformation; `None` stops the rewriting
   * @param a initial state
   */
  private def transformTextBackwardsWith[A](f: (A, String) => Option[(String, A)])(a: A): Seq[Node] => List[Node] =
    (ns: Seq[Node]) => {
      // Nodes are visited in reverse, so prepending to `l` restores document order.
      def doit(bl: (A, List[Node], Boolean), n: Node): (A, List[Node], Boolean) = {
        val (b1, l, skip) = bl
        if (skip) { (b1, n :: l, skip) } else n match {
          // The type argument of Atom is erased at runtime, so a wildcard states what is actually tested.
          case x : Atom[_] =>
            val t1 = x.text
            f(b1, t1) match {
              case None => (b1, n :: l, true)
              case Some((t2, b2)) =>
                if (t2.isEmpty) { (b2, l, false) }
                else { (b2, Text(t2) :: l, false) }
            }
          case Elem(pref, name, attrs, scope, children @ _*) =>
            val (b2, rl, skip2) = children.reverse.foldLeft((b1, List[Node](), skip))(doit)
            (b2, Elem(pref, name, attrs, scope, true, rl: _*) :: l, skip2)
          case _ => (b1, n :: l, skip)
        }
      }
      val (_, rl, _) = ns.reverse.foldLeft((a, List[Node](), false))(doit)
      rl
    }

  // re1: a run of whitespace.
  // NOTE(review): the second alternative looks like a plain space, which `\s` already
  // covers; it may originally have been a non-breaking space — confirm.
  private val re1 = new Regex("(\\s| )+")
  // re2: spaces at the start of the string.
  private val re2 = new Regex("^ +")
  // re3: an opening curly quote followed by spaces.
  private val re3 = new Regex("“ +")
  // re4: spaces followed by a closing curly quote.
  private val re4 = new Regex(" +”")

  /**
   * Normalises the spacing of one text fragment.
   *
   * Whitespace runs collapse to a single space, spaces right after an opening
   * curly quote or right before a closing one are removed and, when `trimLeft` is
   * set, leading spaces are removed.
   *
   * @return the cleaned text, and whether the next fragment must be left-trimmed
   *         (the incoming flag if the cleaned text is empty, otherwise whether the
   *         cleaned text ends with a space)
   */
  private def cleanSpaces(trimLeft: Boolean, s: String) = {
    val collapsed = re1.replaceAllIn(s, " ")
    val afterOpeningQuote = re3.replaceAllIn(collapsed, "“")
    val afterClosingQuote = re4.replaceAllIn(afterOpeningQuote, "”")
    val cleaned =
      if (trimLeft && afterClosingQuote.nonEmpty) re2.replaceFirstIn(afterClosingQuote, "")
      else afterClosingQuote
    val nextTrimLeft = if (cleaned.isEmpty) trimLeft else cleaned.endsWith(" ")
    (cleaned, nextTrimLeft)
  }

  // re5: spaces at the end of the string.
  private val re5 = new Regex(" +$")

  /**
   * Normalises the spacing inside every outermost `p`, `li` and `blockquote`:
   * spaces are cleaned left to right with `cleanSpaces`, then the trailing spaces
   * of the last text fragment are removed.
   */
  private val normalizeSpace: Seq[Node] => List[Node] = topDownUntil {
    case block @ Elem(_, label, _, _, _*) if label == "p" || label == "li" || label == "blockquote" =>
      val leftCleaned = transformTextWith(cleanSpaces)(true)(block)
      // Only the first fragment met when walking backwards is stripped; then the walk stops.
      val stripTrailing = (done: Boolean, text: String) =>
        if (done) None else Some((re5.replaceFirstIn(text, ""), true))
      transformTextBackwardsWith(stripTrailing)(false)(leftCleaned)
  }



  // Text of the attribute `key`, or "" when the attribute is absent.
  private def getAttr(md: MetaData, key: String): String =
    md.get(key).fold("")(nodes => nodes.map(_.text).mkString)

  /**
   * Replaces by their children the `span` elements that have an empty `style` or
   * only blank text, and the `i` / `b` elements that have only blank text.
   */
  private val cleanSpuriousSpans = topDown {
    case span @ Elem(_, "span", attributes, _, kids @ _*)
        if getAttr(attributes, "style").isEmpty || span.text.trim.isEmpty =>
      kids
    case emphasis: Elem
        if (emphasis.label == "i" || emphasis.label == "b") && emphasis.text.trim.isEmpty =>
      emphasis.child
    case other => other
  }

  // Rebuilds every element with `TopScope`, i.e. without namespace bindings.
  private val cleanNameSpaces = topDown {
    case Elem(prefix, label, attributes, _, kids @ _*) =>
      Elem(prefix, label, attributes, TopScope, true, kids: _*)
    case other => other
  }

  /**
   * Collapses runs of empty paragraphs: a `p` or `blockquote` with empty text is
   * dropped when the node kept right after it is a `p` with empty text.
   */
  private def cleanRepeatedEmptyParagraphs(ns: Seq[Node]) = {
    def isParagraphLike(label: String) = label == "p" || label == "blockquote"
    // Folding from the right: `kept` holds the already-processed following nodes.
    ns.foldRight(List[Node]()) { (node, kept) =>
      (node, kept) match {
        case (Elem(_, label, _, _, _*), (next @ Elem(_, "p", _, _, _*)) :: _)
            if isParagraphLike(label) && node.text.isEmpty && next.text.isEmpty =>
          kept
        case _ => node :: kept
      }
    }
  }

  /**
   * Parses the `style` attribute of `m` into a map from property name to value.
   *
   * Declarations are separated by ';' and split at their FIRST ':' so that values
   * containing a colon stay whole; names and values are trimmed. Declarations
   * without a colon or with a blank name (e.g. an empty `style` attribute or a
   * stray separator) are ignored instead of raising an
   * `ArrayIndexOutOfBoundsException`.
   *
   * @return the parsed declarations; empty when there is no `style` attribute
   */
  private def getStyleMap(m: MetaData): Map[String, String] =
    m.get("style") match {
      case None => Map[String, String]()
      case Some(s) =>
        (NodeSeq fromSeq s).text.split(";").iterator
          .map(_.split(":", 2))
          .collect { case Array(k, v) if k.trim.nonEmpty => (k.trim, v.trim) }
          .toMap
    }

  // True when both attribute sets carry the same style declarations, regardless of order.
  private def styleIsTheSame(m1: MetaData, m2: MetaData) =
    getStyleMap(m1) == getStyleMap(m2)

  // Splits `ns` into its leading run of whitespace-only text nodes and the remainder.
  private def collectWhiteSpace(ns: Seq[Node]): (Seq[Node], Seq[Node]) =
    ns.span {
      case Text(content) => content.trim.isEmpty
      case _ => false
    }

  // Merges adjacent text nodes among the children of each element into a single text node.
  private val mergeTextNodes = topDown(
    mapElements(id, changeChildren((ns: Seq[Node]) => {
      // Folding from the right: `ns` holds the already-merged following siblings.
      def mergeTexts(n: Node, ns: List[Node]): List[Node] = {
        (n, ns) match {
          case (Text(t1), Text(t2) :: ns2) => Text(t1 + t2) :: ns2
          case _ => n :: ns
        }
      }
      ns.foldRight(List[Node]())(mergeTexts)
    })))

  /**
   * Merges, among the children of each element, a `span` with the next `span`
   * when only whitespace-only text lies between them and both carry the same
   * style declarations.
   */
  private val mergeSpans = bottomUp(
    mapElements(id, (e: Elem) => {
      val Elem(pref, label, attrs, scope, cl @ _*) = e
      // Folding from the right: `ns` holds the already-merged following siblings.
      def mergeit(n: Node, ns: List[Node]) = {
        // ws: whitespace-only text right after `n`; nss: what follows that whitespace
        val (ws, nss) = collectWhiteSpace(ns)
        val res = n match {
          case Elem(_, "span", attrs1, _, cl1 @ _*) =>
            nss match {
              // `pref` and `scope` here shadow the outer ones: the merged span takes the
              // prefix, scope and attributes of the SECOND span.
              case Elem(pref, "span", attrs2, scope, cl2 @ _*) :: ns2 =>
                if (styleIsTheSame(attrs1, attrs2)) {
                  Elem(pref, "span", attrs2, scope, true,  cl1 ++ ws ++ cl2 : _*) :: ns2
                } else {
                  n :: ns
                }
              case _ => n :: ns
            }
          case _ => n :: ns
        }
        res
      }
      val cll = cl.foldRight(List[Node]())(mergeit)
      Elem(pref, label, attrs, scope, true, cll: _*)
    }))

  // Replaces the character U+0096 by a hyphen in every text node.
  private val cleanSpecialCharacters = {
    // No state is needed, so the Unit state is passed through unchanged.
    val replaceDash = (state: Unit, text: String) => (text.replace('\u0096', '-'), state)
    transformTextWith(replaceDash)(())
  }

  // Composes the functions of `fs` left to right: the first one is applied first.
  private def applySeq[T](fs: Seq[T => T]) = Function.chain(fs)

  // Feeds `v0` through the functions of `fs`, left to right.
  private def applySeqTo[T](v0: T)(fs: Seq[T => T]) =
    fs.foldLeft(v0) { (current, step) => step(current) }

  /**
   * Runs the whole clean-up pipeline over a parsed XHTML document.
   *
   * Headings are renamed to paragraphs, the base elements are selected and
   * exploded, and the resulting nodes go through the cleaning stages listed below,
   * in that order.
   */
  private def pipelineXHTML(xhtml: Elem): List[Node] = {
    
    // Tracing helper that prints every node at a given stage and returns the list unchanged.
    // It is only referenced from the commented-out entries of the stage list below.
    def debug(where: String): List[Node] => List[Node] = (l: List[Node]) => {
      println("debug: " + where + ":")
      l.zipWithIndex foreach { 
        case (n,i) =>
          println("  [%20s][%06d]: %s ".format(where,i,n.toString) )
      }      
      l
    }
    
    val xhtml2 = renameHeadings(List(xhtml)).collect { case e : Elem => e }.head
    
    val baseElems = selectBaseElems(xhtml2)
    
    val divs = chooseDivs(baseElems)
       
    val validElems = explodeDivs(divs)

    // Each stage receives the output of the previous one.
    val res = applySeqTo(validElems)(List[List[Node] => List[Node]](
      //debug("start"),
      cleanNameSpaces,
      //debug("after cleanNameSpaces"),
      cleanSeqNodes,
      //debug("after cleanSeqNodes"),
      _.flatMap(cleanAttributes),
      normalizeSpace,
      cleanSpuriousSpans,      
      mergeTextNodes,
      mergeSpans,
      fixSpans, 
      cleanRepeatedEmptyParagraphs,
      cleanSpecialCharacters))
    res
  }
  
  /** Same as `pipeline(source, mimeType, converter)`, using the default converter. */
  def pipelineWithDefaultConverter(source: Array[Byte], mimeType: String) : Option[List[Node]] = 
    pipeline(source,mimeType,defaultConverter)
    
  /**
   * Converts `source` to XHTML according to `mimeType` and runs the clean-up pipeline on it.
   *
   * @return `None` when no processor is registered for `mimeType`
   */
  def pipeline(source: Array[Byte], mimeType: String, converter : Converter): Option[List[Node]] = 
    convertSrcToXHTML(source, mimeType,converter).map(pipelineXHTML)
  

  /**
   * Reads the whole stream, processes it as a "text/rtf" source and wraps the
   * outcome: `Success` with the produced nodes, or `Failure` when there is none.
   */
  def pipeline(rtfSource: InputStream,converter : Converter = defaultConverter): XHTMLProcessorResult = {
    val rtfBytes = IOUtils.toByteArray(rtfSource)
    pipeline(rtfBytes, "text/rtf", converter)
      .fold[XHTMLProcessorResult](Failure)(nodes => Success(nodes))
  }
  
}