All Downloads are FREE. Search and download functionalities are using the official Maven repository.

br.gov.lexml.parser.pl.xhtml.XHTML.scala Maven / Gradle / Ivy

The newest version!
package br.gov.lexml.parser.pl.xhtml

import org.apache.commons.io.IOUtils

import java.io.InputStream
import java.io.File
import scala.xml._
import scala.util.matching.Regex
import grizzled.slf4j.Logging
import org.apache.commons.io.FileUtils

import java.io.ByteArrayInputStream
import scala.xml.parsing.NoBindingFactoryAdapter
import org.apache.commons.io.filefilter.PrefixFileFilter

import java.io.FileFilter
import scala.xml.parsing.XhtmlParser
import scala.io.BufferedSource
import br.gov.lexml.parser.pl.docx.DOCXReader

import scala.annotation.{tailrec, unused}
import scala.io.Codec

/**
 * Result type of the stream-based `XHTMLProcessor.pipeline` entry point.
 * The two cases declared in this file are [[Failure]] and [[Success]].
 */
abstract class XHTMLProcessorResult
/** Returned by `XHTMLProcessor.pipeline` when the source could not be turned into XHTML (no processor result). */
case object Failure extends XHTMLProcessorResult
/**
 * Returned by `XHTMLProcessor.pipeline` when processing produced a node list.
 *
 * @param result the nodes produced by the XHTML clean-up pipeline
 */
case class Success(result: List[Node]) extends XHTMLProcessorResult

/** Text clean-up helpers applied to raw (X)HTML bytes before they are parsed. */
object TextUtils {
  /**
   * Decodes `data` as UTF-8, removes or substitutes a fixed set of problematic
   * characters in a single pass, and re-encodes the result as UTF-8.
   *
   * Substitutions performed:
   *  - U+0007 and U+001F are removed;
   *  - U+0092 and the right single quotation mark become a backtick;
   *  - U+202D and U+202C become a plain space;
   *  - the en dash becomes an ASCII hyphen.
   *
   * @param data UTF-8 encoded text
   * @return the cleaned text, UTF-8 encoded
   */
  def fixXHTML(data: Array[Byte]) : Array[Byte] = {
    // (disabled in the original chain) .replaceFirst("]*>", "")
    val utf8 = java.nio.charset.StandardCharsets.UTF_8
    val decoded = new String(data, utf8)
    // None of the replacement outputs is itself a replacement target, so one
    // pass over the characters is equivalent to applying the rules in sequence.
    val cleaned = decoded.flatMap {
      case '\u0007' | '\u001f' => ""
      case '\u0092' | '’'      => "`"
      case '\u202d' | '\u202c' => " "
      case '–'                 => "-"
      case other               => other.toString
    }
    cleaned.getBytes(utf8)
  }
}

/**
 * Strategy for converting a document from one file format to another, where
 * formats are identified by their file extension (for example "rtf" or "xhtml").
 */
trait Converter {
  /**
   * Converts `srcData` from the format named by `srcExtension` to the format
   * named by `dstExtension`.
   *
   * @param srcExtension extension identifying the format of `srcData`
   * @param srcData      raw bytes of the source document
   * @param dstExtension extension identifying the requested output format
   * @return the bytes of the converted document
   */
  def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String): Array[Byte]

  /**
   * Quietly deletes every entry directly under `dir` that is accepted by a
   * `PrefixFileFilter` built from `prefix`.
   *
   * `File.listFiles` returns `null` (not an empty array) when `dir` is not a
   * directory or cannot be read. This method is called from `finally` blocks,
   * so that case is treated as "nothing to delete" rather than letting a
   * NullPointerException hide the original failure.
   *
   * @param dir    directory whose direct children are examined
   * @param prefix file-name prefix selecting the entries to delete
   */
  def deleteByPrefix(dir : File, prefix : String) : Unit =
    Option(dir.listFiles(new PrefixFileFilter(prefix) : FileFilter)).foreach { matches =>
      matches.foreach(f => FileUtils.deleteQuietly(f))
    }
}

/**
 * [[Converter]] that handles the "docx" to "xhtml" conversion itself through
 * `DOCXReader` and hands every other extension pair to `otherConverter`.
 *
 * @param otherConverter fallback used for all conversions other than docx -> xhtml
 */
final class DOCXConverter(otherConverter : Converter) extends Converter {
  /**
   * Converts `srcData` to `dstExtension`.
   *
   * For ("docx", "xhtml") the document is read with `DOCXReader.readDOCX` and
   * its string form is returned as UTF-8 bytes; `.get` is called on the reader
   * result, so an absent/failed result surfaces as an exception from that call.
   * Any other pair is delegated unchanged to the fallback converter.
   */
  override def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String): Array[Byte] = {
    val handledHere = srcExtension == "docx" && dstExtension == "xhtml"
    if (handledHere) {
      val docxStream = new ByteArrayInputStream(srcData)
      val document = DOCXReader.readDOCX(docxStream).get
      document.toString.getBytes("utf-8")
    } else {
      otherConverter.convert(srcExtension, srcData, dstExtension)
    }
  }
}

/**
 * [[Converter]] that runs the external `/usr/bin/abiword` binary on a temporary
 * copy of the source document.
 *
 * @param removeTemporaryFiles when true, every file in the temporary directory
 *                             whose name starts with the generated base name is
 *                             deleted once the conversion finishes or fails
 */
final class AbiwordConverter(val removeTemporaryFiles: Boolean = true) extends Converter with Logging {

  /** Identity post-processor used for outputs that need no clean-up. */
  private def noPostProc(data: Array[Byte]) = data

  /**
   * Writes `srcData` to a temporary file, runs abiword on it and returns the
   * bytes of the produced file. XHTML output is passed through
   * `TextUtils.fixXHTML`; PDF output is returned untouched.
   *
   * @param srcExtension extension given to the temporary source file
   * @param srcData      bytes of the source document
   * @param dstExtension requested output format: "xhtml" or "pdf"
   * @return the (post-processed) bytes of the converted document
   * @throws RuntimeException if `dstExtension` is neither "xhtml" nor "pdf"
   */
  override def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String) : Array[Byte] = {
    logger.info("abiword.convert: starting: srcExtension = " + srcExtension + ", dstExtension = " + dstExtension)
    // Validate the target format BEFORE creating the temporary file: the throw
    // below happens outside the try/finally, so creating the file first would
    // leave it behind on disk for every unsupported extension.
    val (params, postProc) = dstExtension match {
      case "xhtml" => (List("--to=xhtml", "--exp-props=html4: no; declare-xml: yes; use-awml:no; embed-css: yes; embed-images: yes"), TextUtils.fixXHTML(_ : Array[Byte]))
      case "pdf" => (List("--to=pdf"), noPostProc _)
      case _ => throw new RuntimeException("Abiword Converter does not support extension: " + dstExtension)
    }
    val srcFile = File.createTempFile("lexml-parser-pl", "." + srcExtension)
    logger.info("abiword.convert: srcFile = " + srcFile)
    // Base name = temp file name without the trailing "." + srcExtension.
    val baseName = srcFile.getName.substring(0, srcFile.getName.length - srcExtension.length - 1)
    val baseDir = srcFile.getParentFile
    // The output file is expected next to the source, same base name, new extension.
    val destFile = new File(baseDir, baseName + "." + dstExtension)
    logger.info("abiword.convert: destFile = " + destFile)
    logger.info("abiword.convert: params = " + params)
    try {
      FileUtils.writeByteArrayToFile(srcFile, srcData)
      val cmd: Array[String] = (("/usr/bin/abiword" :: params) :+ srcFile.getPath).toArray
      logger.info("running " + cmd.mkString(" "))
      // The child runs with an empty environment and the temp directory as cwd.
      // NOTE(review): the child's stdout/stderr are never read; a process that
      // writes a lot of output could block on a full pipe - confirm this is safe.
      val p = Runtime.getRuntime.exec(cmd, Array[String](), srcFile.getParentFile)
      val exitCode = p.waitFor()
      logger.info("returned from abiword")
      if (exitCode != 0) {
        // Not fatal by itself: a missing output file still fails in the read below.
        logger.warn("abiword.convert: abiword exited with code " + exitCode)
      }
      postProc(FileUtils.readFileToByteArray(destFile))
    } finally {
      if (removeTemporaryFiles) {
        deleteByPrefix(baseDir,baseName)
      }
    }
  }
}

/**
 * [[Converter]] that runs the external `/usr/local/bin/docconverter` program
 * on a temporary copy of the source document.
 *
 * @param removeTemporaryFiles when true, every file in the temporary directory
 *                             whose name starts with the generated base name is
 *                             deleted once the conversion finishes or fails
 */
@unused
final class OpenOfficeConverter(val removeTemporaryFiles: Boolean = true) extends Converter with Logging {

  private val pyodconverter = "/usr/local/bin/docconverter"

  /**
   * Cleans the HTML bytes with `TextUtils.fixXHTML`, parses them with the
   * TagSoup SAX parser and returns the string form of the parsed tree as UTF-8.
   */
  private def htmlPostProc(data: Array[Byte]) = {
    val cleaned = TextUtils.fixXHTML(data)
    val saxParser = (new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl).newSAXParser()
    val input = new InputSource(new ByteArrayInputStream(cleaned))
    val root = (new NoBindingFactoryAdapter).loadXML(input, saxParser)
    root.toString.getBytes("utf-8")
  }

  /** Identity post-processor used for every non-XHTML output. */
  private def noPostProc(data: Array[Byte]) = data

  /**
   * Writes `srcData` to a temporary file, runs the converter program with the
   * source and destination paths as arguments, and returns the bytes of the
   * destination file. A request for "xhtml" is produced as an ".html" file and
   * then normalised by `htmlPostProc`; other formats are returned as written.
   */
  override def convert(srcExtension: String, srcData: Array[Byte], dstExtension: String): Array[Byte] = {
    logger.info(s"oo.convert: starting: srcExtension = $srcExtension, dstExtension = $dstExtension")
    val wantsXhtml = dstExtension == "xhtml"
    // The external tool is asked for "html" whenever the caller asks for "xhtml".
    val outputExtension = if (wantsXhtml) "html" else dstExtension
    val srcFile = File.createTempFile("lexml-parser-pl", "." + srcExtension)
    logger.info(s"oo.convert: srcFile = $srcFile")
    // Temp file name minus the trailing "." + srcExtension.
    val stem = srcFile.getName.dropRight(srcExtension.length + 1)
    val workDir = srcFile.getParentFile
    val destFile = new File(workDir, stem + "." + outputExtension)
    logger.info(s"oo.convert: destFile = $destFile")
    val postProc: Array[Byte] => Array[Byte] = if (wantsXhtml) htmlPostProc else noPostProc

    try {
      FileUtils.writeByteArrayToFile(srcFile, srcData)
      val command = Array(pyodconverter, srcFile.getPath, destFile.getPath)
      // Empty environment, temp directory as working directory; exit code is ignored.
      Runtime.getRuntime.exec(command, Array[String](), workDir).waitFor()
      postProc(FileUtils.readFileToByteArray(destFile))
    } finally {
      if (removeTemporaryFiles) deleteByPrefix(workDir, stem)
    }
  }
}

object XHTMLProcessor extends Logging {

  /**
   * MIME types this processor declares as accepted input.
   * NOTE(review): `sourceProcessorMap` in this object also registers
   * "application/xhtml+xml", which is not listed here - confirm whether that
   * omission is intentional.
   */
  val accept : Set[String] = Set(
      "text/plain",
      "text/html",
      "application/rtf",
      "text/rtf",
      "application/msword",
      "application/vnd.oasis.opendocument.text", 
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
      )
  
  import TextUtils._

/*  lazy val dtdFile: File = {
    val f = File.createTempFile("xhtml11-", ".dtd")
    f.deleteOnExit()
    f
  }
  lazy val dtdUrl: String = {
    val is = getClass.getClassLoader().getResourceAsStream("xhtml11.dtd")
    val os = new BufferedOutputStream(new FileOutputStream(dtdFile))
    IOUtils.copy(is, os)
    IOUtils.closeQuietly(is)
    IOUtils.closeQuietly(os)
    dtdFile.getCanonicalFile.toURI.toURL.toString
  }*/

  //val converter : Converter = new AbiwordConverter
  /**
   * Converter used when the caller does not supply one: DOCX input is handled
   * by `DOCXConverter`, everything else falls through to `AbiwordConverter`.
   */
  private val defaultConverter: Converter = new DOCXConverter(new AbiwordConverter)

  /**
   * Lifts a transformation over child sequences into a transformation over a
   * single value: when the argument matches the `Elem` extractor it is rebuilt
   * with the same prefix, label, attributes and scope, `minimizeEmpty` set to
   * true, and `f` applied to its children; any other argument is returned as is.
   *
   * @param f transformation applied to the element's children
   * @tparam T static type of the value being transformed (preserved by a cast)
   */
  private def changeChildren[T <: Seq[Node]](f: Seq[Node] => Seq[Node]) = { (target: T) =>
    target match {
      case Elem(prefix, label, attributes, binding, kids @ _*) =>
        // The literal `true` is the minimizeEmpty flag of Elem.apply.
        val rebuilt = Elem(prefix, label, attributes, binding, true, f(kids): _*)
        rebuilt.asInstanceOf[T]
      case _ =>
        target
    }
  }

  //  var removeTemporaryFiles = true
  //  def fixXHTML(xhtml: String) = xhtml
  //    .replaceFirst("]*>", "")
  //    .replaceAll("\007", "")
  //    .replaceAll("\037", "")
  //    .replace(0x92: Char, '`')
  //    .replaceAll("’", "`")
  //    .replace(0x202d: Char, ' ')
  //    .replace(0x202c: Char, ' ')
  //
  //
  //  
  //  def abiwordConversion(extension : String) =
  //	  	(extension,convertUsingAbiword2(extension)(_))
  //	  	
  //  def convertUsingAbiword2(extension : String)(source : Array[Byte]) =
  //	  convertUsingAbiword(source,extension)
  //    
  //  def convertUsingAbiword(source: Array[Byte], extension: String): Elem = {
  //    val srcFile = File.createTempFile("lexml-parser-pl", "." + extension)
  //    val xhtmlFile = new File(srcFile.getCanonicalPath.replaceFirst(extension + "$", "xhtml"))
  //    try {
  //      //val srcPath = srcFile.getCanonicalPath
  //      FileUtils.writeByteArrayToFile(srcFile, source)
  //      val cmd: Array[String] = Array(
  //        "/usr/bin/abiword", "--to=xhtml", srcFile.getName, "--exp-props=html4: no; declare-xml: yes; use-awml:no; embed-css: no; embed-images: no")
  //      val p = Runtime.getRuntime.exec(cmd, Array[String](), srcFile.getParentFile)
  //      p.waitFor
  //
  //      val xhtmlData = fixXHTML(FileUtils.readFileToString(xhtmlFile))
  //
  //      val reader = new StringReader(xhtmlData)
  //
  //      try {
  //        if (!xhtmlFile.exists || xhtmlFile.length < 300) {
  //          throw new RuntimeException("HTML de saida do Abiword vazio")
  //        }
  //        else { XML.load(reader) }
  //      } finally {
  //        IOUtils.closeQuietly(reader)
  //      }    
  //    } finally {
  //      if (removeTemporaryFiles) {
  //        srcFile.delete
  //        xhtmlFile.delete
  //        val f = new File(xhtmlFile.getPath() + "_files")
  //        if (f.exists() && f.isDirectory()) {
  //          f.listFiles().foreach(_.delete)
  //          f.delete
  //        }
  //      }
  //    }
  //  }

  /** Function that turns raw source bytes into a root element, optionally using the given [[Converter]]. */
  private type SourceProcessor = (Array[Byte], Converter) => Elem

  /** Parses XHTML held in a byte array (read as UTF-8) and returns its first element node. */
  private def xhtmlLoader(in : Array[Byte]) : Elem = xhtmlLoader(new ByteArrayInputStream(in))

  /**
   * Parses the stream with `XhtmlParser`, reading it as UTF-8, and returns the
   * first `Elem` among the parsed nodes.
   *
   * @param is stream containing the XHTML document
   * @return the first element node produced by the parser
   * @throws NoSuchElementException if the parser produced no element node
   */
  private def xhtmlLoader(is : InputStream) : Elem =
    XhtmlParser(new BufferedSource(is)(Codec.UTF8))
      .collectFirst { case e : Elem => e }
      // Same exception type as the former bare `.get`, but with a message that
      // says what went wrong instead of "None.get".
      .getOrElse(throw new NoSuchElementException("xhtmlLoader: parsed input contains no XML element"))
	      
  /**
   * Builds the `(extension, processor)` pair registered for formats that need
   * an external conversion step. The processor asks the supplied converter for
   * "xhtml", parses the converted bytes and returns the first `html` element
   * found among the root and its descendants.
   *
   * @param extension source-file extension passed to the converter
   * @return the extension paired with the processor function; the processor
   *         throws NoSuchElementException when the converted document has no
   *         `html` element
   */
  private def externalConvertToXhtml(extension: String) = (extension, (data: Array[Byte], converter : Converter) => {
    // NOTE(review): this mutates a JVM-wide system property on every call.
    System.setProperty("file.encoding", "utf-8")
    val converted = converter.convert(extension, data, "xhtml")
    val r = xhtmlLoader(converted)
    // collectFirst replaces collect(...).head: same element, same exception
    // type on an empty result, but with a message naming the failing input.
    (r \\ "html").collectFirst { case e : Elem => e }.getOrElse(
      throw new NoSuchElementException("externalConvertToXhtml: no html element in XHTML converted from ." + extension))
  })
  
  private val sourceProcessorMap: Map[String, (String, SourceProcessor)] = Map(
    ("text/plain", ("txt", (source: Array[Byte], _ : Converter) => {
      val text = fixXHTML(source)

      val lines = scala.io.Source.fromBytes(text,"utf-8").getLines().to(List)
      @tailrec
      def toPars(l: List[String], r: List[String] = Nil, s: List[String] = Nil): List[String] = l match {
        case Nil => s match { case Nil => r; case _ => s.mkString("", " ", "") :: r }
        case x :: xs if x.trim.isEmpty => toPars(xs, s.mkString("", " ", "") :: r)
        case x :: xs => toPars(xs, r, x :: s)
      }
      val pars = toPars(lines).reverse
      
{ pars.map(p =>

{ p }

) }
})), ("application/xhtml+xml", ("xhtml", (source: Array[Byte],_ : Converter) => { val text = fixXHTML(source) xhtmlLoader(text) })), ("text/html", ("html", (source: Array[Byte],_ : Converter) => { val text = fixXHTML(source) xhtmlLoader(text) })), ("application/rtf", externalConvertToXhtml("rtf")), ("text/rtf", externalConvertToXhtml("rtf")), ("application/msword", externalConvertToXhtml("doc")), ("application/vnd.oasis.opendocument.text", externalConvertToXhtml("odt")), ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", externalConvertToXhtml("docx"))) def convertSrcToXHTML(source: Array[Byte], mimeType: String,converter : Converter): Option[Elem] = sourceProcessorMap.get(mimeType).map(_._2(source,converter)) def convertRTFtoXHTML(rtfSource: InputStream, converter : Converter): Option[Elem] = convertSrcToXHTML(IOUtils.toByteArray(rtfSource), "text/rtf",converter) private def selectBaseElems(root: Elem): List[Elem] = { val body = (root \\ "body").head.asInstanceOf[Elem] val belems = root.child.to(List).dropWhile ((n : Node) => n match { case e : Elem => e.label != "body" case _ => true }) def getAttr(n: Node, attr: String) = n match { case e: Elem => e.attributes.get(attr).map(_.text.toLowerCase) case x => None } def getIdOrType(n: Node) = getAttr(n, "id").orElse(getAttr(n, "type")).getOrElse("") val childs = trim(belems) val childs1 = childs.filter((n: Node) => { val x = getIdOrType(n) ; x != "header" && x != "footer" }) val (cl1, cl2) = childs1.span({ case e: Elem => e.label == "table"; case _ => false }) val childs3 = (cl1 \\ "*").filter { case e : Elem => e.label == "p" || e.label == "h1" || e.label == "h2" || e.label == "h3" case _ => false } ++ cl2 val r = wrapText(childs3.toList).collect { case e : Elem => e } r } private def chooseDivs(divs: List[Elem]): List[Elem] = divs private val parLabels: Set[String] = Set("p", "h1", "h2", "h3", "h4", "blockquote") private val isValidElem: PartialFunction[Node, Node] = { case e@Elem(_, 
"table", _, _, _@_*) => e case Elem(pref, "ol", _, scope, children@_*) => Elem(pref, "ol", Null, scope, true, children.collect(isValidElem): _*) case Elem(pref, "li", _, scope, children@_*) => Elem(pref, "li", Null, scope, true, children.collect(isValidElem orElse isContent): _*) case Elem(pref, label, attrs, scope, children@_*) if parLabels.contains(label) => Elem(pref, "p", attrs, scope, true, children: _*) } private val isContent: PartialFunction[Node, Node] = ({ case t: Text => t case e @ Elem(_, "span", _, _, _*) => e }: PartialFunction[Node, Node]).orElse(isValidElem) private def wrapText(nl: List[Node]): List[Node] = { val blockElems = List("table","thead","tbody","th","tr","td","p","blockquote","center","div","img") @tailrec def docollect(nl: List[Node], accum: List[Node] = Nil, accum2 : List[Node] = Nil): List[Node] = nl match { case Nil if trim(accum).isEmpty => accum2.reverse case Nil => (

{ NodeSeq fromSeq accum.reverse }

:: accum2).reverse case (t: Text) :: r => docollect(r, t :: accum, accum2) case (e: Elem) :: r if !blockElems.contains(e.label) => docollect(r, e :: accum, accum2) case (e : Elem) :: r if trim(accum).isEmpty => docollect(r,Nil,e :: accum2) case (e : Elem) :: r => docollect(r,Nil,e ::

{ NodeSeq fromSeq accum.reverse }

:: accum2) case b :: _ => sys.error(s"unexpected block at wrapText.docollect: $b") } docollect(nl, Nil) } private def trimLeft(nl: List[Node]) = nl.dropWhile({ case t: Text => t.text.trim.isEmpty; case _ => false }) def trim(nl: List[Node]): List[Node] = trimLeft(trimLeft(nl).reverse).reverse private val explodedBlockElements = Set("div", "center") private val explodedInlineElements = Set("font") private def explodeDivs(divs: List[Elem]) = { def explode(n: Node): List[Node] = n match { case e: Elem if explodedBlockElements.contains(e.label) => wrapText(e.child.toList).flatMap(explode) case e: Elem if explodedInlineElements.contains(e.label) => e.child.toList.flatMap(explode) case e: Elem if e.label == "td" => trim(e.child.toList) match { case List(e2: Elem) if e2.label == "p" => List(e copy (child = e2.child.toList.flatMap(explode))) case _ => List(e copy (child = e.child.toList.flatMap(explode))) } case e: Elem => List(e copy (child = e.child.toList.flatMap(explode))) case x => List(x) } divs.flatMap(explode).collect(isValidElem) } private def mapToAttributes(m: Map[String, String]) = m.foldRight(Null.asInstanceOf[MetaData])( (kv, md) => { val (k, v) = kv new UnprefixedAttribute(k, v, md) }) private def cleanAttributes: Seq[Node] => Seq[Node] = bottomUp((n: Node) => { val changeAttrs = (f: Map[String, String] => Map[String, String]) => (e: Node) => e match { case Elem(pref, name, attrs, scope, children @ _*) => Elem(pref, name, mapToAttributes(f(attrs.asAttrMap)), scope, true, children: _*) } val allowedStyles = Set[String]( "font-weight:bold", "font-weight:bolder", "font-weight:normal", "font-style:italic", "font-style:oblique", "font-style:normal", "text-decoration:underline","vertical-align:super", "vertical-align:sub" ) def cleanStyle(v: String) = { v.split(";").filter(allowedStyles.contains).mkString(";") } val filterStyle: PartialFunction[(String, String), (String, String)] = { case ("style", v : String) => ("style", cleanStyle(v)) } val cleanSpanAttrs: Elem 
=> Elem = changeAttrs(_.collect(filterStyle).toMap) val filterRedundantTDAttrs = (m: Map[String, String]) => m.filter({ case ("rowspan", "1") => false case ("colspan", "1") => false case _ => true }) def keepOnly(keyNames: String*) = (m: Map[String, String]) => m.filter(x => keyNames.contains(x._1)) val emptyAttributes = changeAttrs((_: Map[String, String]) => Map()) def saveIndentation(e: Elem) = e match { case Elem(pref, name, attrs, scope, children @ _*) => val styleMap = getStyleMap(attrs).withDefault(_ => "") def parseAndNormalize(s: String): Option[Double] = { val re = new Regex("(-?[0-9.]+)([a-z]+)") val ratios = Map[String, Double]( "mm" -> 1, "cm" -> 10, "in" -> 25.4) val m = re.findAllIn(s) if (m.nonEmpty) { val num = m.group(1).toDouble val unit = m.group(2) val ratio = ratios.get(unit) ratio.map(_ * num) } else { None } } val textIndent = parseAndNormalize(styleMap("text-indent")) val marginLeft = parseAndNormalize(styleMap("margin-left")) val centered = styleMap("text-align") == "center" val indentation = List(textIndent, marginLeft).collect({ case Some(x) => x }).sum Elem(pref, name, new UnprefixedAttribute("indentation", Text(indentation.toString), new UnprefixedAttribute("centered", centered.toString, Null)), scope, true, children: _*) } n match { case Elem(_, label, _, _, _*) => val e = n.asInstanceOf[Elem] label match { case "span" => cleanSpanAttrs(e) case "table" => changeAttrs(keepOnly("rows", "cols"))(e) case "td" => changeAttrs(filterRedundantTDAttrs.andThen(keepOnly("colspan", "rowspan")))(e) case _ => emptyAttributes(e) } case _ => n } }) private def fixSpans(nl: List[Node]): List[Node] = { nl.flatMap({ case e @ Elem(pref, label, attrs, scope, child @ _*) => val child2 = fixSpans(child.toList) e.label match { case "span" => makeSpanOrIandB(pref, scope, attrs, child2) case _ => List(Elem(pref, label, attrs, scope, true, child2: _*)) } case n => List(n) }) } private def makeSpanOrIandB(prefix: String, scope: NamespaceBinding, attrs: MetaData, 
child: Seq[Node]): Seq[Node] = { def makePair(s: String) = s.span(c => c != ':') match { case (k, "") => (k, "") case (k, v) => (k, v.substring(1)) } val attrMap = attrs.asAttrMap val styleString = attrMap.getOrElse("style", "") val otherAttrs: Map[String, String] = attrMap - "style" val styles = styleString.split(";").map(makePair).toMap val italicPresent = styles.get("font-style") match { case Some("italic") => true case Some("oblique") => true case _ => false } val boldPresent = styles.get("font-weight") match { case Some("bold") => true case Some("bolder") => true case _ => false } val isSuperScript = styles.get("vertical-align").contains("super") val isSubScript = styles.get("vertical-align").contains("sub") val hasUnderline = styles.get("text-decoration").contains("underline") val otherStyles = styles - "font-style" - "font-weight" - "text-decoration" - "vertical-align" val restMap: Map[String, String] = if (otherStyles.isEmpty) { otherAttrs } else { otherAttrs + (("style", otherStyles.toList.map(x => x._1 + ":" + x._2).mkString("", ";", ""))) } var e = if (restMap.isEmpty) { child } else { Elem(prefix, "span", mapToAttributes(restMap), scope, true, child: _*) } e = if (italicPresent) { Elem(prefix, "i", Null, scope, true, e: _*) } else { e } e = if (boldPresent) { Elem(prefix, "b", Null, scope, true, e: _*) } else { e } e = if (isSuperScript) { Elem(prefix, "sup", Null, scope, true, e: _*) } else { e } e = if (isSubScript) { Elem(prefix, "sub", Null, scope, true, e: _*) } else { e } //e = if (hasUnderline) { Elem(prefix, "u", Null, scope, true, e: _*) } else { e } if (hasUnderline) { logger.warn("text has underline!") } //logger.info("makeSpanOrIandB: attrMap = " + attrMap + ", styleString = " + styleString + ", styles = " + styles + ", italicPresent = " + italicPresent + ", boldPresent = " + boldPresent + ", restMap.isEmpty = " + restMap.isEmpty + ", res = " + child3) e } private def mapElements[T](others: Node => T, elem: Elem => T) = (n: Node) => n match 
{ case e: Elem => elem(e) case _ => others(n) } def id[T]: T => T = (t: T) => t private val validElements = Set("p", "span", "sup", "sub", "table", "tr", "td", "th", "b", "i", "ol", "li", "img", "blockquote", "u", "h1","h2","h3","h4") private val cleanSeqNodes: List[Node] => List[Node] = bottomUp(mapElements(id, (e: Elem) => if (validElements.contains(e.label)) { e } else { e.child })) private val headings = Set("h1", "h2", "h3", "h4") private val renameHeadings: List[Node] => List[Node] = bottomUp(mapElements(id, (e: Elem) => if (headings.contains(e.label)) { e copy (label = "p") } else { e })) private def bottomUp(f: Node => Seq[Node]): Seq[Node] => List[Node] = (ns: Seq[Node]) => { val chChildren = (n: Node) => changeChildren(bottomUp(f))(n) val nl = ns.iterator.toList nl.flatMap(f.compose(chChildren)) } private def topDown(f: Node => Seq[Node]): Seq[Node] => List[Node] = (ns: Seq[Node]) => { val chChildren = (n: Node) => changeChildren(bottomUp(f))(n) val nl = ns.iterator.toList nl.flatMap(f).map(chChildren) } def topDownUntil(f: PartialFunction[Node, Seq[Node]]): Seq[Node] => List[Node] = (ns: Seq[Node]) => { val rec = topDownUntil(f) ns.toList.flatMap((n: Node) => f.lift(n) match { case None => changeChildren(rec)(n) case Some(ns2) => ns2 }) } private def transformTextWith[A](f: (A, String) => (String, A))(a: A): Seq[Node] => List[Node] = (ns: Seq[Node]) => { def doit(bl: (A, List[Node]), n: Node): (A, List[Node]) = { val (b1, l) = bl n match { case x : Atom[String] => val t1 = x.text val (t2, b2) = f(b1, t1) if (t2.isEmpty) { (b2, l) } else { (b2, Text(t2) :: l) } case Elem(pref, name, attrs, scope, children @ _*) => val (b2, rl) = children.foldLeft(b1, List[Node]())(doit) (b2, Elem(pref, name, attrs, scope, true, rl.reverse: _*) :: l) case _ => (b1, n :: l) } } val (_, rl) = ns.foldLeft(a, List[Node]())(doit) rl.reverse } private def transformTextBackwardsWith[A](f: (A, String) => Option[(String, A)])(a: A): Seq[Node] => List[Node] = (ns: Seq[Node]) => { 
def doit(bl: (A, List[Node], Boolean), n: Node): (A, List[Node], Boolean) = { val (b1, l, skip) = bl if (skip) { (b1, n :: l, skip) } else n match { case x : Atom[String] => val t1 = x.text f(b1, t1) match { case None => (b1, n :: l, true) case Some((t2, b2)) => if (t2.isEmpty) { (b2, l, false) } else { (b2, Text(t2) :: l, false) } } case Elem(pref, name, attrs, scope, children @ _*) => val (b2, rl, skip2) = children.reverse.foldLeft(b1, List[Node](), skip)(doit) (b2, Elem(pref, name, attrs, scope, true, rl: _*) :: l, skip2) case _ => (b1, n :: l, skip) } } val (_, rl, _) = ns.reverse.foldLeft(a, List[Node](), false)(doit) rl } private val re1 = new Regex("(\\s| )+") private val re2 = new Regex("^ +") private val re3 = new Regex("“ +") private val re4 = new Regex(" +”") private def cleanSpaces(trimLeft: Boolean, s: String) = { val s1 = re1.replaceAllIn(s, " ") val s2 = re3.replaceAllIn(s1, "“") val s3 = re4.replaceAllIn(s2, "”") val s4 = if (!trimLeft || s3.isEmpty) { s3 } else { re2.replaceFirstIn(s3, "") } val tl = if (s4.isEmpty) { trimLeft } else { s4.endsWith(" ") } (s4, tl) } private val re5 = new Regex(" +$") private val normalizeSpace: Seq[Node] => List[Node] = topDownUntil((n: Node) => n match { case Elem(_, label, _, _, _*) if label == "p" || label == "li" || label == "blockquote" => val ns1 = transformTextWith(cleanSpaces)(true)(n) val ns2 = transformTextBackwardsWith( (skip: Boolean, t: String) => if (skip) { None } else { Some(re5.replaceFirstIn(t, ""), true) })(false)(ns1) ns2 }) private def getAttr(md: MetaData, key: String): String = { md.get(key) match { case None => "" case Some(l) => l.map(_.text).mkString("", "", "") } } private val cleanSpuriousSpans = topDown { case n@Elem(_, "span", attrs, _, children@_*) if getAttr(attrs, "style").isEmpty || n.text.trim.isEmpty => children case e: Elem if (e.label == "i" || e.label == "b") && e.text.trim.isEmpty => e.child case n => n } private val cleanNameSpaces = topDown((n: Node) => n match { case 
Elem(pref, label, attrs, _, cl @ _*) => Elem(pref, label, attrs, TopScope, true, cl: _*) case _ => n }) private def cleanRepeatedEmptyParagraphs(ns: Seq[Node]) = { def f(n: Node, ns: List[Node]) = n match { case Elem(_, name, _, _, _*) if name == "p" || name == "blockquote" => ns match { //case List() => n :: ns case (n2 @ Elem(_, "p", _, _, _*)) :: _ => if (n.text.isEmpty && n2.text.isEmpty) { ns } else { n :: ns } case _ => n :: ns } case _ => n :: ns } ns.foldRight(List[Node]())(f) } private def getStyleMap(m: MetaData) = { m.get("style") match { case None => Map[String, String]() case Some(s) => (NodeSeq fromSeq s).text.split(";"). map((x: String) => { val r = x.split(":"); (r(0), r(1)) }). toMap } } private def styleIsTheSame(m1: MetaData, m2: MetaData) = getStyleMap(m1) == getStyleMap(m2) private def collectWhiteSpace(ns: Seq[Node]): (Seq[Node], Seq[Node]) = ns.span((n: Node) => n match { case Text(t) => t.trim.isEmpty case _ => false }) private val mergeTextNodes = topDown( mapElements(id, changeChildren((ns: Seq[Node]) => { def mergeTexts(n: Node, ns: List[Node]): List[Node] = { (n, ns) match { case (Text(t1), Text(t2) :: ns2) => Text(t1 + t2) :: ns2 case _ => n :: ns } } ns.foldRight(List[Node]())(mergeTexts) }))) private val mergeSpans = bottomUp( mapElements(id, (e: Elem) => { val Elem(pref, label, attrs, scope, cl @ _*) = e def mergeit(n: Node, ns: List[Node]) = { val (ws, nss) = collectWhiteSpace(ns) val res = n match { case Elem(_, "span", attrs1, _, cl1 @ _*) => nss match { case Elem(pref, "span", attrs2, scope, cl2 @ _*) :: ns2 => if (styleIsTheSame(attrs1, attrs2)) { Elem(pref, "span", attrs2, scope, true, cl1 ++ ws ++ cl2 : _*) :: ns2 } else { n :: ns } case _ => n :: ns } case _ => n :: ns } res } val cll = cl.foldRight(List[Node]())(mergeit) Elem(pref, label, attrs, scope, true, cll: _*) })) private val cleanSpecialCharacters = { val cleanit = (x: Unit, s: String) => { val s1 = s.map((c: Char) => c match { case '\u0096' => '-' case _ => c }) 
(s1, x) } transformTextWith(cleanit)(()) } private def applySeq[T](fs: Seq[T => T]) = (v0: T) => fs.foldLeft(v0)((v: T, f: T => T) => f(v)) private def applySeqTo[T](v0: T)(fs: Seq[T => T]) = applySeq(fs)(v0) private def pipelineXHTML(xhtml: Elem): List[Node] = { def debug(where: String): List[Node] => List[Node] = (l: List[Node]) => { println("debug: " + where + ":") l.zipWithIndex foreach { case (n,i) => println(" [%20s][%06d]: %s ".format(where,i,n.toString) ) } l } val xhtml2 = renameHeadings(List(xhtml)).collect { case e : Elem => e }.head val baseElems = selectBaseElems(xhtml2) val divs = chooseDivs(baseElems) val validElems = explodeDivs(divs) val res = applySeqTo(validElems)(List[List[Node] => List[Node]]( //debug("start"), cleanNameSpaces, //debug("after cleanNameSpaces"), cleanSeqNodes, //debug("after cleanSeqNodes"), _.flatMap(cleanAttributes), normalizeSpace, cleanSpuriousSpans, mergeTextNodes, mergeSpans, fixSpans, cleanRepeatedEmptyParagraphs, cleanSpecialCharacters)) res } def pipelineWithDefaultConverter(source: Array[Byte], mimeType: String) : Option[List[Node]] = pipeline(source,mimeType,defaultConverter) def pipeline(source: Array[Byte], mimeType: String, converter : Converter): Option[List[Node]] = convertSrcToXHTML(source, mimeType,converter).map(pipelineXHTML) def pipeline(rtfSource: InputStream,converter : Converter = defaultConverter): XHTMLProcessorResult = pipeline(IOUtils.toByteArray(rtfSource), "text/rtf",converter) match { case None => Failure case Some(x) => Success(x) } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy