All Downloads are FREE. Search and download functionalities are using the official Maven repository.

biz.neumann.parser.HTMLParser.scala Maven / Gradle / Ivy

The newest version!
package biz.neumann

/**
 * AN-iT
 *
 * User: Andreas Neumann
 * Mail: [email protected]
 * URL: http://www.an-it.com
 * Date: 31.10.11
 * Time: 19:52
 * Package: biz.neumann.parser
 */

import xml.NodeSeq
import java.net.URL
import java.io.{FileInputStream, File, InputStream}

/**
 * Parses HTML into a scala.xml [[NodeSeq]] by running the input through the
 * TagSoup SAX parser and a `NoBindingFactoryAdapter`.
 *
 * A single parser and a single adapter instance are created per `HTMLParser`
 * and reused for every call.
 * NOTE(review): because that state is shared, one instance is presumably not
 * safe for concurrent use from several threads — confirm before sharing it.
 */
class HTMLParser {

  /* Source: http://www.hars.de/2009/01/html-as-xml-in-scala.html */
  val parserFactory = new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl
  val parser = parserFactory.newSAXParser()
  val adapter = new scala.xml.parsing.NoBindingFactoryAdapter

  /**
   * Parses the HTML document stored in `file`.
   *
   * The file stream is opened here and is always closed again, even when
   * parsing fails.
   *
   * @param file the file to read
   * @return the parsed document
   */
  def fromFile(file : File) : NodeSeq = parseAndClose( new FileInputStream( file ) )

  /**
   * Parses the HTML document stored at the file-system path `file`.
   *
   * @param file path of the file to read
   * @return the parsed document
   */
  def fromFile(file : String) : NodeSeq = fromFile( new File(file) )

  /**
   * Downloads and parses the HTML document at `url`.
   *
   * The stream opened on the URL is always closed again, even when parsing
   * fails.
   *
   * @param url textual URL of the document
   * @return the parsed document
   */
  def fromURL(url : String) : NodeSeq = parseAndClose( new URL(url).openStream() )

  /**
   * Parses an HTML document from an already opened stream.
   *
   * The stream is NOT closed by this method; it stays under the caller's
   * control. The input encoding is always declared as UTF-8, whatever the
   * document itself announces.
   *
   * @param stream the stream to read the document from
   * @return the parsed document
   */
  def fromStream(stream : InputStream ) : NodeSeq = {
    val source = new org.xml.sax.InputSource(stream)
    source.setEncoding("UTF-8") // Force UTF-8
    adapter.loadXML(source, parser)
  }

  /** Parses `stream` via [[fromStream]] and closes it afterwards, on success and on failure. */
  private def parseAndClose(stream : InputStream) : NodeSeq =
    try fromStream(stream)
    finally stream.close()

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy