scala.xml.include.sax.EncodingHeuristics.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-core Show documentation
Show all versions of spark-core Show documentation
Shaded version of Apache Spark 2.x.x for Presto
The newest version!
/* __ *\
** ________ ___ / / ___ Scala API **
** / __/ __// _ | / / / _ | (c) 2002-2013, LAMP/EPFL **
** __\ \/ /__/ __ |/ /__/ __ | http://scala-lang.org/ **
** /____/\___/_/ |_/____/_/ | | **
** |/ **
\* */
package scala
package xml
package include.sax
import java.io.InputStream
import scala.util.matching.Regex
/**
 * `EncodingHeuristics` reads the first bytes of a stream and attempts to
 * guess what the encoding of the XML text in the stream is.
 *
 * The stream is rewound with `mark`/`reset` after inspection, so it should be
 * a stream that supports those operations (typically a buffered one).
 * If the encoding cannot be determined, the default UTF-8 is returned.
 *
 * @author Burak Emir
 * @author Paul Phillips
 */
object EncodingHeuristics {

  /** Names of the encodings this heuristic can report. */
  object EncodingNames {
    // UCS-4 isn't yet implemented in java releases anyway...
    val bigUCS4: String = "UCS-4"
    val littleUCS4: String = "UCS-4"
    val unusualUCS4: String = "UCS-4"
    val bigUTF16: String = "UTF-16BE"
    val littleUTF16: String = "UTF-16LE"
    val utf8: String = "UTF-8"
    val default: String = utf8
  }
  import EncodingNames._

  /** Size of the look-ahead window; enough to read most XML encoding declarations. */
  private val BytesToRead = 1024

  /** Number of bytes consumed up front to look for a byte order mark or `<?xm`. */
  private val PrefixLength = 4

  /** Extracts the quoted value of an `encoding` pseudo-attribute; compiled once. */
  private val EncodingDeclaration = """(?m).*?encoding\s*=\s*["'](.+?)['"]""".r

  /** Terminator of an XML or text declaration. */
  private val DeclarationEnd = "?>"

  /**
   * This utility method attempts to determine the XML character encoding
   * by examining the input stream, as specified at
   * [[http://www.w3.org/TR/xml/#sec-guessing w3]].
   *
   * The stream is marked before anything is read and reset before returning,
   * so the caller can read it again from the same position.
   *
   * @param in `InputStream` to read from.
   * @throws java.io.IOException if the stream cannot be reset
   * @return the name of the encoding.
   */
  def readEncodingFromStream(in: InputStream): String = {
    // This may fail if there are a lot of space characters before the end
    // of the encoding declaration: only BytesToRead bytes are inspected.
    in.mark(BytesToRead)
    val prefix = (in.read(), in.read(), in.read(), in.read())
    // `getOrElse` is by-name: nothing more is read when a byte order mark is found.
    val encoding = byteOrderMarkEncoding(prefix).getOrElse(encodingWithoutByteOrderMark(prefix, in))
    in.reset()
    encoding
  }

  /** Returns the encoding announced by a leading byte order mark, if there is one. */
  private def byteOrderMarkEncoding(prefix: (Int, Int, Int, Int)): Option[String] =
    prefix match {
      case (0x00, 0x00, 0xFE, 0xFF) => Some(bigUCS4)
      case (0xFF, 0xFE, 0x00, 0x00) => Some(littleUCS4)
      case (0x00, 0x00, 0xFF, 0xFE) => Some(unusualUCS4)
      case (0xFE, 0xFF, 0x00, 0x00) => Some(unusualUCS4)
      case (0xFE, 0xFF, _, _)       => Some(bigUTF16)
      case (0xFF, 0xFE, _, _)       => Some(littleUTF16)
      case (0xEF, 0xBB, 0xBF, _)    => Some(utf8)
      case _                        => None
    }

  /**
   * Guesses the encoding when no byte order mark is present, in which case the
   * first character must be '<' or whitespace.
   */
  private def encodingWithoutByteOrderMark(prefix: (Int, Int, Int, Int), in: InputStream): String =
    prefix match {
      case (0x00, 0x00, 0x00, '<')  => bigUCS4
      case ('<', 0x00, 0x00, 0x00)  => littleUCS4
      case (0x00, 0x00, '<', 0x00)  => unusualUCS4
      case (0x00, '<', 0x00, 0x00)  => unusualUCS4
      case (0x00, '<', 0x00, '?')   => bigUTF16    // XXX must read encoding
      case ('<', 0x00, '?', 0x00)   => littleUTF16 // XXX must read encoding
      case ('<', '?', 'x', 'm')     => declaredEncoding(in)
      case (0x4C, 0x6F, 0xA7, 0x94) => utf8        // XXX EBCDIC
      case _                        => utf8        // no XML or text declaration present
    }

  /**
   * Reads the rest of the look-ahead window and returns the encoding named in
   * the XML declaration, or the default when the declaration names none.
   *
   * Only the text before the first `?>` is searched, so an `encoding`
   * attribute on an element in the document body is not mistaken for the
   * encoding declaration. If no `?>` is found inside the window, the whole
   * window is searched.
   */
  private def declaredEncoding(in: InputStream): String = {
    val window = new Array[Byte](BytesToRead - PrefixLength)
    val length = readFully(in, window)
    // Use Latin-1 (ISO-8859-1) because all byte sequences are legal.
    val text = new String(window, 0, length, "ISO-8859-1")
    val end = text.indexOf(DeclarationEnd)
    val declaration = if (end >= 0) text.substring(0, end) else text
    EncodingDeclaration.findFirstMatchIn(declaration).fold(default)(_.group(1))
  }

  /**
   * Fills `buffer` from `in` until it is full or the stream ends, and returns
   * the number of bytes stored (0 at immediate end of stream, never negative).
   *
   * A single `read` call may legally return fewer bytes than requested, and
   * returns -1 at end of stream; both cases are handled here.
   */
  private def readFully(in: InputStream, buffer: Array[Byte]): Int = {
    @scala.annotation.tailrec
    def fill(filled: Int): Int =
      if (filled >= buffer.length) filled
      else {
        val count = in.read(buffer, filled, buffer.length - filled)
        // Stop on end of stream (-1); also stop on 0 to avoid spinning forever.
        if (count <= 0) filled else fill(filled + count)
      }
    fill(0)
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy