
better.files.UnicodeCharset.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of better-files_2.11 Show documentation
Show all versions of better-files_2.11 Show documentation
Simple, safe and intuitive I/O in Scala
The newest version!
package better.files
import java.nio.charset._
import java.nio.{BufferOverflowException, ByteBuffer, CharBuffer}
import scala.collection.JavaConverters._
/** A Unicode charset that handles byte-order markers
*
* @param underlyingCharset Use this charset if no known byte-order marker is detected; use this for encoding too
* @param writeByteOrderMarkers If set, write BOMs while encoding
*/
class UnicodeCharset(underlyingCharset: Charset, writeByteOrderMarkers: Boolean)
extends Charset(underlyingCharset.name(), underlyingCharset.aliases().asScala.toArray) {
override def newDecoder() = new UnicodeDecoder(underlyingCharset)
override def newEncoder() =
if (writeByteOrderMarkers) new BomEncoder(underlyingCharset) else underlyingCharset.newEncoder()
override def contains(cs: Charset) = underlyingCharset.contains(cs)
}
/** A Unicode decoder that uses the Unicode byte-order marker (BOM) to auto-detect the encoding
* (if none detected, falls back on the defaultCharset). This also gets around a bug in the JDK
* (http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058) where BOM is not consumed for UTF-8.
* See: https://github.com/pathikrit/better-files/issues/107
*
* @param defaultCharset Use this charset if no known byte-order marker is detected
*/
class UnicodeDecoder(defaultCharset: Charset) extends CharsetDecoder(defaultCharset, 1, 1) {
import UnicodeCharset.bomTable
private[this] var inferredCharset: Option[Charset] = None
@annotation.tailrec
private[this] def decode(
in: ByteBuffer,
out: CharBuffer,
candidates: Set[Charset] = Set.empty,
firstCall: Boolean
): CoderResult = {
if (isCharsetDetected) {
detectedCharset().newDecoder().decode(in, out, false)
} else if (firstCall && in.position() != 0) {
// See: https://github.com/pathikrit/better-files/pull/384
inferredCharset = Some(defaultCharset)
decode(in, out, firstCall = false)
} else if (candidates.isEmpty || !in.hasRemaining) {
inferredCharset = Some(defaultCharset)
in.rewind()
decode(in, out, firstCall = false)
} else if (candidates.forall(c => bomTable(c).length == in.position())) {
inferredCharset = candidates.headOption.ensuring(candidates.size == 1, "Ambiguous BOMs found")
decode(in, out, firstCall = false)
} else {
val idx = in.position()
val byte = in.get()
def isPossible(charset: Charset) = bomTable(charset).lift(idx).contains(byte)
decode(in, out, candidates.filter(isPossible), firstCall = false)
}
}
override def decodeLoop(in: ByteBuffer, out: CharBuffer) =
decode(in = in, out = out, candidates = bomTable.keySet, firstCall = true)
override def isCharsetDetected = inferredCharset.isDefined
override def isAutoDetecting = true
override def implReset() = inferredCharset = None
override def detectedCharset() =
inferredCharset.getOrElse(throw new IllegalStateException("Insufficient bytes read to determine charset"))
}
/** Encoder that writes the BOM for this charset
* @param charset
*/
class BomEncoder(charset: Charset) extends CharsetEncoder(charset, 1, 1) {
private[this] val bom = UnicodeCharset.bomTable
.getOrElse(charset, throw new IllegalArgumentException(s"$charset does not support BOMs"))
.toArray
private[this] var isBomWritten = false
override def encodeLoop(in: CharBuffer, out: ByteBuffer): CoderResult = {
if (!isBomWritten) {
try {
out.put(bom)
} catch {
case _: BufferOverflowException => return CoderResult.OVERFLOW
} finally {
isBomWritten = true
}
}
charset.newEncoder().encode(in, out, true)
}
override def implReset() = isBomWritten = false
}
object UnicodeCharset {
private[files] val bomTable: Map[Charset, IndexedSeq[Byte]] = Map(
"UTF-8" -> IndexedSeq(0xef, 0xbb, 0xbf),
"UTF-16BE" -> IndexedSeq(0xfe, 0xff),
"UTF-16LE" -> IndexedSeq(0xff, 0xfe),
"UTF-32BE" -> IndexedSeq(0x00, 0x00, 0xfe, 0xff),
"UTF-32LE" -> IndexedSeq(0xff, 0xfe, 0x00, 0x00)
).collect {
case (charset, bytes) if Charset.isSupported(charset) => Charset.forName(charset) -> bytes.map(_.toByte)
}.ensuring(_.nonEmpty, "No unicode charset detected")
def isValid(charset: Charset): Boolean = bomTable.contains(charset)
def apply(charset: Charset, writeByteOrderMarkers: Boolean = false): Charset =
if (isValid(charset)) new UnicodeCharset(charset, writeByteOrderMarkers)
else charset
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy