// ch.ninecode.cim.CIMRecordReader.scala

package ch.ninecode.cim

import org.apache.commons.logging.{Log, LogFactory}
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.FileSplit

import ch.ninecode.model.Element

/**
 * Hadoop record reader that hands out the elements parsed from one split of an XML file.
 *
 * The bytes of the split (plus up to CHIM.OVERREAD bytes beyond its end, when the file
 * has them) are read into memory, decoded as UTF-8 and passed to a [[CHIM]] instance,
 * to which all record iteration is delegated.
 *
 * @param debug when `true`, log details in `initialize` and `close` (including the
 *              errors collected in `cim.context.errors`); the value is also copied
 *              to `CIMContext.DEBUG`
 */
class CIMRecordReader (debug: Boolean = false) extends RecordReader[String, Element]
{
    val log: Log = LogFactory.getLog (getClass)

    // the parser for the current split; null until initialize succeeds and again after close
    var cim: CHIM = _

    /**
     * Read the split into memory and create the parser for it.
     *
     * @param genericSplit the split to read, must be a FileSplit
     * @param context the task context supplying the Hadoop configuration
     * @throws IllegalArgumentException if the split (plus over-read) does not fit in a single array
     */
    def initialize (genericSplit: InputSplit, context: TaskAttemptContext): Unit =
    {
        if (debug)
        {
            log.info ("initialize")
            log.info (s"genericSplit: ${genericSplit.toString}")
            log.info (s"context: ${context.getTaskAttemptID.toString}")
        }
        val job = context.getConfiguration
        val split = genericSplit.asInstanceOf [FileSplit]
        val start = split.getStart
        val bytes = split.getLength
        val file = split.getPath
        val fs = file.getFileSystem (job)

        // read past the end of the split (when the file is long enough)
        val end = start + bytes
        val available = fs.getFileStatus (file).getLen
        val extra = if (available > end) Math.min (CHIM.OVERREAD.toLong, available - end) else 0L

        // a JVM array cannot hold more than Int.MaxValue bytes;
        // fail loudly instead of silently truncating the size with toInt
        val total = bytes + extra
        require (total <= Int.MaxValue.toLong, s"split of $total bytes (including over-read) exceeds the maximum buffer size of ${Int.MaxValue} bytes")
        val size = total.toInt
        val buffer = new Array[Byte](size)

        // positioned read of the split, always releasing the stream afterwards
        val in = fs.open (file)
        try
            in.readFully (start, buffer)
        finally
            in.close ()

        // strip any BOM (Byte Order Mark) i.e. 0xEF,0xBB,0xBF at the very start of the file;
        // bytes are signed, so they must be masked before comparing with values above 0x7f
        val low =
            if ((0L == start) && (size >= 3)
                && ((buffer (0) & 0xff) == 0xef)
                && ((buffer (1) & 0xff) == 0xbb)
                && ((buffer (2) & 0xff) == 0xbf))
                3
            else
                0

        val first =
            if (0L != start)
            {
                // the split may begin in the middle of a multi-byte UTF-8 sequence:
                // unless the first byte has both high order bits set,
                // advance past at most 4 bytes that have the high order bit set
                // (the bound is tested before the array is indexed)
                val limit = Math.min (4, size)
                var i = 0
                if ((size > 0) && ((buffer (low) & 0xc0) != 0xc0))
                    while ((i < limit) && (0 != (buffer (low + i) & 0x80)))
                        i += 1
                low + i
            }
            else
                low

        // decode everything that was read, and separately count
        // the characters that belong to the split proper (without the over-read)
        val xml = Text.decode (buffer, first, size - first)
        val len = if (0 == extra) xml.length else Text.decode (buffer, first, (size - first - extra).toInt).length

        // ToDo: using first here is approximate,
        // the real character count would require reading the complete file
        // from 0 to (start + first) and converting to characters
        if (debug)
            log.debug (s"XML text starting at byte offset ${start + first} of length $len characters begins with: ${xml.take (120)}") // take tolerates text shorter than 120 characters
        CIMContext.DEBUG = debug
        // NOTE(review): the arguments appear to be the text, the character range of the split
        // within it, and the byte range of the split within the file - confirm against CHIM
        cim = new CHIM (xml, first, first + len, start, start + bytes)
    }

    /**
     * Release the parser, logging any errors it collected when debugging.
     *
     * Safe to call when initialize did not complete, or more than once.
     */
    def close (): Unit =
    {
        if (debug)
        {
            log.info ("close")
            if (null != cim)
                for (error <- cim.context.errors)
                    log.error (error)
        }
        cim = null
    }

    /** The key of the current record: the id of the current element. */
    def getCurrentKey: String = cim.value.id

    /** The current record: the current element held by the parser. */
    def getCurrentValue: Element = cim.value

    /** The progress as reported by the parser. */
    def getProgress: Float = cim.progress ()

    /** Advance to the next record by delegating to the parser's `parse_one`. */
    def nextKeyValue (): Boolean = cim.parse_one ()
}