All Downloads are FREE. Search and download functionalities are using the official Maven repository.

raw.inferrer.local.xml.XmlInferrer.scala Maven / Gradle / Ivy

There is a newer version: 0.33.11
Show newest version
/*
 * Copyright 2023 RAW Labs S.A.
 *
 * Use of this software is governed by the Business Source License
 * included in the file licenses/BSL.txt.
 *
 * As of the Change Date specified in that file, in accordance with
 * the Business Source License, use of this software will be governed
 * by the Apache License, Version 2.0, included in the file
 * licenses/APL.txt.
 */

package raw.inferrer.local.xml

import com.typesafe.scalalogging.StrictLogging
import raw.inferrer.api._
import raw.inferrer.local._
import raw.sources.api._
import raw.sources.bytestream.api.SeekableInputStream
import raw.utils.RawException

import java.io.Reader
import javax.xml.stream.XMLStreamException
import scala.util.control.NonFatal

/** Companion holding the configuration key used by the XML inferrer. */
object XmlInferrer {

  /** Settings key read by the inferrer to obtain its default sample size. */
  private val XML_SAMPLE_SIZE: String = "raw.inferrer.local.xml.sample-size"

}

/**
 * Infers a format descriptor for XML input by reading up to a sample-size number of
 * top-level XML objects and merging their types.
 *
 * The default sample size is read from the `raw.inferrer.local.xml.sample-size` setting
 * and is used whenever a caller does not supply one.
 */
class XmlInferrer(implicit protected val sourceContext: SourceContext)
    extends InferrerErrorHandler
    with XmlMergeTypes
    with EncodingInferrer
    with StrictLogging {

  import XmlInferrer._

  private val defaultSampleSize = settings.getInt(XML_SAMPLE_SIZE)

  /**
   * Infers the XML format of a byte stream, wrapping the result together with the encoding
   * (and its confidence) obtained from `getTextBuffer`.
   *
   * The text reader obtained from `getTextBuffer` is always closed before returning.
   *
   * @param is              the input stream to inspect
   * @param maybeEncoding   optional encoding, forwarded to `getTextBuffer`
   * @param maybeSampleSize optional number of top-level objects to sample; see the
   *                        `Reader` overload for how it is interpreted
   * @return the text-stream descriptor carrying the encoding, confidence and inferred format
   * @throws RawException rethrown unchanged if raised during inference; any other non-fatal
   *                      error is wrapped in a new `RawException`
   */
  def infer(
      is: SeekableInputStream,
      maybeEncoding: Option[Encoding],
      maybeSampleSize: Option[Int]
  ): TextInputStreamFormatDescriptor = {
    val r = getTextBuffer(is, maybeEncoding)
    try {
      TextInputStreamFormatDescriptor(r.encoding, r.confidence, infer(r.reader, maybeSampleSize))
    } catch {
      case ex: RawException => throw ex
      case NonFatal(e) => throw new RawException("xml inference failed unexpectedly", e)
    } finally {
      r.reader.close()
    }
  }

  /**
   * Infers the XML format from a character reader.
   *
   * @param reader          the source of XML text; it is not closed by this method
   * @param maybeSampleSize number of top-level objects to read; falls back to the configured
   *                        default when empty, and a negative value is replaced by
   *                        `Int.MaxValue`
   * @return the inferred XML format descriptor
   * @throws LocalInferrerException if the XML parser raises an `XMLStreamException`
   */
  def infer(reader: Reader, maybeSampleSize: Option[Int]): TextInputFormatDescriptor = {
    try {
      val requested = maybeSampleSize.getOrElse(defaultSampleSize)
      // A negative sample size is mapped to the largest possible limit.
      val nobjs = if (requested < 0) Int.MaxValue else requested
      val xmlReader = new InferrerXmlTypeReader(reader, nobjs)

      // Fold each object read into the running type, starting from the "nothing" type.
      var innerType: SourceType = SourceNothingType()
      var count = 0
      while (xmlReader.hasNext && count < nobjs) {
        innerType = xmlReader.nextObj(innerType)
        count += 1
      }

      val result = xmlReader.uniquifyTemporalFormats(innerType)
      // If multiple top-level XML documents, make it a collection.
      val tipe = if (count > 1) SourceCollectionType(result.cleanedType, false) else result.cleanedType
      XmlInputFormatDescriptor(tipe, xmlReader.sampled, result.timeFormat, result.dateFormat, result.timestampFormat)
    } catch {
      case ex: XMLStreamException =>
        // XMLStreamException.getLocation may be null (e.g. message-only constructor), so guard
        // it to avoid a NullPointerException replacing the actual parse error.
        val message = Option(ex.getLocation) match {
          case Some(location) =>
            val col = location.getColumnNumber
            val row = location.getLineNumber
            s"error parsing XML at row: $row, col: $col"
          case None => "error parsing XML at unknown location"
        }
        throw new LocalInferrerException(message, ex)
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy