/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.spark.sql

import java.sql.Timestamp

import scala.collection.mutable.LinkedHashMap
import scala.collection.mutable.Map

import org.elasticsearch.hadoop.serialization.FieldType
import org.elasticsearch.hadoop.serialization.Parser
import org.elasticsearch.hadoop.serialization.builder.ValueParsingCallback
import org.elasticsearch.spark.serialization.ScalaValueReader
import org.apache.commons.logging.LogFactory
import org.elasticsearch.hadoop.cfg.ConfigurationOptions
import org.elasticsearch.hadoop.util.StringUtils
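
/**
 * A [[org.elasticsearch.spark.serialization.ScalaValueReader]] that materializes parsed
 * documents as Spark SQL rows ([[ScalaEsRow]]) rather than plain Scala maps: it tracks the
 * field currently being parsed, coerces the leading metadata object to String values, and
 * resolves per-array column ordering from the associated Spark schema.
 */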
class ScalaRowValueReader extends ScalaValueReader with RowValueReader with ValueParsingCallback {
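
  // Parser state: metadataMap flags the leading metadata object (whose values are coerced to
  // String), rootLevel marks the top of the _source object, and inArray/currentArrayRowOrder
  // track the column order for rows created inside the array currently being parsed.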
  var metadataMap = true
  var rootLevel = true
  var inArray = false
  var currentArrayRowOrder: Seq[String] = Seq.empty[String]
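
  // Resolve the name of the field currently being parsed before delegating to the superclass;
  // values encountered outside any field are filed under Utils.ROOT_LEVEL_NAME.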
  override def readValue(parser: Parser, value: String, esType: FieldType) = {
    sparkRowField = if (getCurrentField == null) null else getCurrentField.getFieldName

    if (sparkRowField == null) {
      sparkRowField = Utils.ROOT_LEVEL_NAME
    }

    super.readValue(parser, value, esType)
  }
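
  // Returns either the metadata container (a map coercing every value to String) or a
  // ScalaEsRow whose column order comes from the Spark schema; inside arrays the order is
  // re-resolved from the parser's field stack to survive the empty-object edge case below.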
  override def createMap() = {
    if (readMetadata && metadataMap) {
      metadataMap = false
      // metadata has schema [String, String], so convert all values (like score) to String
      new LinkedHashMap[Any, Any] {
        override def put(key: Any, value: Any): Option[Any] = {
          super.put(key, if (value != null) value.toString() else null)
        }
      }
    }
    else {
      val rowOrd =
        if (inArray) {
          // Recollect the current field name. If the last thing we read before a new object in a list was an empty
          // object, we won't be able to find the correct row order for the next row being created.
          // Example: foo:[{bar: baz, qux:{}},{bar:bizzy}]
          //                               ^  ^____ this could break because the parser thinks that
          //                                \_________ this field is the current one and loads the wrong row order.
          // By re-resolving the current field, we can avoid this edge case, because that field is managed by a stack
          // in the superclass instead of the local sparkRowField.
          val latestRowField = if (getCurrentField == null) null else getCurrentField.getFieldName
          if (latestRowField == null) {
            throw new IllegalStateException(
              "No field information could be found while creating map for " +
                s"array: previous field [${sparkRowField}], row order [${currentArrayRowOrder}]"
            )
          }

          if (rowColumnsMap.contains(latestRowField)) {
            rowColumns(latestRowField)
          }
          else {
            currentArrayRowOrder
          }
        }
        else rowColumns(sparkRowField)

      new ScalaEsRow(rowOrd)
    }
  }
  // start of array: remember the enclosing state so nested arrays can restore it later
  override def createArray(typ: FieldType): AnyRef = {
    val previousLevel = (inArray, currentArrayRowOrder)

    if (arrayFields.contains(sparkRowField)) {
      inArray = true
      // array of objects - use the column order declared for this field in the schema
      if (rowColumnsMap.contains(sparkRowField)) {
        currentArrayRowOrder = rowColumns(sparkRowField)
      }
      // array of values - no row order to track
      else {
        // ignore
      }
    }
    else {
      LogFactory.getLog(getClass).warn(
        s"""Field '$sparkRowField' is backed by an array but the associated Spark Schema does not reflect this;
              (use ${ConfigurationOptions.ES_READ_FIELD_AS_ARRAY_INCLUDE}/exclude) """.stripMargin)
    }

    // since the list itself is not actually used, return the parent field information
    // so that nested arrays can restore it once this array ends
    previousLevel
  }
  // end of array: restore the enclosing state captured by createArray
  override def addToArray(array: AnyRef, values: java.util.List[Object]): AnyRef = {
    // restore previous state
    array match {
      case (pastInArray: Boolean, pastRowOrder: Seq[String @unchecked]) => {
        inArray = pastInArray
        currentArrayRowOrder = pastRowOrder
      }
      case _ => {
        inArray = false
        currentArrayRowOrder = null
      }
    }

    super.addToArray(array, values)
  }
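
  // Plain maps are populated by the superclass; Spark rows are populated positionally by
  // matching the current key against the row's known column order.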
  override def addToMap(map: AnyRef, key: AnyRef, value: Any) = {
    map match {
      case m: Map[_, _]  => super.addToMap(map, key, value)
      case r: ScalaEsRow => addToBuffer(r, key, value)
    }
  }
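
  // The parser hands dates over as millisecond-since-epoch longs; expose them as
  // java.sql.Timestamp, which Spark SQL maps to TimestampType.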
  override def createDate(value: Long) = {
    new Timestamp(value)
  }
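
  // ValueParsingCallback hooks: invoked by the parser around the document, its leading and
  // trailing metadata, the (possibly excluded) _source section, and geo fields; used here
  // to reset the per-document parsing state.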
  def beginDoc(): Unit = {}

  def beginLeadMetadata(): Unit = { metadataMap = true }

  def endLeadMetadata(): Unit = {}

  def beginSource(): Unit = { rootLevel = true; sparkRowField = Utils.ROOT_LEVEL_NAME }

  def endSource(): Unit = {}

  def excludeSource(): Unit = { rootLevel = true; sparkRowField = Utils.ROOT_LEVEL_NAME }

  def beginTrailMetadata(): Unit = {}

  def endTrailMetadata(): Unit = {}

  def endDoc(): Unit = {}

  def beginGeoField(): Unit = {
    currentFieldIsGeo = true
  }

  def endGeoField(): Unit = {
    currentFieldIsGeo = false
  }
}
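
// Minimal usage sketch (a hedged example, not part of this file: the index name "myindex"
// and the array field "tags" are hypothetical). The connector instantiates this reader
// internally when an index is loaded as a DataFrame; fields backed by arrays of objects
// must be declared up front, per the warning emitted in createArray above:
//
//   val df = sqlContext.read
//     .format("org.elasticsearch.spark.sql")
//     .option("es.read.field.as.array.include", "tags")
//     .load("myindex")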