org.locationtech.geomesa.arrow.io.SimpleFeatureArrowFileWriter.scala

/***********************************************************************
 * Copyright (c) 2013-2024 Commonwealth Computer Research, Inc.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Apache License, Version 2.0
 * which accompanies this distribution and is available at
 * http://www.opensource.org/licenses/apache2.0.php.
 ***********************************************************************/

package org.locationtech.geomesa.arrow.io

import com.typesafe.scalalogging.LazyLogging
import org.apache.arrow.vector.VectorSchemaRoot
import org.apache.arrow.vector.dictionary.{Dictionary, DictionaryProvider}
import org.apache.arrow.vector.ipc.ArrowStreamWriter
import org.apache.arrow.vector.ipc.message.IpcOption
import org.geotools.api.feature.simple.{SimpleFeature, SimpleFeatureType}
import org.locationtech.geomesa.arrow.vector.SimpleFeatureVector.SimpleFeatureEncoding
import org.locationtech.geomesa.arrow.vector.{ArrowDictionary, SimpleFeatureVector}
import org.locationtech.geomesa.utils.io.CloseWithLogging

import java.io.{Closeable, Flushable, OutputStream}
import java.nio.channels.Channels

/**
  * For writing simple features to an Arrow file.
  *
  * Uses the Arrow streaming format (no footer). Closing the SimpleFeatureArrowFileWriter also
  * closes the given output stream. A usage sketch follows the class body.
  *
  * @param vector simple feature vector
  * @param provider dictionary provider
  * @param os output stream
  * @param ipcOpts Arrow IPC options
  * @param sort sort field and reverse flag, if the features are sorted, encoded as schema metadata
  * @param flattenStruct if true, expose the feature attributes as top-level vectors in the schema
  *                      root instead of a single struct vector
  */
class SimpleFeatureArrowFileWriter private (
    vector: SimpleFeatureVector,
    provider: DictionaryProvider with Closeable,
    os: OutputStream,
    ipcOpts: IpcOption,
    sort: Option[(String, Boolean)],
    flattenStruct: Boolean = false
  ) extends Closeable with Flushable with LazyLogging {
  private val metadata = sort.map { case (field, reverse) => getSortAsMetadata(field, reverse) }.orNull
  private val root = {
    val potentialRoot = createRoot(vector.underlying, metadata)

    if (flattenStruct) {
      // expose the children of the feature struct vector as top-level vectors in the schema root
      new VectorSchemaRoot(potentialRoot.getVector(sft.getTypeName))
    } else {
      potentialRoot
    }
  }
  private val writer = new ArrowStreamWriter(root, provider, Channels.newChannel(os), ipcOpts)

  private var index = 0

  def sft: SimpleFeatureType = vector.sft

  /**
    * Buffer a feature to write
    *
    * @param sf simple feature
    */
  def add(sf: SimpleFeature): Unit = {
    vector.writer.set(index, sf)
    index += 1
  }

  /**
    * Writes any currently buffered features to the output stream as a single Arrow record batch.
    * Note that if there are no buffered features, an empty record batch will be written.
    * (A chunked-write sketch follows this method.)
    */
  override def flush(): Unit = {
    vector.writer.setValueCount(index)
    root.setRowCount(index)
    writer.writeBatch()
    vector.clear()
    index = 0
  }
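
  // Batching sketch (not part of the original source): each call to flush() emits one Arrow
  // record batch, so writing a large result set in fixed-size chunks bounds the memory held in
  // the feature vector. `features`, `batchSize` and `fileWriter` (a SimpleFeatureArrowFileWriter)
  // are hypothetical caller-supplied values.
  //
  //   features.grouped(batchSize).foreach { chunk =>
  //     chunk.foreach(fileWriter.add)
  //     fileWriter.flush()
  //   }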

  /**
    * Close the writer and flush any buffered features
    */
  override def close(): Unit = {
    try {
      if (index > 0) {
        flush()
      }
      writer.end()
    } finally {
      // note: don't close the vector schema root as it closes the vector as well
      CloseWithLogging.raise(Seq(writer, provider, vector))
    }
  }
}
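
// Usage sketch (not part of the original source): create a writer via the companion object,
// buffer a couple of features, and let close() flush the final batch and close the stream.
// The feature type spec, SimpleFeatureEncoding.min(true) and new IpcOption() are assumptions
// chosen for illustration; no dictionary-encoded fields are used.
//
//   import org.apache.arrow.vector.ipc.message.IpcOption
//   import org.locationtech.geomesa.arrow.vector.SimpleFeatureVector.SimpleFeatureEncoding
//   import org.locationtech.geomesa.features.ScalaSimpleFeature
//   import org.locationtech.geomesa.utils.geotools.SimpleFeatureTypes
//   import org.locationtech.geomesa.utils.io.WithClose
//   import java.io.ByteArrayOutputStream
//
//   val sft = SimpleFeatureTypes.createType("example", "name:String,dtg:Date,*geom:Point:srid=4326")
//   val out = new ByteArrayOutputStream()
//   WithClose(SimpleFeatureArrowFileWriter(out, sft, Map.empty, SimpleFeatureEncoding.min(true), new IpcOption(), None)) { writer =>
//     writer.add(ScalaSimpleFeature.create(sft, "0", "alice", "2024-01-01T00:00:00.000Z", "POINT (45 55)"))
//     writer.add(ScalaSimpleFeature.create(sft, "1", "bob", "2024-01-02T00:00:00.000Z", "POINT (46 56)"))
//   } // close() writes any buffered features and closes `out`
//   val arrowStreamBytes = out.toByteArray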

object SimpleFeatureArrowFileWriter {

  import scala.collection.JavaConverters._

  /**
    * For writing simple features to an Arrow file.
    *
    * Uses the Arrow streaming format (no footer).
    *
    * @param os output stream
    * @param sft simple feature type
    * @param dictionaries map of field names to dictionary values, used for dictionary encoding fields.
    *                     All values must be provided up front.
    * @param encoding encoding options
    * @param ipcOpts Arrow IPC options
    * @param sort sort field and reverse flag, if the features are sorted, encoded as schema metadata
    * @param flattenStruct if true, expose the feature attributes as top-level vectors in the schema
    *                      root instead of a single struct vector
    * @return simple feature arrow file writer
    */
  def apply(
      os: OutputStream,
      sft: SimpleFeatureType,
      dictionaries: Map[String, ArrowDictionary],
      encoding: SimpleFeatureEncoding,
      ipcOpts: IpcOption,
      sort: Option[(String, Boolean)],
      flattenStruct: Boolean = false): SimpleFeatureArrowFileWriter = {
    val vector = SimpleFeatureVector.create(sft, dictionaries, encoding)
    // convert the dictionary values into arrow vectors
    // make sure we load dictionaries before instantiating the stream writer
    val provider: DictionaryProvider with Closeable = new DictionaryProvider with Closeable {
      private val dictionaries = vector.dictionaries.collect { case (_, d) => d.id -> d.toDictionary(vector.encoding) }
      override def lookup(id: Long): Dictionary = dictionaries(id)
      override def getDictionaryIds: java.util.Set[java.lang.Long] = dictionaries.keys.map(Long.box).toSet.asJava
      override def close(): Unit = CloseWithLogging(dictionaries.values)
    }
    new SimpleFeatureArrowFileWriter(vector, provider, os, ipcOpts, sort, flattenStruct)
  }

  /**
    * Creates a dictionary provider by converting the dictionary values into Arrow dictionary vectors.
    * The provider owns the resulting vectors and releases them when closed. A usage sketch follows
    * the method.
    *
    * @param dictionaries map of field names to dictionary values
    * @param encoding encoding options
    * @return closeable dictionary provider
    */
  def provider(
      dictionaries: Map[String, ArrowDictionary],
      encoding: SimpleFeatureEncoding): DictionaryProvider with Closeable = {
    new DictionaryProvider with Closeable {
      private val dicts = dictionaries.collect { case (_, d) => d.id -> d.toDictionary(encoding) }
      override def lookup(id: Long): Dictionary = dicts(id)
      override def getDictionaryIds: java.util.Set[java.lang.Long] = dicts.keys.map(Long.box).toSet.asJava
      override def close(): Unit = CloseWithLogging(dicts.values)
    }
  }
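
  // Sketch (not part of the original source): the standalone provider can back a raw
  // ArrowStreamWriter. `dictionaries`, `encoding`, `root`, `channel` and `ipcOpts` are
  // placeholders supplied by the caller; the provider owns the dictionary vectors, so it is
  // closed together with the stream writer once all batches have been written.
  //
  //   val dictProvider = SimpleFeatureArrowFileWriter.provider(dictionaries, encoding)
  //   val streamWriter = new ArrowStreamWriter(root, dictProvider, channel, ipcOpts)
  //   try {
  //     streamWriter.start() // writes the schema and dictionary batches
  //     // ... populate `root` and call streamWriter.writeBatch() ...
  //     streamWriter.end()
  //   } finally {
  //     CloseWithLogging.raise(Seq(streamWriter, dictProvider))
  //   }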
}
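
// Verification sketch (not part of the original source): because the writer emits the Arrow
// streaming format, the output can be read back with Arrow's standard ArrowStreamReader. `bytes`
// is assumed to hold the output of a closed SimpleFeatureArrowFileWriter.
//
//   import org.apache.arrow.memory.RootAllocator
//   import org.apache.arrow.vector.ipc.ArrowStreamReader
//   import java.io.ByteArrayInputStream
//
//   val allocator = new RootAllocator()
//   val reader = new ArrowStreamReader(new ByteArrayInputStream(bytes), allocator)
//   try {
//     while (reader.loadNextBatch()) {
//       println(s"read a batch with ${reader.getVectorSchemaRoot.getRowCount} rows")
//     }
//   } finally {
//     reader.close()
//     allocator.close()
//   }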



