All downloads are free. The search and download features use the official Maven repository.

com.dimajix.spark.sql.local.csv.CsvRelation.scala Maven / Gradle / Ivy

/*
 * Copyright (C) 2018 The Flowman Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dimajix.spark.sql.local.csv

import java.io.IOException
import java.io.OutputStreamWriter
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.StandardOpenOption
import java.util.stream.Collectors

import scala.collection.JavaConverters._
import scala.io.Source

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Row
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.types.StructType

import com.dimajix.spark.sql.local.BaseRelation


/**
  * Relation which reads and writes CSV files on the local file system (via `java.nio.file`).
  *
  * Reading parses every configured path (regular files, or the regular files directly inside a
  * directory) into rows held in memory and wraps them into a DataFrame. Writing iterates over
  * the rows of a DataFrame locally and stores them in the first configured path.
  *
  * @param context the SQLContext used for creating DataFrames
  * @param files   the files and/or directories to read from; the first entry is the write target
  * @param options CSV options (encoding and header flag are used here, the rest is passed on to
  *                the Univocity reader/writer)
  * @param mschema the schema of the records
  */
class CsvRelation(context: SQLContext, files:Seq[Path], options:CsvOptions, mschema:StructType) extends BaseRelation {
    override def sqlContext: SQLContext = context

    override def schema: StructType = mschema

    /**
      * Reads all configured files and directories into a single DataFrame. All rows are parsed
      * eagerly and kept in memory before the DataFrame is created.
      *
      * @return a DataFrame containing the rows of all files in the order of `files`
      */
    override def read(): DataFrame = {
        val rows = files.flatMap { f =>
            if (Files.isDirectory(f))
                readDirectory(f)
            else
                readFile(f)
        }
        sqlContext.createDataFrame(rows.asJava, schema)
    }

    /**
      * Writes the given DataFrame as CSV into the first configured file.
      *
      *  - `Overwrite` replaces any existing file
      *  - `Append` adds the records to the end of an existing file (or creates the file)
      *  - `ErrorIfExists` fails with an IOException if the file already exists
      *  - `Ignore` silently does nothing if the file already exists
      *
      * @param df   the DataFrame whose rows are written
      * @param mode the save mode controlling how an existing file is treated
      * @throws IOException if the file exists in mode `ErrorIfExists` or on any I/O failure
      * @throws NoSuchElementException if no target file has been configured
      */
    override def write(df: DataFrame, mode: SaveMode): Unit = {
        val outputFile = files.headOption.getOrElse(
            throw new NoSuchElementException("Cannot write CSV relation without a target file")
        )
        val targetExists = Files.exists(outputFile)

        // None means that nothing has to be written at all (mode 'Ignore' with an existing file)
        val openOptions: Option[Seq[StandardOpenOption]] = mode match {
            case SaveMode.Overwrite =>
                Some(Seq(StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE))
            case SaveMode.Append =>
                Some(Seq(StandardOpenOption.CREATE, StandardOpenOption.APPEND, StandardOpenOption.WRITE))
            case SaveMode.ErrorIfExists =>
                if (targetExists)
                    throw new IOException(s"File '$outputFile' already exists")
                // CREATE_NEW makes the existence check atomic, so a file created concurrently is
                // never truncated
                Some(Seq(StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE))
            case SaveMode.Ignore =>
                if (targetExists)
                    None
                else
                    Some(Seq(StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE))
        }

        openOptions.foreach { opts =>
            // When appending to a file which already contains data, a second header line would
            // end up in the middle of the file, so the header is only written to fresh files
            val appendsToExistingData = mode == SaveMode.Append && targetExists && Files.size(outputFile) > 0
            writeFile(df, outputFile, opts, options.headerFlag && !appendsToExistingData)
        }
    }

    /**
      * Opens the target file with the given options and writes all rows of the DataFrame. All
      * resources are released in reverse order of creation, even if opening one of the later
      * ones or writing fails.
      */
    private def writeFile(df: DataFrame, outputFile: Path, openOptions: Seq[StandardOpenOption], withHeader: Boolean): Unit = {
        // getParent returns null for a bare relative file name - nothing to create in that case
        Option(outputFile.getParent).foreach(dir => Files.createDirectories(dir))

        val outputStream = Files.newOutputStream(outputFile, openOptions: _*)
        try {
            val outputWriter = new OutputStreamWriter(outputStream, options.encoding)
            try {
                val writer = new UnivocityWriter(schema, outputWriter, options)
                try {
                    if (withHeader) {
                        writer.writeHeader()
                    }

                    df.rdd.toLocalIterator.foreach(writer.writeRow)
                }
                finally {
                    writer.close()
                }
            }
            finally {
                outputWriter.close()
            }
        }
        finally {
            outputStream.close()
        }
    }

    /**
      * Parses a single CSV file into rows. The result is fully materialized before the file is
      * closed again.
      */
    private def readFile(file:Path) : Seq[Row] = {
        val input = Files.newInputStream(file, StandardOpenOption.READ)
        try {
            val source = Source.fromInputStream(input, options.encoding)
            try {
                val lines = source.getLines()
                val parser = new UnivocityReader(schema, options)
                UnivocityReader.parseIterator(lines, parser).toList
            }
            finally {
                source.close()
            }
        }
        finally {
            // Also covers the case that creating the Source itself failed (e.g. bad encoding)
            input.close()
        }
    }

    /**
      * Reads all regular files which are direct children of the given directory. Sub directories
      * and other non-regular entries are skipped.
      */
    private def readDirectory(directory:Path) : Seq[Row] = {
        // The stream returned by Files.list holds an open directory handle and has to be closed
        val listing = Files.list(directory)
        val entries =
            try {
                listing.collect(Collectors.toList[Path]).asScala.toList
            }
            finally {
                listing.close()
            }

        entries.flatMap { f =>
            if (Files.isRegularFile(f))
                readFile(f)
            else
                Seq.empty[Row]
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy