All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.spark.sql.catalyst.csv.CSVExprUtils.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.catalyst.csv

import org.apache.commons.lang3.StringUtils

object CSVExprUtils {
  /**
   * Filter ignorable rows for CSV iterator (lines empty and starting with `comment`).
   * This is currently being used in CSV reading path and CSV schema inference.
   */
  def filterCommentAndEmpty(iter: Iterator[String], options: CSVOptions): Iterator[String] = {
    iter.filter { line =>
      line.trim.nonEmpty && !line.startsWith(options.comment.toString)
    }
  }

  def skipComments(iter: Iterator[String], options: CSVOptions): Iterator[String] = {
    if (options.isCommentSet) {
      val commentPrefix = options.comment.toString
      iter.dropWhile { line =>
        line.trim.isEmpty || line.trim.startsWith(commentPrefix)
      }
    } else {
      iter.dropWhile(_.trim.isEmpty)
    }
  }

  /**
   * Extracts header and moves iterator forward so that only data remains in it
   */
  def extractHeader(iter: Iterator[String], options: CSVOptions): Option[String] = {
    val nonEmptyLines = skipComments(iter, options)
    if (nonEmptyLines.hasNext) {
      Some(nonEmptyLines.next())
    } else {
      None
    }
  }

  /**
   * Helper method that converts string representation of a character to actual character.
   * It handles some Java escaped strings and throws exception if given string is longer than one
   * character.
   */
  @throws[IllegalArgumentException]
  def toChar(str: String): Char = {
    (str: Seq[Char]) match {
      case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string")
      case Seq('\\') => throw new IllegalArgumentException("Single backslash is prohibited." +
        " It has special meaning as beginning of an escape sequence." +
        " To get the backslash character, pass a string with two backslashes as the delimiter.")
      case Seq(c) => c
      case Seq('\\', 't') => '\t'
      case Seq('\\', 'r') => '\r'
      case Seq('\\', 'b') => '\b'
      case Seq('\\', 'f') => '\f'
      // In case user changes quote char and uses \" as delimiter in options
      case Seq('\\', '\"') => '\"'
      case Seq('\\', '\'') => '\''
      case Seq('\\', '\\') => '\\'
      case _ if str == """\u0000""" => '\u0000'
      case Seq('\\', _) =>
        throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
      case _ =>
        throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
    }
  }

  /**
   * Helper method that converts string representation of a character sequence to actual
   * delimiter characters. The input is processed in "chunks", and each chunk is converted
   * by calling [[CSVExprUtils.toChar()]].  A chunk is either:
   * 
    *
  • a backslash followed by another character
  • *
  • a non-backslash character by itself
  • *
* , in that order of precedence. The result of the converting all chunks is returned as * a [[String]]. * *

Examples: *
  • `\t` will result in a single tab character as the separator (same as before) *
  • `|||` will result in a sequence of three pipe characters as the separator *
  • `\\` will result in a single backslash as the separator (same as before) *
  • `\.` will result in an error (since a dot is not a character that needs escaped) *
  • `\\.` will result in a backslash, then dot, as the separator character sequence *
  • `.\t.` will result in a dot, then tab, then dot as the separator character sequence *
  • *
* * @param str the string representing the sequence of separator characters * @return a [[String]] representing the multi-character delimiter * @throws IllegalArgumentException if any of the individual input chunks are illegal */ def toDelimiterStr(str: String): String = { var idx = 0 var delimiter = "" while (idx < str.length()) { // if the current character is a backslash, check it plus the next char // in order to use existing escape logic val readAhead = if (str(idx) == '\\') 2 else 1 // get the chunk of 1 or 2 input characters to convert to a single delimiter char val chunk = StringUtils.substring(str, idx, idx + readAhead) delimiter += toChar(chunk) // advance the counter by the length of input chunk processed idx += chunk.length() } delimiter.mkString("") } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy