com.datastax.driver.core.ParseUtils Maven / Gradle / Ivy

Go to download
/*
 * Copyright DataStax, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datastax.driver.core;

import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;

/**
 * Simple utility class used to help parsing CQL values (mainly UDT and collection ones).
 *
 * <p>All members are static; the class cannot be instantiated.
 */
public abstract class ParseUtils {

  /** Valid ISO-8601 patterns for CQL timestamp literals, tried in order by {@link #parseDate}. */
  private static final String[] ISO_8601_PATTERNS =
      new String[] {
        "yyyy-MM-dd HH:mm",
        "yyyy-MM-dd HH:mm:ss",
        "yyyy-MM-dd HH:mmZ",
        "yyyy-MM-dd HH:mm:ssZ",
        "yyyy-MM-dd HH:mm:ss.SSS",
        "yyyy-MM-dd HH:mm:ss.SSSZ",
        "yyyy-MM-dd'T'HH:mm",
        "yyyy-MM-dd'T'HH:mmZ",
        "yyyy-MM-dd'T'HH:mm:ss",
        "yyyy-MM-dd'T'HH:mm:ssZ",
        "yyyy-MM-dd'T'HH:mm:ss.SSS",
        "yyyy-MM-dd'T'HH:mm:ss.SSSZ",
        "yyyy-MM-dd",
        "yyyy-MM-ddZ"
      };

  /**
   * Returns the index of the first character in toParse from idx that is not a "space".
   *
   * <p>If every character from {@code idx} to the end of the string is a space (or if {@code idx}
   * is already equal to the length of the string), the length of the string is returned.
   *
   * @param toParse the string to skip space on.
   * @param idx the index to start skipping space from.
   * @return the index of the first character in toParse from idx that is not a "space".
   */
  public static int skipSpaces(String toParse, int idx) {
    // The bound check must come first, otherwise charAt() overruns the string when the input is
    // blank all the way to its end.
    while (idx < toParse.length() && isBlank(toParse.charAt(idx))) ++idx;
    return idx;
  }

  /**
   * Assuming that idx points to the beginning of a CQL value in toParse, returns the index of the
   * first character after this value.
   *
   * <p>Quoted strings (with {@code ''} as an escaped quote) and nested curly brackets, square
   * brackets and parentheses are skipped as a whole. A closing bracket that has no matching
   * opening bracket within the value ends the value, and its own index is returned.
   *
   * @param toParse the string to skip a value from.
   * @param idx the index to start parsing a value from.
   * @return the index ending the CQL value starting at {@code idx}.
   * @throws IllegalArgumentException if idx doesn't point to the start of a valid CQL value.
   */
  public static int skipCQLValue(String toParse, int idx) {
    if (idx >= toParse.length()) throw new IllegalArgumentException();

    if (isBlank(toParse.charAt(idx))) throw new IllegalArgumentException();

    // Nesting depth of each kind of bracket currently open.
    int cbrackets = 0;
    int sbrackets = 0;
    int parens = 0;
    boolean inString = false;

    do {
      char c = toParse.charAt(idx);
      if (inString) {
        if (c == '\'') {
          if (idx + 1 < toParse.length() && toParse.charAt(idx + 1) == '\'') {
            ++idx; // this is an escaped quote, skip it
          } else {
            inString = false;
            if (cbrackets == 0 && sbrackets == 0 && parens == 0) return idx + 1;
          }
        }
        // Skip any other character
      } else if (c == '\'') {
        inString = true;
      } else if (c == '{') {
        ++cbrackets;
      } else if (c == '[') {
        ++sbrackets;
      } else if (c == '(') {
        ++parens;
      } else if (c == '}') {
        // An unmatched closing bracket belongs to an enclosing value: stop right before it.
        if (cbrackets == 0) return idx;

        --cbrackets;
        if (cbrackets == 0 && sbrackets == 0 && parens == 0) return idx + 1;
      } else if (c == ']') {
        if (sbrackets == 0) return idx;

        --sbrackets;
        if (cbrackets == 0 && sbrackets == 0 && parens == 0) return idx + 1;
      } else if (c == ')') {
        if (parens == 0) return idx;

        --parens;
        if (cbrackets == 0 && sbrackets == 0 && parens == 0) return idx + 1;
      } else if (isBlank(c) || !isIdentifierChar(c)) {
        // Outside of any bracket, a blank or a non-identifier character terminates the value.
        if (cbrackets == 0 && sbrackets == 0 && parens == 0) return idx;
      }
    } while (++idx < toParse.length());

    // End of input reached: anything still open means the value was truncated.
    if (inString || cbrackets != 0 || sbrackets != 0 || parens != 0)
      throw new IllegalArgumentException();
    return idx;
  }

  /**
   * Assuming that idx points to the beginning of a CQL identifier in toParse, returns the index of
   * the first character after this identifier.
   *
   * <p>Both unquoted identifiers and double-quoted identifiers (with {@code ""} as an escaped
   * double quote) are supported.
   *
   * @param toParse the string to skip an identifier from.
   * @param idx the index to start parsing an identifier from.
   * @return the index ending the CQL identifier starting at {@code idx}.
   * @throws IllegalArgumentException if idx doesn't point to the start of a valid CQL identifier.
   */
  public static int skipCQLId(String toParse, int idx) {
    if (idx >= toParse.length()) throw new IllegalArgumentException();

    char c = toParse.charAt(idx);
    if (isIdentifierChar(c)) {
      while (idx < toParse.length() && isIdentifierChar(toParse.charAt(idx))) idx++;
      return idx;
    }

    if (c != '"') throw new IllegalArgumentException();

    while (++idx < toParse.length()) {
      c = toParse.charAt(idx);
      if (c != '"') continue;

      if (idx + 1 < toParse.length() && toParse.charAt(idx + 1) == '\"')
        ++idx; // this is an escaped double quote, skip it
      else return idx + 1;
    }
    // The closing double quote was never found.
    throw new IllegalArgumentException();
  }

  /**
   * Return {@code true} if the given character is allowed in a CQL identifier, that is, if it is in
   * the range: {@code [0..9a..zA..Z-+._&]}.
   *
   * @param c The character to inspect.
   * @return {@code true} if the given character is allowed in a CQL identifier, {@code false}
   *     otherwise.
   */
  public static boolean isIdentifierChar(int c) {
    return (c >= '0' && c <= '9')
        || (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || c == '-'
        || c == '+'
        || c == '.'
        || c == '_'
        || c == '&';
  }

  /**
   * Return {@code true} if the given character is a valid whitespace character in CQL, that is, if
   * it is a regular space, a tabulation sign, or a new line sign.
   *
   * @param c The character to inspect.
   * @return {@code true} if the given character is a valid whitespace character, {@code false}
   *     otherwise.
   */
  public static boolean isBlank(int c) {
    return c == ' ' || c == '\t' || c == '\n';
  }

  /**
   * Check whether the given string corresponds to a valid CQL long literal. Long literals are
   * composed solely by digits, but can have an optional leading minus sign.
   *
   * <p>At least one digit is required: {@code null}, the empty string and a lone minus sign are
   * not long literals. No range check is performed.
   *
   * @param str The string to inspect.
   * @return {@code true} if the given string corresponds to a valid CQL integer literal, {@code
   *     false} otherwise.
   */
  public static boolean isLongLiteral(String str) {
    if (str == null || str.isEmpty()) return false;
    int length = str.length();
    int firstDigit = str.charAt(0) == '-' ? 1 : 0;
    // a sign with nothing after it is not a number
    if (firstDigit == length) return false;
    for (int i = firstDigit; i < length; i++) {
      char c = str.charAt(i);
      if (c < '0' || c > '9') return false;
    }
    return true;
  }

  /**
   * Return {@code true} if the given string is surrounded by single quotes, and {@code false}
   * otherwise.
   *
   * @param value The string to inspect.
   * @return {@code true} if the given string is surrounded by single quotes, and {@code false}
   *     otherwise.
   */
  public static boolean isQuoted(String value) {
    return isQuoted(value, '\'');
  }

  /**
   * Quote the given string; single quotes are escaped. If the given string is null, this method
   * returns a quoted empty string ({@code ''}).
   *
   * @param value The value to quote.
   * @return The quoted string.
   */
  public static String quote(String value) {
    return quote(value, '\'');
  }

  /**
   * Unquote the given string if it is quoted; single quotes are unescaped. If the given string is
   * not quoted, it is returned without any modification.
   *
   * @param value The string to unquote.
   * @return The unquoted string.
   */
  public static String unquote(String value) {
    return unquote(value, '\'');
  }

  /**
   * Return {@code true} if the given string is surrounded by double quotes, and {@code false}
   * otherwise.
   *
   * @param value The string to inspect.
   * @return {@code true} if the given string is surrounded by double quotes, and {@code false}
   *     otherwise.
   */
  public static boolean isDoubleQuoted(String value) {
    return isQuoted(value, '\"');
  }

  /**
   * Double quote the given string; double quotes are escaped. If the given string is null, this
   * method returns a quoted empty string ({@code ""}).
   *
   * @param value The value to double quote.
   * @return The double quoted string.
   */
  public static String doubleQuote(String value) {
    return quote(value, '"');
  }

  /**
   * Unquote the given string if it is double quoted; double quotes are unescaped. If the given
   * string is not double quoted, it is returned without any modification.
   *
   * @param value The string to un-double quote.
   * @return The un-double quoted string.
   */
  public static String unDoubleQuote(String value) {
    return unquote(value, '"');
  }

  /**
   * Parse the given string as a date, using one of the accepted ISO-8601 date patterns.
   *
   * <p>Patterns that do not carry a time zone are interpreted as UTC. A trailing {@code Z} and
   * trailing offsets written with a colon (such as {@code +01:00}) are accepted.
   *
   * <p>This method is adapted from Apache Commons {@code DateUtils.parseStrictly()} method (that is
   * used Cassandra side to parse date strings).
   *
   * @param str The string to parse.
   * @return The parsed date.
   * @throws ParseException If the given string is not a valid ISO-8601 date.
   * @see "'Working with timestamps' section of the CQL specification"
   */
  public static Date parseDate(String str) throws ParseException {
    String normalized = normalizeTimeZone(str);
    SimpleDateFormat parser = newStrictUtcParser();
    for (String parsePattern : ISO_8601_PATTERNS) {
      Date date = parseFully(parser, parsePattern, normalized);
      if (date != null) {
        return date;
      }
    }
    throw new ParseException("Unable to parse the date: " + normalized, -1);
  }

  /**
   * Parse the given string as a date, using the supplied date pattern.
   *
   * <p>If the pattern does not carry a time zone, the string is interpreted as UTC. A trailing
   * {@code Z} and trailing offsets written with a colon (such as {@code +01:00}) are rewritten to
   * the {@code +hhmm} form before parsing.
   *
   * <p>This method is adapted from Apache Commons {@code DateUtils.parseStrictly()} method (that is
   * used Cassandra side to parse date strings).
   *
   * @param str The string to parse.
   * @param pattern The {@link SimpleDateFormat} pattern to parse the string with.
   * @return The parsed date.
   * @throws ParseException If the given string cannot be parsed with the given pattern.
   * @see "'Working with timestamps' section of the CQL specification"
   */
  public static Date parseDate(String str, String pattern) throws ParseException {
    String normalized = normalizeTimeZone(str);
    Date date = parseFully(newStrictUtcParser(), pattern, normalized);
    if (date != null) {
      return date;
    }
    throw new ParseException("Unable to parse the date: " + normalized, -1);
  }

  /**
   * Parse the given string as a time, using the following time pattern: {@code
   * hh:mm:ss[.fffffffff]}.
   *
   * <p>This method is loosely based on {@code java.sql.Timestamp}.
   *
   * <p>Note that, depending on which part of the input is invalid, the failure is reported either
   * as a {@link ParseException} or as an {@link IllegalArgumentException}; callers that need to
   * handle every malformed input must be prepared for both.
   *
   * @param str The string to parse.
   * @return A long value representing the number of nanoseconds since midnight.
   * @throws ParseException if the string does not have two colons in valid positions, if it ends
   *     with a period, or if it has no fractional part and its seconds are out of bounds.
   * @throws IllegalArgumentException if the string is null; if the hours or minutes are out of
   *     bounds; if there is a fractional part and the seconds are out of bounds; if the fractional
   *     part is longer than 9 characters or does not start with a digit; or (as a {@link
   *     NumberFormatException}) if one of the fields is not a valid integer.
   * @see "'Working with time' section of the CQL specification"
   */
  public static long parseTime(String str) throws ParseException {
    String formatError = "Timestamp format must be hh:mm:ss[.fffffffff]";
    String zeros = "000000000";

    if (str == null) throw new IllegalArgumentException(formatError);
    str = str.trim();

    // Locate the two field separators
    int firstColon = str.indexOf(':');
    int secondColon = str.indexOf(':', firstColon + 1);
    if (firstColon <= 0 || secondColon <= 0 || secondColon >= str.length() - 1)
      throw new ParseException(formatError, -1);

    int period = str.indexOf('.', secondColon + 1);

    long hour = Integer.parseInt(str.substring(0, firstColon));
    if (hour < 0 || hour >= 24) throw new IllegalArgumentException("Hour out of bounds.");

    long minute = Integer.parseInt(str.substring(firstColon + 1, secondColon));
    if (minute < 0 || minute >= 60) throw new IllegalArgumentException("Minute out of bounds.");

    long second;
    long nanos = 0; // default when there is no fractional part
    if (period > 0 && period < str.length() - 1) {
      second = Integer.parseInt(str.substring(secondColon + 1, period));
      if (second < 0 || second >= 60) throw new IllegalArgumentException("Second out of bounds.");

      String fraction = str.substring(period + 1);
      if (fraction.length() > 9) throw new IllegalArgumentException(formatError);
      if (!Character.isDigit(fraction.charAt(0))) throw new IllegalArgumentException(formatError);
      // right-pad to 9 digits so that ".5" means 500,000,000 nanoseconds
      fraction = fraction + zeros.substring(0, 9 - fraction.length());
      nanos = Integer.parseInt(fraction);
    } else if (period > 0) {
      // the period is the last character: there is no fractional part after it
      throw new ParseException(formatError, -1);
    } else {
      second = Integer.parseInt(str.substring(secondColon + 1));
      if (second < 0 || second >= 60) throw new ParseException("Second out of bounds.", -1);
    }

    long rawTime = 0;
    rawTime += TimeUnit.HOURS.toNanos(hour);
    rawTime += TimeUnit.MINUTES.toNanos(minute);
    rawTime += TimeUnit.SECONDS.toNanos(second);
    rawTime += nanos;
    return rawTime;
  }

  /**
   * Format the given long value as a CQL time literal, using the following time pattern: {@code
   * hh:mm:ss.fffffffff}. The fractional part is always written, with all of its 9 digits.
   *
   * <p>The output only contains ASCII digits, whatever the default locale is.
   *
   * @param value A long value representing the number of nanoseconds since midnight.
   * @return The formatted value.
   * @see "'Working with time' section of the CQL specification"
   */
  public static String formatTime(long value) {
    // Peel off each unit in turn; integer division discards what was just extracted.
    int nano = (int) (value % 1000000000);
    value /= 1000000000;
    int seconds = (int) (value % 60);
    value /= 60;
    int minutes = (int) (value % 60);
    value /= 60;
    int hours = (int) (value % 24);
    value /= 24;
    // anything left over means the value did not fit in a single day
    assert (value == 0);
    StringBuilder sb = new StringBuilder();
    leftPadZeros(hours, 2, sb);
    sb.append(":");
    leftPadZeros(minutes, 2, sb);
    sb.append(":");
    leftPadZeros(seconds, 2, sb);
    sb.append(".");
    leftPadZeros(nano, 9, sb);
    return sb.toString();
  }

  /**
   * Rewrites the trailing time zone designator of a date string, if any, into the {@code +hhmm}
   * form understood by the {@code Z} pattern letter of {@link SimpleDateFormat}.
   *
   * @param str The date string.
   * @return The date string with a trailing {@code +hh:mm} / {@code -hh:mm} offset stripped of its
   *     colon, and a trailing {@code Z} replaced by {@code +0000}.
   */
  private static String normalizeTimeZone(String str) {
    // Java 6 has very limited support for ISO-8601 time zone formats,
    // so we need to transform the string first
    // so that accepted patterns are correctly handled,
    // such as Z for UTC, or "+00:00" instead of "+0000".
    // Note: we cannot use the X letter in the pattern
    // because it has been introduced in Java 7.
    str = str.replaceAll("(\\+|\\-)(\\d\\d):(\\d\\d)$", "$1$2$3");
    return str.replaceAll("Z$", "+0000");
  }

  /**
   * Creates a new, non-lenient date parser whose default time zone is UTC. A fresh instance is
   * created on each call because {@link SimpleDateFormat} is not thread-safe.
   *
   * @return The parser; a pattern must be applied to it before use.
   */
  private static SimpleDateFormat newStrictUtcParser() {
    SimpleDateFormat parser = new SimpleDateFormat();
    parser.setLenient(false);
    // set a default timezone for patterns that do not provide one
    parser.setTimeZone(TimeZone.getTimeZone("UTC"));
    return parser;
  }

  /**
   * Parses the given string with the given pattern, requiring the whole string to be consumed.
   *
   * @param parser The parser to use; its pattern is replaced by {@code pattern}.
   * @param pattern The pattern to apply.
   * @param str The string to parse.
   * @return The parsed date, or {@code null} if the string does not match the pattern or has
   *     trailing characters after the match.
   */
  private static Date parseFully(SimpleDateFormat parser, String pattern, String str) {
    parser.applyPattern(pattern);
    ParsePosition pos = new ParsePosition(0);
    Date date = parser.parse(str, pos);
    return (date != null && pos.getIndex() == str.length()) ? date : null;
  }

  /**
   * Return {@code true} if the given string is surrounded by the quote character given, and {@code
   * false} otherwise.
   *
   * @param value The string to inspect.
   * @param quoteChar The quote character to look for.
   * @return {@code true} if the given string is surrounded by the quote character, and {@code
   *     false} otherwise.
   */
  private static boolean isQuoted(String value, char quoteChar) {
    return value != null
        && value.length() > 1
        && value.charAt(0) == quoteChar
        && value.charAt(value.length() - 1) == quoteChar;
  }

  /**
   * @param quoteChar " or '
   * @return A quoted empty string.
   */
  private static String emptyQuoted(char quoteChar) {
    // don't handle non quote characters, this is done so that these are interned and don't create
    // repeated empty quoted strings.
    assert quoteChar == '"' || quoteChar == '\'';
    if (quoteChar == '"') return "\"\"";
    else return "''";
  }

  /**
   * Quotes text and escapes any existing quotes in the text. {@code String.replace()} is a bit too
   * inefficient (see JAVA-67, JAVA-1262).
   *
   * @param text The text.
   * @param quoteChar The character to use as a quote.
   * @return The text surrounded in quotes, with all existing quotes escaped by doubling them (i.e.
   *     ' becomes '')
   */
  private static String quote(String text, char quoteChar) {
    if (text == null || text.isEmpty()) return emptyQuoted(quoteChar);

    // First pass: count the quotes to escape, so that the result can be sized exactly.
    int nbMatch = 0;
    int start = -1;
    do {
      start = text.indexOf(quoteChar, start + 1);
      if (start != -1) ++nbMatch;
    } while (start != -1);

    // no quotes found that need to be escaped, simply surround in quotes and return.
    if (nbMatch == 0) return quoteChar + text + quoteChar;

    // 2 for beginning and end quotes.
    // length for original text
    // nbMatch for escape characters to add to quotes to be escaped.
    int newLength = 2 + text.length() + nbMatch;
    char[] result = new char[newLength];
    result[0] = quoteChar;
    result[newLength - 1] = quoteChar;
    int newIdx = 1;
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      if (c == quoteChar) {
        // escape quote with another occurrence.
        result[newIdx++] = c;
        result[newIdx++] = c;
      } else {
        result[newIdx++] = c;
      }
    }
    return new String(result);
  }

  /**
   * Unquotes text and unescapes non surrounding quotes. {@code String.replace()} is a bit too
   * inefficient (see JAVA-67, JAVA-1262).
   *
   * @param text The text
   * @param quoteChar The character to use as a quote.
   * @return The text with surrounding quotes removed and non surrounding quotes unescaped (i.e. ''
   *     becomes '); or the text itself if it is not surrounded by quotes.
   */
  private static String unquote(String text, char quoteChar) {
    if (!isQuoted(text, quoteChar)) return text;

    if (text.length() == 2) return "";

    // First pass: count the escaped (doubled) quotes, so that the result can be sized exactly.
    String search = emptyQuoted(quoteChar);
    int nbMatch = 0;
    int start = -1;
    do {
      start = text.indexOf(search, start + 2);
      // ignore the second to last character occurrence, as the last character is a quote.
      if (start != -1 && start != text.length() - 2) ++nbMatch;
    } while (start != -1);

    // no escaped quotes found, simply remove surrounding quotes and return.
    if (nbMatch == 0) return text.substring(1, text.length() - 1);

    // length of the new string will be its current length - the number of occurrences.
    int newLength = text.length() - nbMatch - 2;
    char[] result = new char[newLength];
    int newIdx = 0;
    // track whenever a quoteChar is encountered and the previous character is not a quoteChar.
    boolean firstFound = false;
    for (int i = 1; i < text.length() - 1; i++) {
      char c = text.charAt(i);
      if (c == quoteChar) {
        if (firstFound) {
          // The previous character was a quoteChar, don't add this to result, this action in
          // effect removes consecutive quotes.
          firstFound = false;
        } else {
          // found a quoteChar and the previous character was not a quoteChar, include in result.
          firstFound = true;
          result[newIdx++] = c;
        }
      } else {
        // non quoteChar encountered, include in result.
        result[newIdx++] = c;
        firstFound = false;
      }
    }
    return new String(result);
  }

  /**
   * Appends the decimal representation of {@code value} to {@code sb}, left-padded with zeros so
   * that it is at least {@code digits} characters wide (the minus sign of a negative value counts
   * towards the width and is written before the padding).
   *
   * <p>{@code String.format()} is deliberately not used here: without an explicit locale it
   * localizes digits, which could produce non-ASCII output under some default locales.
   *
   * @param value The value to append.
   * @param digits The minimum width of the appended text.
   * @param sb The builder to append to.
   */
  private static void leftPadZeros(int value, int digits, StringBuilder sb) {
    // widen to long so that Integer.MIN_VALUE has an absolute value
    String magnitude = Long.toString(Math.abs((long) value));
    int width = digits;
    if (value < 0) {
      sb.append('-');
      width--;
    }
    for (int i = magnitude.length(); i < width; i++) sb.append('0');
    sb.append(magnitude);
  }

  private ParseUtils() {}
}