All downloads are free. Search and download functionality uses the official Maven repository.

com.google.doubleclick.util.impl.CSVParser Maven / Gradle / Ivy

/*
 * Copyright 2014 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.doubleclick.util.impl;

import com.google.common.base.Function;
import com.google.common.base.MoreObjects;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import javax.annotation.Nullable;

/**
 * CSV (comma-separated) and TSV (tab-separated) parser for internal use only.
 * Remove this if we find some alternative that's small, bug-free / well-maintained,
 * and has all required features (including some extensions we need).
 *
 * 

This parser is "record-oriented", it doesn't try to split a stream into records so this * will be done by the caller before invoking the parser. Unfortunately RFC-4180 supports * unescaped line breaks inside quoted fields, so a naive caller that just splits the stream * into records by looking at line breaks will fail to preserve the "internal line breaks". * In principle the caller can have the intelligence to split records correctly, but this * would ideally be implemented as part of the parser with a stream-oriented API. * *

WARNING: This class is public but it's an internal utility, not a supported API. */ public class CSVParser { public static final char EOT = (char) 0x03; public static final char NUL = (char) 0x00; private static final CSVParser TSV_PARSER = new CSVParser('\t', NUL, NUL, "", false); private static final CSVParser CSV_PARSER = new CSVParser(',', '"', NUL, "", false); final char separator; final char quote; final char escape; final String empty; final boolean trim; /** * Creates a CSV parser (or TSV, but let's not get picky about naming). * * @param separator Separator. Normally comma (',' / 0x2C) for CSV, or tab ('\t' / 0x09) for TSV. * @param quote Quote. Normally the double-quote ('"', 0x22). * @param escape Escape. Non-RFC extension, allows escaping individual characters inside quoted * or unquoted fields. Defaults to NUL (no support for escaping), a popular choice is '\'. * @param empty Empty value. Any absent field will be replaced by this value. Only a zero-char * field is considered absent; a quoted empty field ("") is not, so you can differentiate * between "no value at all" and "empty string value". The normal value for RFC-compliant CSV * or TSP parsing is the empty string, which causes no distinction between empty and 0-length. * @param trim If {@code true}, trims whitespaces in the start or end of all fields. */ public CSVParser(char separator, char quote, char escape, @Nullable String empty, boolean trim) { this.separator = separator; this.quote = quote; this.escape = escape; this.empty = empty; this.trim = trim; } /** * Returns a RFC 4180-compliant CSV parser. */ public static CSVParser csvParser() { return CSV_PARSER; } /** * Returns an IANA-standard TSV parser. 
*/ public static CSVParser tsvParser() { return TSV_PARSER; } public boolean parse(InputStream is, String regex, Function, Boolean> sink) throws IOException { return parse(is, Pattern.compile(regex), sink); } public boolean parse(InputStream is, Pattern pattern, Function, Boolean> sink) throws IOException { BufferedReader rd = new BufferedReader(new InputStreamReader(is)); String record; while ((record = rd.readLine()) != null) { if (pattern.matcher(record).matches()) { try { if (!sink.apply(parse(record))) { return false; } } catch (ParseException e) { //logger.trace("Bad record: [{}]: {}", record, e.toString()); } } } return true; } /** * Parses one line / record. */ public List parse(String line) throws ParseException { List cols = new ArrayList<>(); boolean afterQuote = false; boolean afterEscape = false; boolean afterSeparator = false; boolean outerQuote = false; boolean afterEndField = false; StringBuilder sb = new StringBuilder(); for (int i = 0; ; ++i) { char c = (i == line.length()) ? EOT : line.charAt(i); if (afterEndField && c != separator && c != EOT) { if (!trim || c != ' ') { throw new ParseException("Extraneous character after end of quoted field", i); } } else if (afterEscape) { if (c == EOT) { // [abc\^] throw new ParseException("Escape not followed by a character", i); } else { // [abc\x...] => abcx... afterEscape = false; sb.append(c); } } else if (c == separator) { if (outerQuote && !afterQuote) { // ["abc,...] => abc,... sb.append(c); afterSeparator = false; } else { // [abc,...] => {abc, ...} endCol(cols, sb, i, outerQuote, afterQuote); afterQuote = afterEscape = afterEndField = outerQuote = false; afterSeparator = true; } } else if (c == EOT) { if (sb.length() != 0 || afterSeparator || outerQuote) { // [...,abc^] => {..., abc} // [...,^] => {..., ""} endCol(cols, sb, i, outerQuote, afterQuote); } return cols; } else if (c == escape) { // [...\...] 
afterEscape = true; afterSeparator = false; } else if (c == quote) { if (afterQuote && outerQuote) { // Two consecutive quotes inside quoted string, so the pair has to be internal // (the second quote cannot be terminating the field). sb.append(quote); afterQuote = false; } else if (sb.length() == 0 && !outerQuote) { outerQuote = true; } else if (sb.length() != 0 && !outerQuote) { // Fields that are not quote-delimited cannot have any internal quotes, // unless they are escaped which was already handled. throw new ParseException(escape == NUL ? "Unescaped quote inside non-quote-delimited field" : "Quote inside non-quote-delimited field", i); } else { afterQuote = true; } afterSeparator = false; } else if (c == ' ' && trim && ((!outerQuote && sb.length() == 0) // Trimmed space before field || afterEndField)) { // Trimmed space after field } else if (c == ' ' && trim && afterQuote) { afterEndField = true; } else if (outerQuote && afterQuote) { // ["abc"x] throw new ParseException("Extraneous character after end of quoted field", i); } else { // Common character. sb.append(c); afterSeparator = false; } } } protected void endCol( List cols, StringBuilder sb, int i, boolean outerQuote, boolean afterQuote) throws ParseException { if (outerQuote && !afterQuote) { throw new ParseException("Field starts with quote but ends unquoted", i); } if (trim && !outerQuote) { while (sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ') { sb.setLength(sb.length() - 1); } } // Drop trailing whitespace, if any; like: ["xyz" ,] or [xyz ,] cols.add(!outerQuote && sb.length() == 0 ? empty : sb.toString()); sb.setLength(0); } @Override public String toString() { return MoreObjects.toStringHelper(this).omitNullValues() .add("separator", separator == NUL ? null : "0x" + Integer.toHexString(separator)) .add("quote", quote == NUL ? null : "0x" + Integer.toHexString(quote)) .add("escape", escape == NUL ? 
null : "0x" + Integer.toHexString(escape)) .add("empty", String.valueOf(empty)) .add("trim", trim) .toString(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy