/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.csv;

import static org.apache.commons.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TreeMap;

/**
 * Parses CSV files according to the specified format.
 *
 * Because CSV appears in many different dialects, the parser supports many formats by allowing the
 * specification of a {@link CSVFormat}.
 *
 * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream.
 *
 *
 * <h2>Creating instances</h2>
 * <p>
 * There are several static factory methods that can be used to create instances for various types of resources:
 * </p>
 * <ul>
 *     <li>{@link #parse(java.io.File, Charset, CSVFormat)}</li>
 *     <li>{@link #parse(String, CSVFormat)}</li>
 *     <li>{@link #parse(java.net.URL, java.nio.charset.Charset, CSVFormat)}</li>
 * </ul>
 * <p>
 * Alternatively, parsers can also be created by passing a {@link Reader} directly to the sole constructor.
 * </p>
 * <p>
 * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut:
 * </p>
 * <pre>
 * for (CSVRecord record : CSVFormat.EXCEL.parse(in)) {
 *     ...
 * }
 * </pre>
 *
 * <h2>Parsing record wise</h2>
 * <p>
 * To parse a CSV input from a file, you write:
 * </p>
 * <pre>
 * File csvData = new File("/path/to/csv");
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.RFC4180);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 * <p>
 * This will read and parse the contents of the file using the RFC 4180 format.
 * </p>
 * <p>
 * To parse CSV input in a format like Excel, you write:
 * </p>
 * <pre>
 * CSVParser parser = CSVParser.parse(csvData, Charset.defaultCharset(), CSVFormat.EXCEL);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 * <p>
 * If the predefined formats don't match the format at hand, custom formats can be defined. More information about
 * customising CSVFormats is available in the {@link CSVFormat CSVFormat Javadoc}.
 * </p>
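 * <p>
 * A minimal sketch of a custom format for semicolon-delimited input whose first record is a header row; the names
 * {@code csvFile} and {@code custom} and the UTF-8 charset are placeholders, not part of this class:
 * </p>
 * <pre>
 * CSVFormat custom = CSVFormat.DEFAULT.withDelimiter(';').withFirstRecordAsHeader();
 * CSVParser parser = CSVParser.parse(csvFile, StandardCharsets.UTF_8, custom); // csvFile: the File to read
 * </pre>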

 *
 * <h2>Parsing into memory</h2>
 * <p>
 * If parsing record wise is not desired, the contents of the input can be read completely into memory.
 * </p>
 * <pre>
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
 * List<CSVRecord> list = parser.getRecords();
 * </pre>
 * <p>
 * There are two constraints that have to be kept in mind:
 * </p>
 * <ol>
 *     <li>Parsing into memory starts at the current position of the parser. If you have already parsed records from
 *     the input, those records will not end up in the in-memory representation of your CSV data (see the sketch
 *     after this list).</li>
 *     <li>Parsing into memory may consume a lot of system resources depending on the input. For example, if you're
 *     parsing a 150MB file of CSV data, the contents will be read completely into memory.</li>
 * </ol>
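 * <p>
 * A minimal sketch of the first constraint; the input literal is only an illustration:
 * </p>
 * <pre>
 * CSVParser parser = CSVParser.parse("a,b\nc,d\ne,f", CSVFormat.DEFAULT);
 * CSVRecord first = parser.iterator().next(); // consumes the record ["a", "b"]
 * List<CSVRecord> rest = parser.getRecords(); // only ["c", "d"] and ["e", "f"] remain
 * </pre>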

 *
 * <h2>Notes</h2>
 * <p>
 * Internal parser state is completely covered by the format and the reader-state.
 * </p>
 *
 * @see <a href="package-summary.html">package documentation</a> for more details
 */
public final class CSVParser implements Iterable<CSVRecord>, Closeable {

    class CSVRecordIterator implements Iterator<CSVRecord> {
        private CSVRecord current;

        private CSVRecord getNextRecord() {
            try {
                return CSVParser.this.nextRecord();
            } catch (final IOException e) {
                throw new IllegalStateException(
                        e.getClass().getSimpleName() + " reading next record: " + e.toString(), e);
            }
        }

        @Override
        public boolean hasNext() {
            if (CSVParser.this.isClosed()) {
                return false;
            }
            if (this.current == null) {
                this.current = this.getNextRecord();
            }
            return this.current != null;
        }

        @Override
        public CSVRecord next() {
            if (CSVParser.this.isClosed()) {
                throw new NoSuchElementException("CSVParser has been closed");
            }
            CSVRecord next = this.current;
            this.current = null;
            if (next == null) {
                // hasNext() wasn't called before
                next = this.getNextRecord();
                if (next == null) {
                    throw new NoSuchElementException("No more CSV records available");
                }
            }
            return next;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Creates a parser for the given {@link File}.
     *
     * @param file
     *            a CSV file. Must not be null.
     * @param charset
     *            A Charset
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(file, "file");
        Assertions.notNull(format, "format");
        return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format);
    }

    /**
     * Creates a CSV parser using the given {@link CSVFormat}.
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
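     * <p>
     * A minimal sketch using try-with-resources, so the parser is closed even if not all records are read;
     * {@code inputStream} and the UTF-8 charset are placeholders for the caller's input:
     * </p>
     * <pre>
     * try (CSVParser parser = CSVParser.parse(inputStream, StandardCharsets.UTF_8, CSVFormat.DEFAULT)) {
     *     for (CSVRecord record : parser) {
     *         ...
     *     }
     * }
     * </pre>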

     *
     * @param inputStream
     *            an InputStream containing CSV-formatted input. Must not be null.
     * @param charset
     *            a Charset.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new CSVParser configured with the given reader and format.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @since 1.5
     */
    @SuppressWarnings("resource")
    public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format)
            throws IOException {
        Assertions.notNull(inputStream, "inputStream");
        Assertions.notNull(format, "format");
        return parse(new InputStreamReader(inputStream, charset), format);
    }

    /**
     * Creates a parser for the given {@link Path}.
     *
     * @param path
     *            a CSV file. Must not be null.
     * @param charset
     *            A Charset
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file or format are null.
     * @throws IOException
     *             If an I/O error occurs
     * @since 1.5
     */
    public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(path, "path");
        Assertions.notNull(format, "format");
        return parse(Files.newInputStream(path), charset, format);
    }

    /**
     * Creates a CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new CSVParser configured with the given reader and format.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @since 1.5
     */
    public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException {
        return new CSVParser(reader, format);
    }

    /**
     * Creates a parser for the given {@link String}.
     *
     * @param string
     *            a CSV string. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either string or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final String string, final CSVFormat format) throws IOException {
        Assertions.notNull(string, "string");
        Assertions.notNull(format, "format");
        return new CSVParser(new StringReader(string), format);
    }

    // the following objects are shared to reduce garbage

    /**
     * Creates a parser for the given URL.
     *
     * <p>
     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser,
     * unless you close the {@code url}.
     * </p>
     *
     * @param url
     *            a URL. Must not be null.
     * @param charset
     *            the charset for the resource. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException {
        Assertions.notNull(url, "url");
        Assertions.notNull(charset, "charset");
        Assertions.notNull(format, "format");
        return new CSVParser(new InputStreamReader(url.openStream(), charset), format);
    }

    private final CSVFormat format;

    /** A mapping of column names to column indices */
    private final Map<String, Integer> headerMap;

    /** The column order to avoid re-computing it. */
    private final List<String> headerNames;

    private final Lexer lexer;

    private final CSVRecordIterator csvRecordIterator;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> recordList = new ArrayList<>();

    /**
     * The next record number to assign.
     */
    private long recordNumber;

    /**
     * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination
     * with {@link #recordNumber}.
     */
    private final long characterOffset;

    private final Token reusableToken = new Token();

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     */
    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
        this(reader, format, 0, 1);
    }

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @param characterOffset
     *            Lexer offset when the parser does not start parsing at the beginning of the source.
     * @param recordNumber
     *            The next record number to assign
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If there is a problem reading the header or skipping the first record
     * @since 1.1
     */
    @SuppressWarnings("resource")
    public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
            throws IOException {
        Assertions.notNull(reader, "reader");
        Assertions.notNull(format, "format");

        this.format = format;
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
        this.csvRecordIterator = new CSVRecordIterator();
        final Headers headers = createHeaders();
        this.headerMap = headers.headerMap;
        this.headerNames = headers.headerNames;
        this.characterOffset = characterOffset;
        this.recordNumber = recordNumber - 1;
    }

    private void addRecordValue(final boolean lastRecord) {
        final String input = this.reusableToken.content.toString();
        final String inputClean = this.format.getTrim() ? input.trim() : input;
        if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) {
            return;
        }
        final String nullString = this.format.getNullString();
        this.recordList.add(inputClean.equals(nullString) ? null : inputClean);
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        if (this.lexer != null) {
            this.lexer.close();
        }
    }

    private Map<String, Integer> createEmptyHeaderMap() {
        return this.format.getIgnoreHeaderCase() ?
                new TreeMap<>(String.CASE_INSENSITIVE_ORDER) :
                new LinkedHashMap<>();
    }

    /**
     * Header information based on name and position.
     */
    private static final class Headers {
        /**
         * Header column positions (0-based)
         */
        final Map<String, Integer> headerMap;

        /**
         * Header names in column order
         */
        final List<String> headerNames;

        Headers(final Map<String, Integer> headerMap, final List<String> headerNames) {
            this.headerMap = headerMap;
            this.headerNames = headerNames;
        }
    }

    /**
     * Creates the name to index mapping if the format defines a header.
     *
     * @return null if the format has no header.
     * @throws IOException if there is a problem reading the header or skipping the first record
     */
    private Headers createHeaders() throws IOException {
        Map<String, Integer> hdrMap = null;
        List<String> headerNames = null;
        final String[] formatHeader = this.format.getHeader();
        if (formatHeader != null) {
            hdrMap = createEmptyHeaderMap();
            String[] headerRecord = null;
            if (formatHeader.length == 0) {
                // read the header from the first line of the file
                final CSVRecord nextRecord = this.nextRecord();
                if (nextRecord != null) {
                    headerRecord = nextRecord.values();
                }
            } else {
                if (this.format.getSkipHeaderRecord()) {
                    this.nextRecord();
                }
                headerRecord = formatHeader;
            }

            // build the name to index mappings
            if (headerRecord != null) {
                for (int i = 0; i < headerRecord.length; i++) {
                    final String header = headerRecord[i];
                    final boolean emptyHeader = header == null || header.trim().isEmpty();
                    if (emptyHeader && !this.format.getAllowMissingColumnNames()) {
                        throw new IllegalArgumentException(
                                "A header name is missing in " + Arrays.toString(headerRecord));
                    }
                    // Note: This will always allow a duplicate header if the header is empty
                    final boolean containsHeader = header != null && hdrMap.containsKey(header);
                    if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) {
                        throw new IllegalArgumentException(String.format(
                                "The header contains a duplicate name: \"%s\" in %s. If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().",
                                header, Arrays.toString(headerRecord)));
                    }
                    if (header != null) {
                        hdrMap.put(header, Integer.valueOf(i));
                        if (headerNames == null) {
                            headerNames = new ArrayList<>(headerRecord.length);
                        }
                        headerNames.add(header);
                    }
                }
            }
        }
        if (headerNames == null) {
            headerNames = Collections.emptyList(); // immutable
        } else {
            headerNames = Collections.unmodifiableList(headerNames);
        }
        return new Headers(hdrMap, headerNames);
    }

    /**
     * Returns the current line number in the input stream.
     *
     * <p>
     * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to
     * the record number.
     * </p>
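     * <p>
     * A minimal sketch of the difference, assuming a quoted value that spans two input lines; the input literal is
     * only an illustration:
     * </p>
     * <pre>
     * CSVParser parser = CSVParser.parse("a,\"multi\nline\"\nb,c", CSVFormat.DEFAULT);
     * parser.iterator().next();                    // first record; its quoted value spans two lines
     * long records = parser.getRecordNumber();     // 1
     * long lines = parser.getCurrentLineNumber();  // already ahead of the record number
     * </pre>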

     *
     * @return current line number
     */
    public long getCurrentLineNumber() {
        return this.lexer.getCurrentLineNumber();
    }

    /**
     * Gets the first end-of-line string encountered.
     *
     * @return the first end-of-line string
     * @since 1.5
     */
    public String getFirstEndOfLine() {
        return lexer.getFirstEol();
    }

    /**
     * Returns a copy of the header map.
     * <p>
     * The map keys are column names. The map values are 0-based indices.
     * </p>
     * <p>
     * Note: The map can only provide a one-to-one mapping when the format did not
     * contain null or duplicate column names.
     * </p>
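     * <p>
     * A minimal sketch, assuming the parser was created with {@code CSVFormat.DEFAULT.withFirstRecordAsHeader()}
     * and the input {@code "name,age\nalice,30"}:
     * </p>
     * <pre>
     * Map<String, Integer> header = parser.getHeaderMap();
     * header.get("name"); // 0
     * header.get("age");  // 1
     * </pre>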

     *
     * @return a copy of the header map.
     */
    public Map<String, Integer> getHeaderMap() {
        if (this.headerMap == null) {
            return null;
        }
        final Map<String, Integer> map = createEmptyHeaderMap();
        map.putAll(this.headerMap);
        return map;
    }

    /**
     * Returns the header map.
     *
     * @return the header map.
     */
    Map<String, Integer> getHeaderMapRaw() {
        return this.headerMap;
    }

    /**
     * Returns a read-only list of header names that iterates in column order.
     * <p>
     * Note: The list provides strings that can be used as keys in the header map.
     * The list will not contain null column names if they were present in the input format.
     * </p>
     *
     * @return read-only list of header names that iterates in column order.
     * @see #getHeaderMap()
     * @since 1.7
     */
    public List<String> getHeaderNames() {
        return headerNames;
    }

    /**
     * Returns the current record number in the input stream.
     *
     * <p>
     * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to
     * the line number.
     * </p>
     *
     * @return current record number
     */
    public long getRecordNumber() {
        return this.recordNumber;
    }

    /**
     * Parses the CSV input according to the given format and returns the content as a list of
     * {@link CSVRecord CSVRecords}.
     *
     * <p>
     * The returned content starts at the current parse-position in the stream.
     * </p>
     *
     * @return list of {@link CSVRecord CSVRecords}, may be empty
     * @throws IOException
     *             on parse error or input read-failure
     */
    public List<CSVRecord> getRecords() throws IOException {
        CSVRecord rec;
        final List<CSVRecord> records = new ArrayList<>();
        while ((rec = this.nextRecord()) != null) {
            records.add(rec);
        }
        return records;
    }

    /**
     * Gets whether this parser is closed.
     *
     * @return whether this parser is closed.
     */
    public boolean isClosed() {
        return this.lexer.isClosed();
    }

    /**
     * Returns an iterator on the records.
     *
     * <p>
     * An {@link IOException} caught during the iteration is re-thrown as an
     * {@link IllegalStateException}.
     * </p>
     * <p>
     * If the parser is closed, a call to {@link Iterator#next()} will throw a
     * {@link NoSuchElementException}.
     * </p>
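     * <p>
     * A minimal sketch of handling that wrapped exception; {@code parser} is a placeholder for an open parser, and
     * the original {@link IOException} is available as the cause:
     * </p>
     * <pre>
     * try {
     *     for (CSVRecord record : parser) {
     *         ...
     *     }
     * } catch (IllegalStateException e) {
     *     // e.getCause() is the IOException thrown while reading the input
     * }
     * </pre>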

     */
    @Override
    public Iterator<CSVRecord> iterator() {
        return csvRecordIterator;
    }

    /**
     * Parses the next record from the current point in the stream.
     *
     * @return the record as an array of values, or {@code null} if the end of the stream has been reached
     * @throws IOException
     *             on parse error or input read-failure
     */
    CSVRecord nextRecord() throws IOException {
        CSVRecord result = null;
        this.recordList.clear();
        StringBuilder sb = null;
        final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset;
        do {
            this.reusableToken.reset();
            this.lexer.nextToken(this.reusableToken);
            switch (this.reusableToken.type) {
            case TOKEN:
                this.addRecordValue(false);
                break;
            case EORECORD:
                this.addRecordValue(true);
                break;
            case EOF:
                if (this.reusableToken.isReady) {
                    this.addRecordValue(true);
                }
                break;
            case INVALID:
                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
            case COMMENT: // Ignored currently
                if (sb == null) { // first comment for this record
                    sb = new StringBuilder();
                } else {
                    sb.append(Constants.LF);
                }
                sb.append(this.reusableToken.content);
                this.reusableToken.type = TOKEN; // Read another token
                break;
            default:
                throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type);
            }
        } while (this.reusableToken.type == TOKEN);

        if (!this.recordList.isEmpty()) {
            this.recordNumber++;
            final String comment = sb == null ? null : sb.toString();
            result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]), comment,
                    this.recordNumber, startCharPosition);
        }
        return result;
    }
}



