All Downloads are FREE. Search and download functionality uses the official Maven repository.

leap.lang.csv.CSVParser Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package leap.lang.csv;

import static leap.lang.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

import leap.lang.Args;

/**
 * Parses CSV files according to the specified configuration.
 *
 * Because CSV appears in many different dialects, the parser supports many configuration settings by allowing the
 * specification of a {@link CSVFormat}.
 *
 * 

* To parse a CSV input with tabs as separators, '"' (double-quote) as an optional value encapsulator, and comments * starting with '#', you write: *

* *
 * Reader in = new StringReader("a\tb\nc\td");
 * Iterable<CSVRecord> parser = CSVFormat.DEFAULT
 *     .withCommentStart('#')
 *     .withDelimiter('\t')
 *     .withQuoteChar('"').parse(in);
 *  for (CSVRecord csvRecord : parse) {
 *     ...
 *  }
 * 
* *

* To parse CSV input in a given format like Excel, you write: *

* *
 * Reader in = new StringReader("a;b\nc;d");
 * Iterable<CSVRecord> parser = CSVFormat.EXCEL.parse(in);
 * for (CSVRecord record : parser) {
 *     ...
 * }
 * 
*

* You may also get a List of records: *

* *
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
 * List<CSVRecord> list = parser.getRecords();
 * 
*

* See also the various static parse methods on this class. *

*

* Internal parser state is completely covered by the format and the reader-state. *

* *

* see package documentation for more details *

* * @version $Id: CSVParser.java 1519269 2013-09-01 13:36:08Z britter $ */ final class CSVParser implements Iterable, Closeable { /** * Creates a parser for the given {@link File}. * * @param file * a CSV file. Must not be null. * @param format * the CSVFormat used for CSV parsing. Must not be null. * @return a new parser * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either file or format are null. * @throws IOException * If an I/O error occurs */ public static CSVParser parse(File file, final CSVFormat format) throws IOException { Args.notNull(file, "file"); Args.notNull(format, "format"); return new CSVParser(new FileReader(file), format); } /** * Creates a parser for the given {@link String}. * * @param string * a CSV string. Must not be null. * @param format * the CSVFormat used for CSV parsing. Must not be null. * @return a new parser * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either string or format are null. * @throws IOException * If an I/O error occurs */ public static CSVParser parse(String string, final CSVFormat format) throws IOException { Args.notNull(string, "string"); Args.notNull(format, "format"); return new CSVParser(new StringReader(string), format); } /** * Creates a parser for the given URL. * *

* If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless * you close the {@code url}. *

* * @param url * a URL. Must not be null. * @param charset * the charset for the resource. Must not be null. * @param format * the CSVFormat used for CSV parsing. Must not be null. * @return a new parser * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either url, charset or format are null. * @throws IOException * If an I/O error occurs */ public static CSVParser parse(URL url, Charset charset, final CSVFormat format) throws IOException { Args.notNull(url, "url"); Args.notNull(charset, "charset"); Args.notNull(format, "format"); return new CSVParser(new InputStreamReader(url.openStream(), charset == null ? Charset.forName("UTF-8") : charset), format); } // the following objects are shared to reduce garbage private final CSVFormat format; private final Map headerMap; private final Lexer lexer; /** A record buffer for getRecord(). Grows as necessary and is reused. */ private final List record = new ArrayList(); private boolean readComment = true; private String recordComment; private long recordNumber; private final Token reusableToken = new Token(); /** * Customized CSV parser using the given {@link CSVFormat} * *

* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, * unless you close the {@code reader}. *

* * @param reader * a Reader containing CSV-formatted input. Must not be null. * @param format * the CSVFormat used for CSV parsing. Must not be null. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either reader or format are null. * @throws IOException * If an I/O error occurs */ public CSVParser(final Reader reader, final CSVFormat format) throws IOException { Args.notNull(reader, "reader"); Args.notNull(format, "format"); format.validate(); this.format = format; this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); this.headerMap = this.initializeHeader(); } private void addRecordValue() { final String input = this.reusableToken.content.toString(); final String nullString = this.format.getNullString(); if (nullString == null) { this.record.add(input); } else { this.record.add(input.equalsIgnoreCase(nullString) ? null : input); } } /** * Closes resources. * * @throws IOException * If an I/O error occurs */ public void close() throws IOException { if (this.lexer != null) { this.lexer.close(); } } /** * Returns the current line number in the input stream. *

* ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the record number. * * @return current line number */ public long getCurrentLineNumber() { return this.lexer.getCurrentLineNumber(); } /** * Returns a copy of the header map that iterates in column order. *

* The map keys are column names. The map values are 0-based indices. * * @return a copy of the header map that iterates in column order. */ public Map getHeaderMap() { return new LinkedHashMap(this.headerMap); } /** * Returns the current record number in the input stream. *

* ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the line number. * * @return current line number */ public long getRecordNumber() { return this.recordNumber; } /** * Parses the CSV input according to the given format and returns the content as an array of {@link CSVRecord} * entries. *

* The returned content starts at the current parse-position in the stream. * * @return list of {@link CSVRecord} entries, may be empty * @throws IOException * on parse error or input read-failure */ public List getRecords() throws IOException { final List records = new ArrayList(); CSVRecord rec; while ((rec = this.nextRecord()) != null) { records.add(rec); } return records; } List getRecords1() throws IOException { final List records = new ArrayList(); String[] rec; while ((rec = this.nextRecord1()) != null) { records.add(rec); } return records; } /** * Initializes the name to index mapping if the format defines a header. */ private Map initializeHeader() throws IOException { Map hdrMap = null; final String[] formatHeader = this.format.getHeader(); if (formatHeader != null) { hdrMap = new LinkedHashMap(); String[] header = null; if (formatHeader.length == 0) { // read the header from the first line of the file final CSVRecord record = this.nextRecord(); if (record != null) { header = record.values(); } } else { if (this.format.getSkipHeaderRecord()) { this.nextRecord(); } header = formatHeader; } // build the name to index mappings if (header != null) { for (int i = 0; i < header.length; i++) { hdrMap.put(header[i], Integer.valueOf(i)); } } } return hdrMap; } public boolean isClosed() { return this.lexer.isClosed(); } /** * Returns an iterator on the records. * *

IOExceptions occurring during the iteration are wrapped in a * RuntimeException. * If the parser is closed a call to {@code next()} will throw a * NoSuchElementException.

*/ public Iterator iterator() { return new Iterator() { private CSVRecord current; private CSVRecord getNextRecord() { try { return CSVParser.this.nextRecord(); } catch (final IOException e) { // TODO: This is not great, throw an ISE instead? throw new RuntimeException(e); } } public boolean hasNext() { if (CSVParser.this.isClosed()) { return false; } if (this.current == null) { this.current = this.getNextRecord(); } return this.current != null; } public CSVRecord next() { if (CSVParser.this.isClosed()) { throw new NoSuchElementException("CSVParser has been closed"); } CSVRecord next = this.current; this.current = null; if (next == null) { // hasNext() wasn't called before next = this.getNextRecord(); if (next == null) { throw new NoSuchElementException("No more CSV records available"); } } return next; } public void remove() { throw new UnsupportedOperationException(); } }; } /** * Parses the next record from the current point in the stream. * * @return the record as an array of values, or null if the end of the stream has been reached * @throws IOException * on parse error or input read-failure */ CSVRecord nextRecord() throws IOException { if(tryNextRecord()){ return new CSVRecord(this.record.toArray(new String[this.record.size()]), this.headerMap,recordComment,this.recordNumber); } return null; } String[] nextRecord1() throws IOException { if(tryNextRecord()){ return this.record.toArray(new String[this.record.size()]); } return null; } boolean tryNextRecord() throws IOException { this.record.clear(); this.recordComment = null; StringBuilder sb = null; do { this.reusableToken.reset(); this.lexer.nextToken(this.reusableToken); switch (this.reusableToken.type) { case TOKEN: this.addRecordValue(); break; case EORECORD: this.addRecordValue(); break; case EOF: if (this.reusableToken.isReady) { this.addRecordValue(); } break; case INVALID: throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence"); case COMMENT: // Ignored currently 
if(readComment){ if (sb == null) { // first comment for this record sb = new StringBuilder(); } else { sb.append(Constants.LF); } sb.append(this.reusableToken.content); } this.reusableToken.type = TOKEN; // Read another token break; } } while (this.reusableToken.type == TOKEN); if (!this.record.isEmpty()) { this.recordNumber++; this.recordComment = sb == null ? null : sb.toString(); return true; } return false; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy