// leap.lang.csv.CSVParser Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of leap-lang Show documentation
// The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package leap.lang.csv;

import static leap.lang.csv.Token.Type.TOKEN;

import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

import leap.lang.Args;

/**
 * Parses CSV files according to the specified configuration.
 *
 * <p>
 * Because CSV appears in many different dialects, the parser supports many configuration settings by allowing the
 * specification of a {@link CSVFormat}.
 * </p>
 *
 * <p>
 * To parse a CSV input with tabs as separators, '"' (double-quote) as an optional value encapsulator, and comments
 * starting with '#', you write:
 * </p>
 *
 * <pre>
 * Reader in = new StringReader("a\tb\nc\td");
 * Iterable&lt;CSVRecord&gt; parser = CSVFormat.DEFAULT
 *     .withCommentStart('#')
 *     .withDelimiter('\t')
 *     .withQuoteChar('"').parse(in);
 * for (CSVRecord csvRecord : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * To parse CSV input in a given format like Excel, you write:
 * </p>
 *
 * <pre>
 * Reader in = new StringReader("a;b\nc;d");
 * Iterable&lt;CSVRecord&gt; parser = CSVFormat.EXCEL.parse(in);
 * for (CSVRecord record : parser) {
 *     ...
 * }
 * </pre>
 *
 * <p>
 * You may also get a List of records:
 * </p>
 *
 * <pre>
 * Reader in = new StringReader("a;b\nc;d");
 * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
 * List&lt;CSVRecord&gt; list = parser.getRecords();
 * </pre>
 *
 * <p>
 * See also the various static parse methods on this class.
 * </p>
 *
 * <p>
 * Internal parser state is completely covered by the format and the reader-state.
 * </p>
 *
 * <p>
 * See the package documentation for more details.
 * </p>
 *
 * @version $Id: CSVParser.java 1519269 2013-09-01 13:36:08Z britter $
 */
final class CSVParser implements Iterable<CSVRecord>, Closeable {

    /**
     * Creates a parser for the given {@link File}.
     *
     * <p>
     * The file is opened with a {@link FileReader}, so its bytes are decoded with the JVM's default charset. If the
     * parser cannot be constructed, the file reader opened here is closed before the failure is propagated.
     * </p>
     *
     * @param file
     *            a CSV file. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either file or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(File file, final CSVFormat format) throws IOException {
        Args.notNull(file, "file");
        Args.notNull(format, "format");

        return newParserClosingOnFailure(new FileReader(file), format);
    }

    /**
     * Creates a parser for the given {@link String}.
     *
     * @param string
     *            a CSV string. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either string or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(String string, final CSVFormat format) throws IOException {
        Args.notNull(string, "string");
        Args.notNull(format, "format");

        return new CSVParser(new StringReader(string), format);
    }

    /**
     * Creates a parser for the given URL.
     *
     * <p>
     * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless
     * you close the {@code url}. If the parser cannot be constructed, the stream opened here is closed before the
     * failure is propagated.
     * </p>
     *
     * @param url
     *            a URL. Must not be null.
     * @param charset
     *            the charset for the resource. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @return a new parser
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either url, charset or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public static CSVParser parse(URL url, Charset charset, final CSVFormat format) throws IOException {
        Args.notNull(url, "url");
        Args.notNull(charset, "charset");
        Args.notNull(format, "format");

        // charset has already been checked above, so no default-charset fallback is needed here.
        return newParserClosingOnFailure(new InputStreamReader(url.openStream(), charset), format);
    }

    /**
     * Builds a parser around a reader that was opened by one of the static factory methods, closing that reader if
     * construction fails so that the underlying file or stream is not leaked.
     *
     * @param reader
     *            a reader owned by the calling factory method
     * @param format
     *            the CSVFormat used for CSV parsing
     * @return a new parser reading from {@code reader}
     * @throws IOException
     *             If an I/O error occurs while the parser is being constructed
     */
    private static CSVParser newParserClosingOnFailure(final Reader reader, final CSVFormat format)
            throws IOException {
        boolean created = false;
        try {
            final CSVParser parser = new CSVParser(reader, format);
            created = true;
            return parser;
        } finally {
            if (!created) {
                try {
                    reader.close();
                } catch (final IOException ignored) {
                    // The construction failure is the error worth reporting; a secondary close failure is dropped.
                }
            }
        }
    }

    // the following objects are shared to reduce garbage

    private final CSVFormat format;

    /** Column name to 0-based column index; {@code null} when the format defines no header. */
    private final Map<String, Integer> headerMap;

    private final Lexer lexer;

    /** A record buffer for getRecord(). Grows as necessary and is reused. */
    private final List<String> record = new ArrayList<String>();

    /** Whether comment tokens preceding a record are collected into the record comment. */
    private boolean readComment = true;

    /** Comment text collected for the most recently read record, or {@code null} if there was none. */
    private String recordComment;

    /** Number of records read so far, including any header record consumed during construction. */
    private long recordNumber;

    private final Token reusableToken = new Token();

    /**
     * Customized CSV parser using the given {@link CSVFormat}
     *
     * <p>
     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
     * unless you close the {@code reader}.
     * </p>
     *
     * @param reader
     *            a Reader containing CSV-formatted input. Must not be null.
     * @param format
     *            the CSVFormat used for CSV parsing. Must not be null.
     * @throws IllegalArgumentException
     *             If the parameters of the format are inconsistent or if either reader or format are null.
     * @throws IOException
     *             If an I/O error occurs
     */
    public CSVParser(final Reader reader, final CSVFormat format) throws IOException {
        Args.notNull(reader, "reader");
        Args.notNull(format, "format");

        format.validate();
        this.format = format;
        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
        // May already consume the first record of the input (see initializeHeader()).
        this.headerMap = this.initializeHeader();
    }

    /**
     * Appends the content of the current token to the record buffer. When the format defines a null string, a value
     * that matches it (ignoring case) is stored as {@code null} instead.
     */
    private void addRecordValue() {
        final String input = this.reusableToken.content.toString();
        final String nullString = this.format.getNullString();
        if (nullString == null) {
            this.record.add(input);
        } else {
            this.record.add(input.equalsIgnoreCase(nullString) ? null : input);
        }
    }

    /**
     * Closes resources.
     *
     * @throws IOException
     *             If an I/O error occurs
     */
    @Override
    public void close() throws IOException {
        // lexer is a final field that is always assigned by the constructor, so no null check is required.
        this.lexer.close();
    }

    /**
     * Returns the current line number in the input stream.
     *
     * <p>
     * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the record number.
     * </p>
     *
     * @return current line number
     */
    public long getCurrentLineNumber() {
        return this.lexer.getCurrentLineNumber();
    }

    /**
     * Returns a copy of the header map that iterates in column order.
     *
     * <p>
     * The map keys are column names. The map values are 0-based indices.
     * </p>
     *
     * @return a copy of the header map that iterates in column order, or {@code null} if the format defines no header.
     */
    public Map<String, Integer> getHeaderMap() {
        // headerMap is null when the format has no header; copying it unconditionally would throw a NullPointerException.
        return this.headerMap == null ? null : new LinkedHashMap<String, Integer>(this.headerMap);
    }

    /**
     * Returns the current record number in the input stream.
     *
     * <p>
     * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to the line number.
     * </p>
     *
     * @return current record number
     */
    public long getRecordNumber() {
        return this.recordNumber;
    }

    /**
     * Parses the CSV input according to the given format and returns the content as a list of {@link CSVRecord}
     * entries.
     *
     * <p>
     * The returned content starts at the current parse-position in the stream.
     * </p>
     *
     * @return list of {@link CSVRecord} entries, may be empty
     * @throws IOException
     *             on parse error or input read-failure
     */
    public List<CSVRecord> getRecords() throws IOException {
        final List<CSVRecord> records = new ArrayList<CSVRecord>();
        CSVRecord rec;
        while ((rec = this.nextRecord()) != null) {
            records.add(rec);
        }
        return records;
    }

    /**
     * Same as {@link #getRecords()}, but returns each remaining record as a plain array of its values instead of a
     * {@link CSVRecord}.
     *
     * @return list of value arrays, may be empty
     * @throws IOException
     *             on parse error or input read-failure
     */
    List<String[]> getRecords1() throws IOException {
        final List<String[]> records = new ArrayList<String[]>();
        String[] rec;
        while ((rec = this.nextRecord1()) != null) {
            records.add(rec);
        }
        return records;
    }

    /**
     * Initializes the name to index mapping if the format defines a header.
     *
     * <p>
     * An empty header array in the format means the column names are read from the first record of the input. A
     * non-empty header array is used as-is, and the first record of the input is skipped when the format asks for it.
     * </p>
     *
     * @return the name to index mapping in column order, or {@code null} if the format defines no header
     * @throws IOException
     *             on parse error or input read-failure while reading the header record
     */
    private Map<String, Integer> initializeHeader() throws IOException {
        Map<String, Integer> hdrMap = null;
        final String[] formatHeader = this.format.getHeader();
        if (formatHeader != null) {
            hdrMap = new LinkedHashMap<String, Integer>();

            String[] header = null;
            if (formatHeader.length == 0) {
                // read the header from the first line of the file
                final CSVRecord headerRecord = this.nextRecord();
                if (headerRecord != null) {
                    header = headerRecord.values();
                }
            } else {
                if (this.format.getSkipHeaderRecord()) {
                    this.nextRecord();
                }
                header = formatHeader;
            }

            // build the name to index mappings
            if (header != null) {
                for (int i = 0; i < header.length; i++) {
                    hdrMap.put(header[i], Integer.valueOf(i));
                }
            }
        }
        return hdrMap;
    }

    /**
     * Tells whether the underlying lexer has been closed.
     *
     * @return the closed state reported by the lexer
     */
    public boolean isClosed() {
        return this.lexer.isClosed();
    }

    /**
     * Returns an iterator on the records.
     *
     * <p>
     * IOExceptions occurring during the iteration are wrapped in a RuntimeException. If the parser is closed a call
     * to {@code next()} will throw a NoSuchElementException.
     * </p>
     *
     * @return an iterator that reads records from the current parse-position; it does not support {@code remove()}
     */
    @Override
    public Iterator<CSVRecord> iterator() {
        return new Iterator<CSVRecord>() {
            /** Record fetched by hasNext() and not yet handed out by next(). */
            private CSVRecord current;

            private CSVRecord getNextRecord() {
                try {
                    return CSVParser.this.nextRecord();
                } catch (final IOException e) {
                    // TODO: This is not great, throw an ISE instead?
                    throw new RuntimeException(e);
                }
            }

            @Override
            public boolean hasNext() {
                if (CSVParser.this.isClosed()) {
                    return false;
                }
                if (this.current == null) {
                    this.current = this.getNextRecord();
                }

                return this.current != null;
            }

            @Override
            public CSVRecord next() {
                if (CSVParser.this.isClosed()) {
                    throw new NoSuchElementException("CSVParser has been closed");
                }
                CSVRecord next = this.current;
                this.current = null;

                if (next == null) {
                    // hasNext() wasn't called before
                    next = this.getNextRecord();
                    if (next == null) {
                        throw new NoSuchElementException("No more CSV records available");
                    }
                }

                return next;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Parses the next record from the current point in the stream.
     *
     * @return the record, or null if the end of the stream has been reached
     * @throws IOException
     *             on parse error or input read-failure
     */
    CSVRecord nextRecord() throws IOException {
        if (this.tryNextRecord()) {
            final String[] values = this.record.toArray(new String[this.record.size()]);
            return new CSVRecord(values, this.headerMap, this.recordComment, this.recordNumber);
        }
        return null;
    }

    /**
     * Parses the next record from the current point in the stream and returns only its values.
     *
     * @return the record as an array of values, or null if the end of the stream has been reached
     * @throws IOException
     *             on parse error or input read-failure
     */
    String[] nextRecord1() throws IOException {
        if (this.tryNextRecord()) {
            return this.record.toArray(new String[this.record.size()]);
        }
        return null;
    }

    /**
     * Reads tokens from the lexer into the shared record buffer until a token that ends the record is returned.
     *
     * <p>
     * The buffer and the record comment are reset first. When at least one value was read, the record number is
     * incremented and the comments collected while reading (joined with {@code Constants.LF}) become the record
     * comment.
     * </p>
     *
     * @return {@code true} if the record buffer holds at least one value afterwards, {@code false} otherwise
     * @throws IOException
     *             on parse error or input read-failure
     */
    boolean tryNextRecord() throws IOException {
        this.record.clear();
        this.recordComment = null;
        StringBuilder comments = null;
        do {
            this.reusableToken.reset();
            this.lexer.nextToken(this.reusableToken);
            switch (this.reusableToken.type) {
            case TOKEN:
                this.addRecordValue();
                break;
            case EORECORD:
                this.addRecordValue();
                break;
            case EOF:
                if (this.reusableToken.isReady) {
                    this.addRecordValue();
                }
                break;
            case INVALID:
                throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence");
            case COMMENT:
                // Comments are collected (one per line) when readComment is set, otherwise they are dropped.
                if (this.readComment) {
                    if (comments == null) { // first comment for this record
                        comments = new StringBuilder();
                    } else {
                        comments.append(Constants.LF);
                    }
                    comments.append(this.reusableToken.content);
                }
                this.reusableToken.type = TOKEN; // Read another token
                break;
            }
        } while (this.reusableToken.type == TOKEN);

        if (!this.record.isEmpty()) {
            this.recordNumber++;
            this.recordComment = comments == null ? null : comments.toString();
            return true;
        }
        return false;
    }
}