com.univocity.parsers.common.CommonParserSettings Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of univocity-parsers Show documentation
univocity's open source parsers for processing different text formats using a consistent API
There is a newer version: 2.9.1
/*******************************************************************************
 * Copyright 2014 uniVocity Software Pty Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.univocity.parsers.common;

import java.util.*;

import com.univocity.parsers.common.input.*;
import com.univocity.parsers.common.input.concurrent.*;
import com.univocity.parsers.common.processor.*;

/**
 * This is the parent class for all configuration classes used by parsers ({@link AbstractParser})
 *
 * By default, all parsers work with, at least, the following configuration options in addition to the ones provided by {@link CommonSettings}:
 *
 * 

 * 	rowProcessor: a callback implementation of the interface {@link RowProcessor} which handles the life cycle of the parsing process and processes each record extracted from the input
 *  headerExtractionEnabled (defaults to false): indicates whether or not the first valid record parsed from the input should be considered as the row containing the names of each column
 *  columnReorderingEnabled (defaults to true): indicates whether fields selected using the field selection methods (defined by the parent class {@link CommonSettings}) should be reordered.
 *  	When disabled, each parsed record will contain values for all columns, in the order they occur in the input. Fields which were not selected will not be parsed but and the record will contain empty values.
 *  	
When enabled, each parsed record will contain values only for the selected columns. The values will be ordered according to the selection.
 *  
inputBufferSize (defaults to 1024*1024 characters): The number of characters held by the parser's buffer when processing the input.
 *  
readInputOnSeparateThread (defaults true if the number of available processors at runtime is greater than 1):
 *  	When enabled, a reading thread (in input.concurrent.ConcurrentCharInputReader) will be started and load characters from the input, while the parser is processing its input buffer.
 *         This yields better performance, especially when reading from big input (greater than 100 mb)
		
When disabled, the parsing process will briefly pause so the buffer can be replenished every time it is exhausted (in {@link DefaultCharInputReader} it is not as bad or slow as it sounds, and can even be (slightly) more efficient if your input is small)
 *  
numberOfRecordsToRead (defaults to -1): Defines how many (valid) records are to be parsed before the process is stopped. A negative value indicates there's no limit.
 *  lineSeparatorDetectionEnabled (defaults to false): Attempts to identify what is the line separator being used in the input.
 *  	The first row of the input will be read until a sequence of '\r\n', or characters '\r' or '\n' is found. If a match is found, then it will be used as the line separator to use to parse the input
 * 
 *
 * @param  the format supported by this parser.
 *
 * @see com.univocity.parsers.common.processor.RowProcessor
 * @see com.univocity.parsers.csv.CsvParserSettings
 * @see com.univocity.parsers.fixed.FixedWidthParserSettings
 *
 * @author uniVocity Software Pty Ltd - [email protected]
 *
 */
public abstract class CommonParserSettings extends CommonSettings {

	private boolean headerExtractionEnabled = false;
	private RowProcessor rowProcessor;
	private boolean columnReorderingEnabled = true;
	private int inputBufferSize = 1024 * 1024;
	private boolean readInputOnSeparateThread = Runtime.getRuntime().availableProcessors() > 1;
	private int numberOfRecordsToRead = -1;
	private boolean lineSeparatorDetectionEnabled = false;

	/**
	 * Indicates whether or not a separate thread will be used to read characters from the input while parsing (defaults true if the number of available
	 * processors at runtime is greater than 1)
	 * 	When enabled, a reading thread (in com.univocity.parsers.common.input.concurrent.ConcurrentCharInputReader)
	 *     will be started and load characters from the input, while the parser is processing its input buffer.
	 *     This yields better performance, especially when reading from big input (greater than 100 mb)
	 *  
When disabled, the parsing process will briefly pause so the buffer can be replenished every time it is exhausted
	 *     (in {@link DefaultCharInputReader} it is not as bad or slow as it sounds, and can even be (slightly) more efficient if your input is small)
	 * @return true if the input should be read on a separate thread, false otherwise
	 */
	public boolean getReadInputOnSeparateThread() {
		return readInputOnSeparateThread;
	}

	/**
	 * Defines whether or not a separate thread will be used to read characters from the input while parsing (defaults true if the number of available
	 *  processors at runtime is greater than 1)
	 * 	
When enabled, a reading thread (in com.univocity.parsers.common.input.concurrent.ConcurrentCharInputReader) will be
	 *     started and load characters from the input, while the
	 *     parser is processing its input buffer. This yields better performance, especially when reading from big input (greater than 100 mb)
	 *  
When disabled, the parsing process will briefly pause so the buffer can be replenished every time it is exhausted (in {@link DefaultCharInputReader}
	 *     it is not as bad or slow as it sounds, and can even be (slightly) more efficient if your input is small)
	 * @param readInputOnSeparateThread the flag indicating whether or not the input should be read on a separate thread
	 */
	public void setReadInputOnSeparateThread(boolean readInputOnSeparateThread) {
		this.readInputOnSeparateThread = readInputOnSeparateThread;
	}

	/**
	 * Indicates whether or not the first valid record parsed from the input should be considered as the row containing the names of each column
	 * @return true if the first valid record parsed from the input should be considered as the row containing the names of each column, false otherwise
	 */
	public boolean isHeaderExtractionEnabled() {
		return headerExtractionEnabled;
	}

	/**
	 * Defines whether or not the first valid record parsed from the input should be considered as the row containing the names of each column
	 * @param headerExtractionEnabled a flag indicating whether the first valid record parsed from the input should be considered as the row containing the names of each column
	 */
	public void setHeaderExtractionEnabled(boolean headerExtractionEnabled) {
		this.headerExtractionEnabled = headerExtractionEnabled;
	}

	/**
	 * Returns the callback implementation of the interface {@link RowProcessor} which handles the lifecyle of the parsing process and processes each record extracted from the input
	 * @return Returns the RowProcessor used by the parser to handle each record
	 *
	 * @see com.univocity.parsers.common.processor.ObjectRowProcessor
	 * @see com.univocity.parsers.common.processor.ObjectRowListProcessor
	 * @see com.univocity.parsers.common.processor.MasterDetailProcessor
	 * @see com.univocity.parsers.common.processor.MasterDetailListProcessor
	 * @see com.univocity.parsers.common.processor.BeanProcessor
	 * @see com.univocity.parsers.common.processor.BeanListProcessor
	 */
	public RowProcessor getRowProcessor() {
		if (rowProcessor == null) {
			return NoopRowProcessor.instance;
		}
		return rowProcessor;
	}

	/**
	 * Defines the callback implementation of the interface {@link RowProcessor} which handles the lifecyle of the parsing process and processes each record extracted from the input
	 * @param processor the RowProcessor instance which should used by the parser to handle each record
	 *
	 * @see com.univocity.parsers.common.processor.ObjectRowProcessor
	 * @see com.univocity.parsers.common.processor.ObjectRowListProcessor
	 * @see com.univocity.parsers.common.processor.MasterDetailProcessor
	 * @see com.univocity.parsers.common.processor.MasterDetailListProcessor
	 * @see com.univocity.parsers.common.processor.BeanProcessor
	 * @see com.univocity.parsers.common.processor.BeanListProcessor
	 */
	public void setRowProcessor(RowProcessor processor) {
		this.rowProcessor = processor;
	}

	/**
	 * An implementation of {@link CharInputReader} which loads the parser buffer in parallel or sequentially, as defined by the readInputOnSeparateThread property
	 * @return The input reader as chosen with the readInputOnSeparateThread property.
	 */
	CharInputReader newCharInputReader() {
		if (readInputOnSeparateThread) {
			if (lineSeparatorDetectionEnabled) {
				return new ConcurrentCharInputReader(getFormat().getNormalizedNewline(), this.getInputBufferSize(), 10);
			} else {
				return new ConcurrentCharInputReader(getFormat().getLineSeparator(), getFormat().getNormalizedNewline(), this.getInputBufferSize(), 10);
			}
		} else {
			if (lineSeparatorDetectionEnabled) {
				return new DefaultCharInputReader(getFormat().getNormalizedNewline(), this.getInputBufferSize());
			} else {
				return new DefaultCharInputReader(getFormat().getLineSeparator(), getFormat().getNormalizedNewline(), this.getInputBufferSize());
			}
		}
	}

	/**
	 * The number of valid records to be parsed before the process is stopped. A negative value indicates there's no limit (defaults to -1).
	 * @return the number of records to read before stopping the parsing process.
	 */
	public int getNumberOfRecordsToRead() {
		return numberOfRecordsToRead;
	}

	/**
	 * Defines the number of valid records to be parsed before the process is stopped. A negative value indicates there's no limit (defaults to -1).
	 * @param numberOfRecordsToRead the number of records to read before stopping the parsing process.
	 */
	public void setNumberOfRecordsToRead(int numberOfRecordsToRead) {
		this.numberOfRecordsToRead = numberOfRecordsToRead;
	}

	/**
	 * Indicates whether fields selected using the field selection methods (defined by the parent class {@link CommonSettings}) should be reordered (defaults to true).
	 * 	
When disabled, each parsed record will contain values for all columns, in the order they occur in the input. Fields which were not selected will not be parsed but and the record will contain empty values.
	 * 	
When enabled, each parsed record will contain values only for the selected columns. The values will be ordered according to the selection.
	 * @return true if the selected fields should be reordered and returned by the parser, false otherwise
	 */
	public boolean isColumnReorderingEnabled() {
		return columnReorderingEnabled;
	}

	/**
	 * Defines whether fields selected using the field selection methods (defined by the parent class {@link CommonSettings}) should be reordered (defaults to true).
	 * 	
When disabled, each parsed record will contain values for all columns, in the order they occur in the input. Fields which were not selected will not be parsed but and the record will contain empty values.
	 * 	When enabled, each parsed record will contain values only for the selected columns. The values will be ordered according to the selection.
	 * @param columnReorderingEnabled the flag indicating whether or not selected fields should be reordered and returned by the parser
	 */
	public void setColumnReorderingEnabled(boolean columnReorderingEnabled) {
		this.columnReorderingEnabled = columnReorderingEnabled;
	}

	/**
	 * Informs the number of characters held by the parser's buffer when processing the input (defaults to 1024*1024 characters).
	 * @return the number of characters held by the parser's buffer when processing the input
	 */
	public int getInputBufferSize() {
		return inputBufferSize;
	}

	/**
	 * Defines the number of characters held by the parser's buffer when processing the input (defaults to 1024*1024 characters).
	 * @param inputBufferSize the new input buffer size (in number of characters)
	 */
	public void setInputBufferSize(int inputBufferSize) {
		this.inputBufferSize = inputBufferSize;
	}

	/**
	 * Returns an instance of CharAppender with the configured limit of maximum characters per column and the default value used to represent a null value (when the String parsed from the input is empty)
	 * @return an instance of CharAppender with the configured limit of maximum characters per column and the default value used to represent a null value (when the String parsed from the input is empty)
	 */
	protected CharAppender newCharAppender() {
		return new DefaultCharAppender(getMaxCharsPerColumn(), getNullValue());
	}

	/**
	 * Indicates whether the parser should detect the line separator automatically.
	 * @return {@code true} if the first line of the input should be used to search for common line separator sequences (the matching sequence will be used as the line separator for parsing). Otherwise {@code false}.
	 */
	public final boolean isLineSeparatorDetectionEnabled() {
		return lineSeparatorDetectionEnabled;
	}

	/**
	 * Defines whether the parser should detect the line separator automatically.
	 * @param lineSeparatorDetectionEnabled a flag indicating whether the first line of the input should be used to search for common line separator sequences (the matching sequence will be used as the line separator for parsing).
	 */
	public final void setLineSeparatorDetectionEnabled(boolean lineSeparatorDetectionEnabled) {
		this.lineSeparatorDetectionEnabled = lineSeparatorDetectionEnabled;
	}

	@Override
	protected void addConfiguration(Map out) {
		super.addConfiguration(out);
		out.put("Header extraction enabled", headerExtractionEnabled);
		out.put("Row processor", rowProcessor == null ? "none" : rowProcessor.getClass().getName());
		out.put("Column reordering enabled", columnReorderingEnabled);
		out.put("Input buffer size", inputBufferSize);
		out.put("Input reading on separate thread", readInputOnSeparateThread);
		out.put("Number of records to read", numberOfRecordsToRead == -1 ? "all" : numberOfRecordsToRead);
		out.put("Line separator detection enabled", lineSeparatorDetectionEnabled);
	}
}