All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.stratosphere.api.java.io.CsvOutputFormat Maven / Gradle / Ivy

There is a newer version: 0.5.2-hadoop2
Show newest version
/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.api.java.io;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.api.common.InvalidProgramException;
import eu.stratosphere.api.common.io.FileOutputFormat;
import eu.stratosphere.api.java.tuple.Tuple;
import eu.stratosphere.api.java.typeutils.InputTypeConfigurable;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.types.StringValue;

/**
 * This is an OutputFormat to serialize {@link eu.stratosphere.api.java.tuple.Tuple}s to text. The output is
 * structured by record delimiters and field delimiters as common in CSV files.
 * Record delimiter separate records from each other ('\n' is common). Field
 * delimiters separate fields within a record.
 */
public class CsvOutputFormat extends FileOutputFormat implements InputTypeConfigurable {
	private static final long serialVersionUID = 1L;

	@SuppressWarnings("unused")
	private static final Log LOG = LogFactory.getLog(CsvOutputFormat.class);
	
	// --------------------------------------------------------------------------------------------
	
	public static final String DEFAULT_LINE_DELIMITER = CsvInputFormat.DEFAULT_LINE_DELIMITER;
	
	public static final String DEFAULT_FIELD_DELIMITER = String.valueOf(CsvInputFormat.DEFAULT_FIELD_DELIMITER);

	// --------------------------------------------------------------------------------------------

	private transient Writer wrt;

	private String fieldDelimiter;

	private String recordDelimiter;

	private String charsetName;

	private boolean allowNullValues = true;

	private boolean quoteStrings = false;

	// --------------------------------------------------------------------------------------------
	// Constructors and getters/setters for the configurable parameters
	// --------------------------------------------------------------------------------------------

	/**
	 * Creates an instance of CsvOutputFormat. Lines are separated by the newline character '\n',
	 * fields are separated by ','.
	 * 
	 * @param outputPath The path where the CSV file is written.
	 */
	public CsvOutputFormat(Path outputPath) {
		this(outputPath, DEFAULT_LINE_DELIMITER, DEFAULT_FIELD_DELIMITER);
	}

	/**
	 * Creates an instance of CsvOutputFormat. Lines are separated by the newline character '\n',
	 * fields by the given field delimiter.
	 * 
	 * @param outputPath The path where the CSV file is written.
	 * @param fieldDelimiter
	 *            The delimiter that is used to separate fields in a tuple.
	 */
	public CsvOutputFormat(Path outputPath, String fieldDelimiter) {
		this(outputPath, DEFAULT_LINE_DELIMITER, fieldDelimiter);
	}

	/**
	 * Creates an instance of CsvOutputFormat.
	 * 
	 * @param outputPath The path where the CSV file is written.
	 * @param recordDelimiter
	 *            The delimiter that is used to separate the tuples.
	 * @param fieldDelimiter
	 *            The delimiter that is used to separate fields in a tuple.
	 */
	public CsvOutputFormat(Path outputPath, String recordDelimiter, String fieldDelimiter) {
		super(outputPath);
		if (recordDelimiter == null) {
			throw new IllegalArgumentException("RecordDelmiter shall not be null.");
		}
		if (fieldDelimiter == null) {
			throw new IllegalArgumentException("FieldDelimiter shall not be null.");
		}

		this.fieldDelimiter = fieldDelimiter;
		this.recordDelimiter = recordDelimiter;
		this.allowNullValues = false;
	}
	
	/**
	 * Configures the format to either allow null values (writing an empty field),
	 * or to throw an exception when encountering a null field.
	 * 

* by default, null values are allowed. * * @param allowNulls Flag to indicate whether the output format should accept null values. */ public void setAllowNullValues(boolean allowNulls) { this.allowNullValues = allowNulls; } /** * Sets the charset with which the CSV strings are written to the file. * If not specified, the output format uses the systems default character encoding. * * @param charsetName The name of charset to use for encoding the output. */ public void setCharsetName(String charsetName) { this.charsetName = charsetName; } /** * Configures whether the output format should quote string values. String values are fields * of type {@link java.lang.String} and {@link eu.stratosphere.types.StringValue}, as well as * all subclasses of the latter. *

* By default, strings are not quoted. * * @param quoteStrings Flag indicating whether string fields should be quoted. */ public void setQuoteStrings(boolean quoteStrings) { this.quoteStrings = quoteStrings; } // -------------------------------------------------------------------------------------------- @Override public void open(int taskNumber, int numTasks) throws IOException { super.open(taskNumber, numTasks); this.wrt = this.charsetName == null ? new OutputStreamWriter(new BufferedOutputStream(this.stream, 4096)) : new OutputStreamWriter(new BufferedOutputStream(this.stream, 4096), this.charsetName); } @Override public void close() throws IOException { if (wrt != null) { this.wrt.close(); } super.close(); } @Override public void writeRecord(T element) throws IOException { int numFields = element.getArity(); for (int i = 0; i < numFields; i++) { Object v = element.getField(i); if (v != null) { if (i != 0) { this.wrt.write(this.fieldDelimiter); } if (quoteStrings) { if (v instanceof String || v instanceof StringValue) { this.wrt.write('"'); this.wrt.write(v.toString()); this.wrt.write('"'); } else { this.wrt.write(v.toString()); } } else { this.wrt.write(v.toString()); } } else { if (this.allowNullValues) { if (i != 0) { this.wrt.write(this.fieldDelimiter); } } else { throw new RuntimeException("Cannot write tuple with value at position: " + i); } } } // add the record delimiter this.wrt.write(this.recordDelimiter); } // -------------------------------------------------------------------------------------------- /** * * The purpose of this method is solely to check whether the data type to be processed * is in fact a tuple type. */ @Override public void setInputType(TypeInformation type) { if (!type.isTupleType()) { throw new InvalidProgramException("The " + CsvOutputFormat.class.getSimpleName() + " can only be used to write tuple data sets."); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy