// edu.ucr.cs.bdlab.io.CSVFeatureWriter Maven / Gradle / Ivy
package edu.ucr.cs.bdlab.io;
import edu.ucr.cs.bdlab.geolite.IFeature;
import edu.ucr.cs.bdlab.util.DynamicArrays;
import edu.ucr.cs.bdlab.util.IntArray;
import edu.ucr.cs.bdlab.util.OperationParam;
import edu.ucr.cs.bdlab.wktparser.WKTParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Locale;
import java.util.function.BiFunction;
/**
 * Writes records that have value of type {@link edu.ucr.cs.bdlab.geolite.IFeature} while keys are ignored.
 * Each feature is encoded as one line of delimited text. Lines are accumulated in an in-memory
 * buffer that is written to the output stream as UTF-8 whenever it grows beyond
 * {@link #FlushThreshold} characters and once more when the writer is closed.
 */
@FeatureWriter.Metadata(extension = ".csv", shortName = "csv")
public class CSVFeatureWriter extends FeatureWriter {
  @OperationParam(
      description = "The field separator for output text files. Defaults to the same as the input"
  )
  public static final String FieldSeparator = "oseparator";

  @OperationParam(
      description = "For CSV files, adds a header line to the output file"
  )
  public static final String WriteHeader = "oheader";

  /**A constant for new line character for code readability*/
  protected static final char NewLine = '\n';

  /**Number of buffered characters beyond which the buffer is written to the output stream*/
  private static final int FlushThreshold = 16 * 1024;

  /**A name to use for each dimension*/
  protected final String[] DimensionNames = {"x", "y", "z", "w", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
      "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v"};

  /**The first line of the output will create a CSV header*/
  protected boolean writeHeader;

  /**The output stream that writes the CSV file*/
  protected DataOutputStream out;

  /**
   * Encodes features into text by appending to a {@link StringBuilder}.
   * NOTE(review): declared as a raw type in this listing; the intended type arguments are
   * presumably (IFeature, StringBuilder, ...) -- confirm against the encoder classes before
   * parameterizing.
   */
  protected BiFunction featureEncoder;

  /**A temporary string builder that buffers encoded lines until they are flushed*/
  protected StringBuilder str;

  /**The indexes of the columns where the geometry will be written*/
  protected int[] columnIndexes;

  /**A single character used as field separator*/
  protected char fieldSeparator;

  /**Type of geometry to be written (point, envelope, wkt)*/
  private String geometryType;

  /**
   * Initializes the writer to the given file path.
   * The file is created first; if the configuration turns out to be invalid, the newly created
   * stream is closed before the failure is propagated so that it does not leak.
   * @param csvPath the path of the file to create
   * @param conf the configuration that holds the output options
   * @throws IOException if the file cannot be created
   */
  @Override
  public void initialize(Path csvPath, Configuration conf) throws IOException {
    FileSystem fs = csvPath.getFileSystem(conf);
    OutputStream fileOut = fs.create(csvPath);
    try {
      this.initialize(fileOut, conf);
    } catch (RuntimeException e) {
      try {
        fileOut.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
  }

  /**
   * Initializes the writer to write to the given stream.
   * The header flag, the field separator, and the output format each fall back to the
   * corresponding input option when the output option is not set.
   * @param out the stream to write to; wrapped in a {@link DataOutputStream} unless it already is one
   * @param conf the configuration that holds the output options
   * @throws IllegalArgumentException if the field separator is empty or the format is missing or malformed
   * @throws RuntimeException if the geometry type has no matching encoder
   */
  @Override
  public void initialize(OutputStream out, Configuration conf) {
    this.writeHeader = conf.getBoolean(WriteHeader, conf.getBoolean(CSVFeatureReader.SkipHeader, false));
    String separator = conf.get(FieldSeparator, conf.get(CSVFeatureReader.FieldSeparator, "\t"));
    if (separator == null || separator.isEmpty())
      throw new IllegalArgumentException("The field separator cannot be empty");
    // Only the first character is used as the separator
    this.fieldSeparator = separator.charAt(0);
    detectGeometryTypeAndColumnIndexes(conf.get(SpatialOutputFormat.OutputFormat, conf.get(SpatialInputFormat.InputFormat)));
    switch (geometryType) {
      case "point":
        featureEncoder = new CSVPointEncoder(fieldSeparator, columnIndexes);
        break;
      case "wkt":
        assert columnIndexes.length == 1;
        featureEncoder = new CSVWKTEncoder(fieldSeparator, columnIndexes[0]);
        break;
      case "envelope":
        assert columnIndexes.length % 2 == 0;
        featureEncoder = new CSVEnvelopeEncoder(fieldSeparator, columnIndexes);
        break;
      default:
        // Reached for unrecognized types and also for "nogeom", which has no encoder here
        throw new RuntimeException(String.format("Unsupported output type '%s'", geometryType));
    }
    str = new StringBuilder();
    this.out = out instanceof DataOutputStream? (DataOutputStream) out : new DataOutputStream(out);
  }

  /**
   * Infers the geometry type and column indexes from the given user-friendly format.
   * The format is a type name optionally followed by a comma-separated list of integers between
   * parentheses, e.g., "wkt", "point(1,2)", or "envelopek(3,0,1,2,3,4,5)". The type name is
   * case-insensitive and whitespace around the name and the integers is ignored. For "pointk" and
   * "envelopek", the first integer is the number of dimensions and the type is normalized to
   * "point" and "envelope", respectively. An unrecognized type name is stored as-is and the
   * column indexes are left untouched; it is rejected later when an encoder is selected.
   * @param userFriendlyFormat the format string as provided by the user
   * @throws IllegalArgumentException if the format is null, has an unbalanced parenthesis,
   *   contains a non-integer value, or lacks the number of dimensions for a k-dimensional type
   */
  protected void detectGeometryTypeAndColumnIndexes(String userFriendlyFormat) {
    if (userFriendlyFormat == null)
      throw new IllegalArgumentException("No output format is specified and no input format to fall back to");
    int[] integers; // The integers between the parentheses or an empty array if not parentheses
    int openParenthesis = userFriendlyFormat.indexOf('(');
    if (openParenthesis != -1) {
      int closeParenthesis = userFriendlyFormat.indexOf(')', openParenthesis + 1);
      if (closeParenthesis == -1)
        throw new IllegalArgumentException(String.format("Missing ')' in format '%s'", userFriendlyFormat));
      // Locale.ROOT keeps the comparison below independent of the default locale
      geometryType = userFriendlyFormat.substring(0, openParenthesis).trim().toLowerCase(Locale.ROOT);
      String arguments = userFriendlyFormat.substring(openParenthesis + 1, closeParenthesis).trim();
      if (arguments.isEmpty()) {
        // Empty parentheses are treated the same as no parentheses
        integers = new int[0];
      } else {
        String[] integersParts = arguments.split(",");
        integers = new int[integersParts.length];
        for (int i = 0; i < integersParts.length; i++)
          integers[i] = Integer.parseInt(integersParts[i].trim());
      }
    } else {
      geometryType = userFriendlyFormat.trim().toLowerCase(Locale.ROOT);
      integers = new int[0];
    }
    // The indexes of the columns that contain the geometry
    switch (geometryType) {
      case "point":
        // A two-dimensional point
        this.columnIndexes = CSVFeature.createColumnIndexes(2, integers);
        break;
      case "envelope":
        // A two-dimensional envelope
        this.columnIndexes = CSVFeature.createColumnIndexes(4, integers);
        break;
      case "pointk": {
        // A k-dimensional point where the first integer indicates the number of dimensions
        int numDimensions = requireNumDimensions(integers, userFriendlyFormat);
        this.geometryType = "point";
        this.columnIndexes = CSVFeature.createColumnIndexes(numDimensions,
            Arrays.copyOfRange(integers, 1, integers.length));
        break;
      }
      case "envelopek": {
        // A k-dimensional envelope where the first integer indicates the number of dimensions
        int numDimensions = requireNumDimensions(integers, userFriendlyFormat);
        this.geometryType = "envelope";
        this.columnIndexes = CSVFeature.createColumnIndexes(numDimensions * 2,
            Arrays.copyOfRange(integers, 1, integers.length));
        break;
      }
      case "wkt":
        this.columnIndexes = CSVFeature.createColumnIndexes(1, integers);
        break;
      case "nogeom":
        this.columnIndexes = new int[0];
        break;
      default:
        // No valid format detected; the caller rejects the unknown geometry type
        break;
    }
  }

  /**
   * Returns the number of dimensions given as the first integer of a k-dimensional format.
   * @param integers the integers parsed from the format
   * @param userFriendlyFormat the original format; used only for the error message
   * @return the first integer
   * @throws IllegalArgumentException if no integers were given
   */
  private static int requireNumDimensions(int[] integers, String userFriendlyFormat) {
    if (integers.length == 0)
      throw new IllegalArgumentException(
          String.format("The format '%s' must specify the number of dimensions", userFriendlyFormat));
    return integers[0];
  }

  /**
   * Appends one feature as a line to the buffer and flushes the buffer if it became long enough.
   * If a header was requested, it is produced from the first feature that is written.
   * @param dummy ignored
   * @param f the feature to write
   * @throws IOException if writing to the output stream fails
   * @throws InterruptedException declared by the overridden method; not thrown by this code
   */
  @Override
  public void write(Object dummy, IFeature f) throws IOException, InterruptedException {
    if (writeHeader) {
      writeHeader(f, str);
      writeHeader = false;
    }
    // Write the feature as a CSV line
    featureEncoder.apply(f, str);
    str.append(NewLine);
    if (str.length() > FlushThreshold) {
      // String builder is long enough to flush to disk
      flushBuffer();
    }
  }

  /**
   * Writes the contents of the buffer to the output stream as UTF-8 and empties the buffer.
   * The buffer is emptied before the write is attempted.
   * @throws IOException if writing to the output stream fails
   */
  private void flushBuffer() throws IOException {
    byte[] bytes = str.toString().getBytes(StandardCharsets.UTF_8);
    str.setLength(0);
    out.write(bytes, 0, bytes.length);
  }

  /**
   * Appends a header line that names all the columns of the given feature.
   * Geometry columns are named "geometry" for WKT, by dimension name for points, and by
   * dimension name followed by "min" or "max" for envelopes. All other columns take the
   * attribute names of the feature in order.
   * @param feature the feature to take the attribute names and number of dimensions from
   * @param str the string builder to append the header line to
   */
  private void writeHeader(IFeature feature, StringBuilder str) {
    int numDimensions = feature.getGeometry().getCoordinateDimension();
    int[] invertedIndex = DynamicArrays.invertedIndex(columnIndexes);
    int nextAttribute = 0; // The index of the attribute to write next
    boolean firstColumn = true;
    for (int geometryColumn : invertedIndex) {
      if (!firstColumn)
        str.append(fieldSeparator);
      if (geometryColumn == -1) {
        // This output column holds an attribute rather than a part of the geometry
        str.append(feature.getAttributeName(nextAttribute++));
      } else {
        switch (geometryType) {
          case "wkt":
            str.append("geometry");
            break;
          case "point":
            str.append(DimensionNames[geometryColumn]);
            break;
          case "envelope":
            // The first numDimensions columns are the minimums; the rest are the maximums
            if (geometryColumn < numDimensions)
              str.append(DimensionNames[geometryColumn]).append("min");
            else
              str.append(DimensionNames[geometryColumn - numDimensions]).append("max");
            break;
          default:
            throw new RuntimeException(String.format("Unsupported format '%s'", geometryType));
        }
      }
      firstColumn = false;
    }
    // Any attributes left after the last geometry column
    while (nextAttribute < feature.getNumAttributes()) {
      if (!firstColumn)
        str.append(fieldSeparator);
      str.append(feature.getAttributeName(nextAttribute++));
      firstColumn = false;
    }
    str.append(NewLine);
  }

  /**
   * Estimates the output size of the given feature by encoding it into a scratch buffer.
   * @param f the feature to estimate
   * @return the number of characters in the encoded line including the terminating new line
   */
  @Override
  public int estimateSize(IFeature f) {
    StringBuilder scratch = new StringBuilder();
    featureEncoder.apply(f, scratch);
    scratch.append(NewLine);
    return scratch.length();
  }

  /**
   * Writes any remaining buffered text and closes the output stream.
   * The stream is closed even if writing the remaining text fails.
   * @param taskAttemptContext not used
   * @throws IOException if writing the remaining text or closing the stream fails
   * @throws InterruptedException declared by the overridden method; not thrown by this code
   */
  @Override
  public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
    try (DataOutputStream closing = out) {
      // Write any remaining bytes in the string builder
      flushBuffer();
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy