// edu.ucr.cs.bdlab.io.ShapefileGeometryReader Maven / Gradle / Ivy
/*
* Copyright 2018 University of California, Riverside
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.ucr.cs.bdlab.io;
import edu.ucr.cs.bdlab.geolite.Envelope;
import edu.ucr.cs.bdlab.geolite.GeometryType;
import edu.ucr.cs.bdlab.geolite.IGeometry;
import edu.ucr.cs.bdlab.geolite.MultiPoint;
import edu.ucr.cs.bdlab.geolite.Point;
import edu.ucr.cs.bdlab.geolite.twod.LineString2D;
import edu.ucr.cs.bdlab.geolite.twod.MultiLineString2D;
import edu.ucr.cs.bdlab.geolite.twod.MultiPolygon2D;
import edu.ucr.cs.bdlab.util.IOUtil;
import edu.ucr.cs.bdlab.util.WritableExternalizable;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Enumeration;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
public class ShapefileGeometryReader extends RecordReader {
/**Logger used for the warnings issued by this reader*/
private static final Log LOG = LogFactory.getLog(ShapefileGeometryReader.class);
// Shape-type markers as they are stored in the file. nextKeyValue() decodes a marker in two steps:
// (marker % 10) selects the kind of geometry and (marker / 10) tells which values accompany x and y
// (0: none, 1: z and m, 2: m only).
/**Marker for empty shapes in Shapefile*/
public static final int NullShape = 0;
/**Marker for point shapes in Shapefile with x and y coordinates*/
public static final int PointShape = 1;
/**Marker for multi-point shapes in Shapefile*/
public static final int MultiPointShape = 8;
/**Marker for polyline (linestring) shapes in Shapefile*/
public static final int PolylineShape = 3;
/**Marker for polygon shapes in Shapefile*/
public static final int PolygonShape = 5;
/**Marker for point shapes in Shapefile with x, y, and m attributes*/
public static final int PointMShape = 21;
/**Marker for multi-point shapes in Shapefile with x, y, and m attributes*/
public static final int MultiPointMShape = 28;
/**Marker for polyline (linestring) shapes in Shapefile with x, y, and m attributes*/
public static final int PolylineMShape = 23;
/**Marker for polygon shapes in Shapefile with x, y, and m attributes*/
public static final int PolygonMShape = 25;
/**Marker for point shapes in Shapefile with x, y, z, and m attributes*/
public static final int PointMZShape = 11;
/**Marker for multi-point shapes in Shapefile with x, y, z, and m attributes*/
public static final int MultiPointMZShape = 18;
/**Marker for polyline (linestring) shapes in Shapefile with x, y, z, and m attributes*/
public static final int PolylineMZShape = 13;
/**Marker for polygon shapes in Shapefile with x, y, z, and m attributes*/
public static final int PolygonMZShape = 15;
/**Marker for multipatch shapes in Shapefile with x, y, z, and m attributes*/
public static final int MultiPatchMZShape = 31;
/**The name of the file being read, as returned by {@code Path.getName()}; it appears in warnings and error messages*/
protected String filename;
/**
 * Holds the fixed-size 100-byte header of Shapefiles (nine 32-bit integers followed by eight 64-bit doubles).
 * The signature, the five unused integers, and the file length go through the plain {@link DataOutput} and
 * {@link DataInput} integer methods, while the version, the shape type, and all bounds go through the
 * little-endian helpers of {@link IOUtil}.
 */
public static class Header implements WritableExternalizable {
  /**The signature of the Shapefiles*/
  static final int Signature = 9994;
  /**Length of the file in 16-bit words (i.e., size in bytes / 2)*/
  int fileLength;
  /**As of now, the version should be always 1000*/
  int version;
  /**Type of shapes stored in the file*/
  int shapeType;
  /**The minimum bounding rectangle (MBR) of the file*/
  double xmin, ymin, xmax, ymax;
  /**Bounds on the third z dimension. Value is zero when not used.*/
  double zmin, zmax;
  /**Bounds on the measure value. Zero if not used.*/
  double mmin, mmax;

  /**
   * Writes the 100-byte header to the given output.
   * @param out the output to write to
   * @throws IOException if the underlying output fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeInt(Signature); // File signature
    // Five unused integers
    for (int unused = 0; unused < 5; unused++)
      out.writeInt(0);
    out.writeInt(fileLength);
    IOUtil.writeIntLittleEndian(out, version);
    IOUtil.writeIntLittleEndian(out, shapeType);
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(xmin));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(ymin));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(xmax));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(ymax));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(zmin));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(zmax));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(mmin));
    IOUtil.writeLongLittleEndian(out, Double.doubleToLongBits(mmax));
  }

  /**
   * Reads the 100-byte header from the given input and stores it in this object.
   * @param in the input positioned at the first byte of the header
   * @throws IOException if the input ends early or cannot be read
   * @throws RuntimeException if the first integer is not the Shapefile signature
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    int code = in.readInt();
    if (code != Signature)
      throw new RuntimeException(String.format("Invalid Shapefile code %d. Expected %d.", code, Signature));
    // Consume the five unused integers. skipBytes is allowed to skip fewer bytes than requested without
    // reporting it, which would misalign every following field; readFully consumes all of them or fails.
    in.readFully(new byte[5 * 4]);
    this.fileLength = in.readInt();
    this.version = IOUtil.readIntLittleEndian(in);
    this.shapeType = IOUtil.readIntLittleEndian(in);
    this.xmin = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.ymin = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.xmax = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.ymax = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.zmin = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.zmax = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.mmin = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
    this.mmax = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in));
  }
}
/**If the input is a ZIP file, this object stores that zip file; it is closed by {@link #close()}*/
protected ZipFile zipFile;
/**The input stream to the shapefile (.shp) contents*/
private DataInputStream in;
/**Header of the file being read. Its shapeType is overwritten while reading when a record uses another shape type.*/
private Header header;
/**An optional MBB that can be used to filter records; {@code null} when no filter is configured*/
private Envelope filterMBR;
/**The MBB of the current record; this is the object returned by {@link #getCurrentKey()}*/
private Envelope mbr;
/**A mutable geometry used to iterate over the input file; this is the object returned by {@link #getCurrentValue()}*/
private IGeometry geometry;
/**The record number (in shapefile) for the current record being read, as stored in the record header*/
private int currentRecordNumber;
/**The length of the record (in shapefile) currently being read, as stored in the record header.
 * The reader multiplies it by two before comparing it with byte positions.*/
private int currentRecordLength;
/**The byte position in the shapefile at which the current record starts, i.e., where its two header integers begin*/
private long offsetOfCurrentRecord;
/**The position in the shapefile, i.e., the number of bytes consumed from the input stream so far*/
protected int pos;
/**The index of the current shape in the file one-based; incremented every time a record is fetched*/
protected int iShape;
/**A temporary little-endian buffer to read and parse a record; replaced by a bigger one when a record does not fit*/
protected ByteBuffer readBuffers;
/**Header size for a polyline. Four 64-bit double for MBR, and two 32-bit int for number of parts and number of points*/
protected static final int PolylineHeaderSize = 8 * 4 + 4 * 2;
/**Header size for a multipoint. Four 64-bit double for MBR, and one 32-bit int for number of points*/
protected static final int MultiPointHeaderSize = 8 * 4 + 4;
/**A flag that indicates that we should create a new geometry for each record; read from the configuration*/
protected boolean immutableObjects;
/**Offsets of all records in bytes as they appear in the .shx file; {@code null} when no index file has been loaded*/
protected int[] recordOffsets;
/**
 * Initializes the reader from an input split by delegating to {@link #initialize(Path, Configuration)}.
 * @param inputSplit the split to read; it is cast to {@link FileSplit} to obtain the path of the file
 * @param taskAttemptContext the context that supplies the configuration
 * @throws IOException if the file cannot be opened or its header cannot be read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path splitPath = fileSplit.getPath();
  Configuration jobConf = taskAttemptContext.getConfiguration();
  initialize(splitPath, jobConf);
}
public void initialize(Path path, Configuration conf) throws IOException {
FileSystem fs = path.getFileSystem(conf);
recordOffsets = null;
this.filename = path.getName();
int i = filename.lastIndexOf('.');
String extension = filename.substring(i).toLowerCase();
if (extension.equals(".shp")) {
// A path to a shapefile, open directly
initialize(fs.open(path), conf);
// Read the .shx file
String shxFileName = this.filename.substring(0, i) + ".shx";
Path shxFilePath = new Path(path.getParent(), shxFileName);
if (fs.exists(shxFilePath)) {
FSDataInputStream shxIn = fs.open(shxFilePath);
try {
readIndexFile(shxIn);
} finally {
shxIn.close();
}
}
} else if (extension.equals(".zip")) {
// Open the first shapefile encountered in the ZIP file
if (fs instanceof LocalFileSystem) {
// The ZIP file is stored locally, open it directly
String fullPath = path.toUri().getPath();
zipFile = new ZipFile(fullPath);
} else {
// The file is stored remotely. We have to copy it locally
File tempZipFile = File.createTempFile(filename, ".shp");
fs.copyToLocalFile(path, new Path(tempZipFile.toString()));
zipFile = new ZipFile(tempZipFile);
}
Enumeration extends ZipEntry> entries = zipFile.entries();
boolean shpFileFound = false;
boolean shxFileFound = false;
while (entries.hasMoreElements() && (!shpFileFound || !shxFileFound)) {
ZipEntry entry = entries.nextElement();
String entryName = entry.getName();
if (entryName.toLowerCase().endsWith(".shp")) {
// Found the shape file
DataInputStream in = new DataInputStream(new BufferedInputStream(zipFile.getInputStream(entry)));
initialize(in, conf);
shpFileFound = true;
} else if (entryName.toLowerCase().endsWith(".shx")) {
// Found the index file
shxFileFound = true;
DataInputStream shxIn = new DataInputStream(new BufferedInputStream(zipFile.getInputStream(entry)));
try {
readIndexFile(shxIn);
} finally {
shxIn.close();
}
}
}
// Could not find any shapefile entries in the zip file
if (!shpFileFound)
throw new RuntimeException("Could not find any .shp files in the file "+path);
// Could not find the index file in the zip file
if (!shxFileFound)
LOG.warn("Could not find any .shx files in the file "+path+". Assuming consecutive records in the .shp file");
} else {
throw new RuntimeException(String.format("Unsupported file extension '%s'", extension));
}
}
/**
 * Loads the whole index (.shx) stream into memory so that records can be located by their offsets.
 * After the 100-byte header, every index entry consists of two integers: the offset of the record and its
 * length. Only the offsets are kept, converted to bytes by doubling them.
 * @param shxIn the stream positioned at the first byte of the index file
 * @throws IOException if the header or an index entry cannot be read
 */
protected void readIndexFile(DataInputStream shxIn) throws IOException {
  Header indexHeader = new Header();
  indexHeader.readFields(shxIn);
  // The file length is in 16-bit words; what follows the 100-byte header is split into 8-byte entries
  final int entryCount = (indexHeader.fileLength * 2 - 100) / 8;
  int[] offsets = new int[entryCount];
  this.recordOffsets = offsets;
  int entry = 0;
  while (entry < entryCount) {
    int recordOffset = shxIn.readInt();
    // The second integer of the entry is the record length; it is read only to advance the stream
    shxIn.readInt();
    offsets[entry] = recordOffset * 2;
    entry++;
  }
}
/**
 * Starts reading a shapefile from an already opened stream: reads the 100-byte file header, configures the
 * optional filter, and fetches the header of the first record.
 * @param in the stream positioned at the first byte of the .shp data
 * @param conf the configuration; {@code SpatialInputFormat.ImmutableObjects} selects whether a new geometry
 *             is created per record and {@code SpatialInputFormat.FilterMBR} holds an optional comma-separated
 *             list of coordinates used as a filter
 * @throws IOException if the header or the first record header cannot be read
 */
protected void initialize(DataInputStream in, Configuration conf) throws IOException {
  this.in = in;
  this.immutableObjects = conf.getBoolean(SpatialInputFormat.ImmutableObjects, false);
  Header previousHeader = header;
  header = new Header();
  header.readFields(this.in);
  // A geometry kept from a previously read file can only be reused if this file stores the same shape type.
  // Otherwise it would be cast to the wrong geometry class while reading, so drop it and let it be recreated.
  if (previousHeader != null && previousHeader.shapeType != header.shapeType)
    geometry = null;
  pos = 100;
  mbr = new Envelope();
  String filterMBRStr = conf.get(SpatialInputFormat.FilterMBR);
  if (filterMBRStr != null) {
    String[] parts = filterMBRStr.split(",");
    double[] coords = new double[parts.length];
    for (int i = 0; i < coords.length; i++)
      coords[i] = Double.parseDouble(parts[i]);
    // Half of the values is passed as the number of dimensions of the filter
    this.filterMBR = new Envelope(coords.length / 2, coords);
  } else {
    this.filterMBR = null;
  }
  iShape = 0;
  readBuffers = ByteBuffer.allocate(PolylineHeaderSize);
  readBuffers.order(ByteOrder.LITTLE_ENDIAN);
  fetchNextRecord();
}
/**
 * Fetches the next record to start reading shapes. The stream is advanced to the beginning of the record
 * (taken from the index if one was loaded) and the two integers of the record header are read.
 * @return {@code true} if a record was fetched. {@code false} if EOF is reached.
 * @throws IOException if the stream ends before the record header is read or cannot be read
 */
private boolean fetchNextRecord() throws IOException {
  // The file length is in 16-bit words; widen before doubling so that a big file does not overflow an int
  if (pos >= 2L * header.fileLength || (recordOffsets != null && iShape >= recordOffsets.length))
    return false;
  offsetOfCurrentRecord = recordOffsets == null? pos : recordOffsets[iShape];
  iShape++;
  // Need to skip some bytes to reach the next record.
  // skipBytes may skip fewer bytes than requested, so repeat until the record is reached.
  long bytesToSkip = offsetOfCurrentRecord - pos;
  while (bytesToSkip > 0) {
    int skipped = in.skipBytes((int) bytesToSkip);
    if (skipped <= 0) {
      // Nothing was skipped; consume one byte, which throws an EOFException if the stream has ended
      in.readByte();
      skipped = 1;
    }
    bytesToSkip -= skipped;
    pos += skipped;
  }
  currentRecordNumber = in.readInt(); pos += 4;
  currentRecordLength = in.readInt(); pos += 4;
  return true;
}
/**
 * Advances to the next shape that passes the filter (every shape if no filter is configured). On success the
 * shape is available through {@link #getCurrentValue()} and its MBR through {@link #getCurrentKey()}.
 * Empty (null) shapes are returned only when no filter is configured.
 * @return {@code true} if a shape was read, {@code false} if the end of the file was reached
 * @throws IOException if the input cannot be read
 * @throws RuntimeException if a record has a shape type that this reader does not support
 */
@Override
public boolean nextKeyValue() throws IOException {
  while (true) {
    // Check if the current record has ended. A record spans its 8-byte record header followed by
    // currentRecordLength 16-bit words; leaving out the header would skip a first record of 8 bytes or less.
    if (pos >= offsetOfCurrentRecord + 8 + currentRecordLength * 2) {
      if (!fetchNextRecord())
        return false;
    }
    // Fetch next shape from the current record
    int shapeType = IOUtil.readIntLittleEndian(in); pos += 4;
    GeometryType geometryType = baseGeometryType(shapeType);
    if (geometryType == GeometryType.EMPTY) {
      // This indicates a feature without a geometry in a shapefile.
      // Empty geometries are returned only when no filtering is associated
      if (filterMBR != null)
        continue;
      // Never empty an object that was already handed to the caller as immutable, and make sure there is an
      // object to empty when the file starts with an empty shape. The object has the type announced by the
      // header so that the following records can reuse it.
      // NOTE(review): if the header itself announces the null shape type, this relies on
      // GeometryType.EMPTY.createInstance() returning a usable geometry - confirm.
      if (geometry == null || immutableObjects)
        geometry = baseGeometryType(header.shapeType).createInstance();
      geometry.setEmpty();
      return true;
    }
    if (shapeType != header.shapeType)
      LOG.warn(String.format("Unexpected change in shape type in file '%s'", filename));
    if (geometry == null || shapeType != header.shapeType || immutableObjects) {
      geometry = geometryType.createInstance();
      header.shapeType = shapeType;
    }
    boolean hasZValues = shapeType / 10 == 1;
    boolean hasMValues = hasZValues || shapeType / 10 == 2;
    boolean passesFilter;
    switch (geometryType) {
      case POINT:
        passesFilter = readPoint(shapeType, hasZValues, hasMValues);
        break;
      case MULTIPOINT:
        passesFilter = readMultiPoint(hasZValues, hasMValues);
        break;
      case MULTILINESTRING:
      case MULTIPOLYGON:
        passesFilter = readMultiPartShape(hasZValues, hasMValues);
        break;
      default:
        throw new RuntimeException(String.format("Unsupported shape type '%s' in file '%s'", shapeType, filename));
    }
    if (passesFilter)
      return true;
    // The shape was consumed but does not match the filter; continue with the next record
  }
}

/**
 * Maps a shape type marker to the kind of geometry it stores, ignoring whether z or m values are present.
 * Markers above 29 (e.g., multipatch = 31) are rejected; looking only at the last digit would mistake them
 * for one of the supported kinds.
 */
private GeometryType baseGeometryType(int shapeType) {
  if (shapeType >= 0 && shapeType / 10 <= 2) {
    switch (shapeType % 10) {
      case NullShape: return GeometryType.EMPTY;
      case PointShape: return GeometryType.POINT;
      case PolylineShape: return GeometryType.MULTILINESTRING;
      case PolygonShape: return GeometryType.MULTIPOLYGON;
      case MultiPointShape: return GeometryType.MULTIPOINT;
      default: break;
    }
  }
  throw new RuntimeException(String.format("Unsupported shape type '%s' in file '%s'", shapeType, filename));
}

/**Makes sure that the first numBytes bytes of the read buffer can be accessed, growing the buffer if needed.*/
private void ensureBufferHolds(int numBytes) {
  if (readBuffers.capacity() < numBytes) {
    readBuffers = ByteBuffer.allocate(numBytes);
    readBuffers.order(ByteOrder.LITTLE_ENDIAN);
  } else {
    readBuffers.limit(numBytes); // Ensures that we do not parse beyond the limit
  }
}

/**Reads a point record into the current geometry and tells whether it passes the filter.*/
private boolean readPoint(int shapeType, boolean hasZValues, boolean hasMValues) throws IOException {
  Point point = (Point) geometry;
  if (hasMValues && hasZValues)
    point.setCoordinateDimension(4);
  else if (hasMValues)
    point.setCoordinateDimension(3);
  else
    point.setCoordinateDimension(2);
  point.coords[0] = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in)); pos += 8;
  point.coords[1] = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in)); pos += 8;
  if (shapeType == PointMShape) {
    // A point with a measure has three dimensions; the measure is the third one as in the multipoint case
    point.coords[2] = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in)); pos += 8;
  } else if (shapeType == PointMZShape) {
    point.coords[2] = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in)); pos += 8;
    point.coords[3] = Double.longBitsToDouble(IOUtil.readLongLittleEndian(in)); pos += 8;
  }
  // The MBR of a point is the point itself; set it once all the coordinates of this record are in place
  mbr.set(point.coords, point.coords);
  assert offsetOfCurrentRecord + currentRecordLength * 2 + 8 == pos;
  return filterMBR == null || filterMBR.intersects(point);
}

/**Reads a multipoint record and, if it passes the filter, stores its points in the current geometry.*/
private boolean readMultiPoint(boolean hasZValues, boolean hasMValues) throws IOException {
  MultiPoint multiPoint = (MultiPoint) geometry;
  int numDimensions = 2;
  if (hasMValues)
    numDimensions++;
  if (hasZValues)
    numDimensions++;
  // Read header size (fixed size regardless of the shape size)
  in.readFully(readBuffers.array(), 0, MultiPointHeaderSize);
  readBuffers.limit(MultiPointHeaderSize); // To ensure we do not parse beyond the limit
  pos += MultiPointHeaderSize;
  // Read MBR
  double xmin = readBuffers.getDouble(0);
  double ymin = readBuffers.getDouble(8);
  double xmax = readBuffers.getDouble(16);
  double ymax = readBuffers.getDouble(24);
  mbr.set(new double[] {xmin, ymin}, new double[] {xmax, ymax});
  int numPoints = readBuffers.getInt(32);
  // Calculate geometry size in bytes to read it fully
  // Coordinate data + bound data for M and Z
  int geometrySizeInBytes = 8 * numPoints * numDimensions + 2 * 8 * (numDimensions - 2);
  // Verify the size is similar to the record header (geometry type: int + Header + points)
  assert 4 + MultiPointHeaderSize + geometrySizeInBytes == currentRecordLength * 2;
  // Adjust the size of the readBuffer to ensure that we parse the input correctly
  ensureBufferHolds(geometrySizeInBytes);
  in.readFully(readBuffers.array(), 0, geometrySizeInBytes);
  pos += geometrySizeInBytes;
  assert offsetOfCurrentRecord + currentRecordLength * 2 + 8 == pos;
  if (filterMBR != null && !filterMBR.intersects(mbr))
    return false;
  // The x, y pairs come first. Each additional block (z, then m) starts with a 16-byte range (min and max)
  // that precedes its per-point values, which is the same layout used for polylines and polygons.
  int firstZValuePosition = 2 * 8 * numPoints + 8 * 2;
  int firstMValuePosition = firstZValuePosition + (hasZValues? 8 * numPoints + 8 * 2 : 0);
  // The geometry may hold the points of the previous record
  multiPoint.setEmpty();
  double[] coords = new double[numDimensions];
  for (int iPoint = 0; iPoint < numPoints; iPoint++) {
    coords[0] = readBuffers.getDouble(8 * 2 * iPoint);
    coords[1] = readBuffers.getDouble(8 * 2 * iPoint + 8);
    if (hasZValues && hasMValues) {
      coords[2] = readBuffers.getDouble(firstZValuePosition + 8 * iPoint);
      coords[3] = readBuffers.getDouble(firstMValuePosition + 8 * iPoint);
    } else if (hasMValues)
      coords[2] = readBuffers.getDouble(firstMValuePosition + 8 * iPoint);
    multiPoint.addPoint(coords);
  }
  return true;
}

/**Reads a polyline or polygon record and, if it passes the filter, stores its parts in the current geometry.*/
private boolean readMultiPartShape(boolean hasZValues, boolean hasMValues) throws IOException {
  // Read header size (fixed size regardless of the shape size)
  in.readFully(readBuffers.array(), 0, PolylineHeaderSize);
  readBuffers.limit(PolylineHeaderSize); // To ensure we do not parse beyond the limit
  pos += PolylineHeaderSize;
  double xmin = readBuffers.getDouble(0);
  double ymin = readBuffers.getDouble(8);
  double xmax = readBuffers.getDouble(16);
  double ymax = readBuffers.getDouble(24);
  mbr.set(new double[] {xmin, ymin}, new double[] {xmax, ymax});
  int numParts = readBuffers.getInt(32);
  int numPoints = readBuffers.getInt(32 + 4);
  // x, y coordinates
  int geometrySizeInBytes = 4 * numParts + 2 * 8 * numPoints;
  // Add measured values
  if (hasMValues)
    geometrySizeInBytes += 2 * 8 + numPoints * 8;
  if (hasZValues)
    geometrySizeInBytes += 2 * 8 + numPoints * 8;
  // Verify the size is similar to the record header (geometry type: int + Header + points)
  assert 4 + PolylineHeaderSize + geometrySizeInBytes == currentRecordLength * 2 :
      String.format("Incorrect size of record #%d. Expected size %d actual size in file is %d",
          iShape, 4 + PolylineHeaderSize + geometrySizeInBytes, currentRecordLength * 2);
  int firstPointPosition = numParts * 4;
  int firstZValuePosition = firstPointPosition + 2 * 8 * numPoints + 8 * 2;
  int firstMValuePosition = firstZValuePosition + (hasZValues? 8 * numPoints + 8 * 2 : 0);
  // Adjust the size of the readBuffer to ensure that we parse the input correctly
  ensureBufferHolds(geometrySizeInBytes);
  in.readFully(readBuffers.array(), 0, geometrySizeInBytes);
  pos += geometrySizeInBytes;
  assert offsetOfCurrentRecord + currentRecordLength * 2 + 8 == pos;
  LineString2D linestring = (LineString2D) geometry;
  if (filterMBR != null && !filterMBR.intersects(mbr))
    return false;
  // The shape matches the filterMBR
  linestring.setEmpty();
  int iPart = 0;
  // The part table stores the index of the first point of each part; a part ends where the next one starts
  int lastPointInCurrentPart = (iPart + 1) < numParts ? readBuffers.getInt(iPart * 4 + 4) : numPoints;
  int iPoint = 0;
  boolean firstPointInPart = true;
  while (iPoint < numPoints) {
    if (linestring.getType() == GeometryType.MULTIPOLYGON && firstPointInPart) {
      firstPointInPart = false;
      // Only for polygons, check if this ring is in CW order. If so, it indicates an outer shell which
      // is modeled as a new polygon
      // See https://stackoverflow.com/questions/1165647/how-to-determine-if-a-list-of-polygon-points-are-in-clockwise-order
      double sum = 0.0;
      int iRingPoint = iPoint;
      double x1 = readBuffers.getDouble(firstPointPosition + 8 * 2 * iRingPoint);
      double y1 = readBuffers.getDouble(firstPointPosition + 8 * 2 * iRingPoint + 8);
      while (++iRingPoint < lastPointInCurrentPart) {
        double x2 = readBuffers.getDouble(firstPointPosition + 8 * 2 * iRingPoint);
        double y2 = readBuffers.getDouble(firstPointPosition + 8 * 2 * iRingPoint + 8);
        sum += (x2 - x1) * (y2 + y1);
        x1 = x2;
        y1 = y2;
      }
      boolean cwOrder = sum > 0;
      boolean outerRing = cwOrder;
      if (outerRing)
        ((MultiPolygon2D) linestring).endCurrentPolygon();
    }
    double x = readBuffers.getDouble(firstPointPosition + 8 * 2 * iPoint);
    double y = readBuffers.getDouble(firstPointPosition + 8 * 2 * iPoint + 8);
    double m = hasMValues ? readBuffers.getDouble(firstMValuePosition + 8 * iPoint) : 0.0;
    double z = hasZValues ? readBuffers.getDouble(firstZValuePosition + 8 * iPoint) : 0.0;
    if (hasZValues && hasMValues)
      linestring.addPointXYZM(x, y, z, m);
    else if (hasMValues)
      linestring.addPointXYM(x, y, m);
    else
      linestring.addPoint(x, y);
    iPoint++;
    if (iPoint == lastPointInCurrentPart) {
      switch (linestring.getType()) {
        case MULTILINESTRING:
          ((MultiLineString2D) linestring).endCurrentLineString();
          break;
        case MULTIPOLYGON:
          ((MultiPolygon2D) linestring).closeLastRingNoCheck();
          break;
        default:
          throw new RuntimeException(String.format("Unsupported geometry type '%s' in file '%s'", linestring.getType(), filename));
      }
      iPart++;
      firstPointInPart = true;
      lastPointInCurrentPart = (iPart + 1) < numParts ? readBuffers.getInt(iPart * 4 + 4) : numPoints;
    }
  } // While iPoint < numPoints
  return true;
}
/**
 * Returns the key of the current record, which is the envelope that the reader fills with the MBR of the
 * shapes it reads.
 * @return the envelope held by this reader
 */
@Override
public Envelope getCurrentKey() {
  return this.mbr;
}
/**
 * Returns the value of the current record, which is the geometry that the reader fills while reading.
 * @return the geometry held by this reader
 */
@Override
public IGeometry getCurrentValue() {
  return this.geometry;
}
/**
 * Reports how much of the shapefile has been consumed, as the ratio of the current byte position to the size
 * announced by the file header.
 * @return a value between 0 and 1; 0 if no file has been opened yet or the header announces no data
 */
@Override
public float getProgress() {
  // Without a header (not initialized yet) or with a non-positive length there is nothing to divide by
  if (header == null || header.fileLength <= 0)
    return 0.0f;
  // The file length is in 16-bit words while the position is in bytes, hence the division by two
  float progress = (float)pos / header.fileLength / 2.0f;
  // Keep the value in the valid range even if the header understates the size of the file
  return Math.min(1.0f, Math.max(0.0f, progress));
}
/**
 * Closes the input stream and, if the input was a ZIP file, the ZIP file as well. Calling this method again
 * has no further effect.
 * @throws IOException if closing the stream or the ZIP file fails
 */
@Override
public void close() throws IOException {
  try {
    if (in != null) {
      // Clear the field first so that a failing close is not attempted again
      DataInputStream stream = in;
      in = null;
      stream.close();
    }
  } finally {
    // Release the ZIP file even if closing the stream failed
    if (zipFile != null) {
      ZipFile zip = zipFile;
      zipFile = null;
      zip.close();
    }
  }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy