// com.jmatio.io.MatFileReader (Maven / Gradle / Ivy)
//
// Go to download
/*
 * Code licensed under new-style BSD (see LICENSE).
 * All code up to tags/original: Copyright (c) 2006, Wojciech Gradkowski
 * All code after tags/original: Copyright (c) 2015, DiffPlug
 */
package com.jmatio.io;

import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.lang.ref.WeakReference;
import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.InflaterInputStream;

import com.jmatio.common.MatDataTypes;
import com.jmatio.io.MatFileWriter.ByteArrayOutputStream2;
import com.jmatio.io.stream.ByteBufferInputStream;
import com.jmatio.io.stream.HeapBufferDataOutputStream;
import com.jmatio.io.stream.MatFileInputStream;
import com.jmatio.types.ByteStorageSupport;
import com.jmatio.types.MLArray;
import com.jmatio.types.MLCell;
import com.jmatio.types.MLChar;
import com.jmatio.types.MLDouble;
import com.jmatio.types.MLEmptyArray;
import com.jmatio.types.MLInt16;
import com.jmatio.types.MLInt32;
import com.jmatio.types.MLInt64;
import com.jmatio.types.MLInt8;
import com.jmatio.types.MLJavaObject;
import com.jmatio.types.MLNumericArray;
import com.jmatio.types.MLObject;
import com.jmatio.types.MLSingle;
import com.jmatio.types.MLSparse;
import com.jmatio.types.MLStructure;
import com.jmatio.types.MLUInt16;
import com.jmatio.types.MLUInt32;
import com.jmatio.types.MLUInt64;
import com.jmatio.types.MLUInt8;

/**
 * MAT-file reader. Reads a MAT-file into MLArray objects.
 * 
 * Usage:
 * <pre><code>
 * // read in the file
 * MatFileReader mfr = new MatFileReader("mat_file.mat");
 * 
 * // get the array named "my_array" from the file
 * MLArray mlArrayRetrieved = mfr.getMLArray("my_array");
 * 
 * // or get the collection of all arrays that were stored in the file
 * Map content = mfr.getContent();
 * </code></pre>
 * 
 * @see com.jmatio.io.MatFileFilter
 * @author Wojciech Gradkowski ([email protected])
 */
public class MatFileReader {
	/** Allocation policy: the file content is accessed through a read-only memory mapping of its channel. */
	public static final int MEMORY_MAPPED_FILE = 1;
	/** Allocation policy: the file content is read into a buffer from {@link ByteBuffer#allocateDirect(int)}. */
	public static final int DIRECT_BYTE_BUFFER = 2;
	/** Allocation policy: the file content is read into a buffer from {@link ByteBuffer#allocate(int)}. */
	public static final int HEAP_BYTE_BUFFER = 4;

	/**
	 * Type of matlab mat file.
	 */
	private final MatFileType matType;
	/**
	 * MAT-file header
	 */
	MatFileHeader matFileHeader;
	/**
	 * Container for the MLArrays that have been read.
	 * Created as a LinkedHashMap, so entries keep their insertion order.
	 */
	Map data;
	/**
	 * Array name filter
	 */
	private MatFileFilter filter;
	/**
	 * Whether or not we have found an MCOS type variable.  Needed to know if further processing is needed.
	 */
	private boolean haveMCOS = false;
	/**
	 * Holds the likely candidate for the MCOS extra data at the end of a MAT file.
	 */
	private MLUInt8 mcosData;

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file
	 * at the location given as fileName.
	 * 
	 * No array-name filter is supplied by the caller; a default-constructed
	 * {@link MatFileFilter} is used and the file type is {@code MatFileType.Regular}.
	 * 
	 * @param fileName the MAT-file path String
	 * @throws FileNotFoundException if the file cannot be opened for reading
	 * @throws IOException when error occurred while processing the file.
	 */
	public MatFileReader(String fileName) throws FileNotFoundException, IOException {
		this(new File(fileName), new MatFileFilter(), MatFileType.Regular);
	}

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file
	 * at the location given as fileName.
	 * 
	 * Results are filtered by the given MatFileFilter. Arrays that do not meet
	 * the filter match condition will not be available in the results.
	 * 
	 * @param fileName the MAT-file path String
	 * @param filter array name filter.
	 * @throws IOException when error occurred while processing the file.
	 */
	public MatFileReader(String fileName, MatFileFilter filter) throws IOException {
		this(new File(fileName), filter, MatFileType.Regular);
	}

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file
	 * from file.
	 * 
	 * No array-name filter is supplied by the caller; a default-constructed
	 * {@link MatFileFilter} is used and the file type is {@code MatFileType.Regular}.
	 * 
	 * @param file the MAT-file
	 * @throws IOException when error occurred while processing the file.
	 */
	public MatFileReader(File file) throws IOException {
		this(file, new MatFileFilter(), MatFileType.Regular);

	}

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file from
	 * file.
	 * 
	 * Results are filtered by the given MatFileFilter. Arrays that do not
	 * meet the filter match condition will not be available in the results.
	 * 
	 * Note: this constructor reads the file using the {@link #MEMORY_MAPPED_FILE}
	 * policy, see the notes to {@link #read(File, MatFileFilter, int)}.
	 * 
	 * @param file
	 *            the MAT-file
	 * @param filter
	 *            array name filter.
	 * @param matType
	 *            the MAT-file type stored on this reader
	 * @throws IOException
	 *             when error occurred while processing the file.
	 */
	public MatFileReader(File file, MatFileFilter filter, MatFileType matType) throws IOException {
		// Initialise the reader state first, then read the file right away.
		this(matType);

		read(file, filter, MEMORY_MAPPED_FILE);
	}

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file from
	 * file, filtering the results with the given MatFileFilter.
	 * 
	 * Delegates to {@link #MatFileReader(File, MatFileFilter, MatFileType)}
	 * with {@code MatFileType.Regular}.
	 * 
	 * @param file
	 *            the MAT-file
	 * @param filter
	 *            array name filter.
	 * @throws IOException
	 *             when error occurred while processing the file.
	 */
	public MatFileReader(File file, MatFileFilter filter) throws IOException {
		this(file, filter, MatFileType.Regular);
	}

	/**
	 * Creates a MatFileReader without reading anything.
	 * 
	 * Stores the given MAT-file type, installs a default-constructed
	 * {@link MatFileFilter} and an empty LinkedHashMap as the result container.
	 * 
	 * @param matType
	 *            the MAT-file type stored on this reader
	 */
	public MatFileReader(MatFileType matType) {
		this.matType = matType;
		filter = new MatFileFilter();
		data = new LinkedHashMap();
	}

	/**
	 * Creates a MatFileReader without reading anything, using
	 * {@code MatFileType.Regular} as the MAT-file type.
	 */
	public MatFileReader() {
		this(MatFileType.Regular);
	}

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file from
	 * a stream.
	 * 
	 * No array-name filter is supplied by the caller; a default-constructed
	 * {@link MatFileFilter} is used.
	 * 
	 * @param stream
	 *            the MAT-file stream
	 * @param type
	 *            the MAT-file type stored on this reader
	 * @throws IOException
	 *             when error occurred while processing the file.
	 */
	public MatFileReader(InputStream stream, MatFileType type) throws IOException {
		this(stream, new MatFileFilter(), type);
	}

	/**
	 * Creates an instance of MatFileReader and reads the MAT-file from
	 * a stream.
	 * 
	 * Results are filtered by the given MatFileFilter. Arrays that do not
	 * meet the filter match condition will not be available in the results.
	 * 
	 * Note: the stream is read through {@link #read(InputStream, MatFileFilter)},
	 * which copies the whole stream into memory before parsing it.
	 * 
	 * @param stream
	 *            the MAT-file stream
	 * @param filter
	 *            array name filter.
	 * @param type
	 *            the MAT-file type stored on this reader
	 * @throws IOException
	 *             when error occurred while processing the file.
	 */
	public MatFileReader(InputStream stream, MatFileFilter filter, MatFileType type) throws IOException {
		// Initialise the reader state first, then read the stream right away.
		this(type);

		read(stream, filter);
	}

	/**
	 * Reads the content of a MAT-file and returns the mapped content.
	 * 
	 * This method calls
	 * {@code read(file, new MatFileFilter(), MEMORY_MAPPED_FILE)}.
	 * 
	 * @param file
	 *            a valid MAT-file file to be read
	 * @return the same as {@link #getContent()}
	 * @throws IOException
	 *             if error occurs during file processing
	 */
	public synchronized Map read(File file) throws IOException {
		return read(file, new MatFileFilter(), MEMORY_MAPPED_FILE);
	}

	/**
	 * Reads the content of a MAT-file stream and returns the mapped content.
	 * 
	 * This method calls {@code read(stream, new MatFileFilter())}.
	 * 
	 * @param stream
	 *            a valid MAT-file stream to be read
	 * @return the same as {@link #getContent()}
	 * @throws IOException
	 *             if error occurs during file processing
	 */
	public synchronized Map read(InputStream stream) throws IOException {
		return read(stream, new MatFileFilter());
	}

	/**
	 * Reads the content of a MAT-file and returns the mapped content.
	 * 
	 * This method calls
	 * {@code read(file, new MatFileFilter(), policy)}.
	 * 
	 * @param file
	 *            a valid MAT-file file to be read
	 * @param policy
	 *            the file memory allocation policy: one of
	 *            {@link #MEMORY_MAPPED_FILE}, {@link #DIRECT_BYTE_BUFFER} or
	 *            {@link #HEAP_BYTE_BUFFER}
	 * @return the same as {@link #getContent()}
	 * @throws IOException
	 *             if error occurs during file processing
	 */
	public synchronized Map read(File file, int policy) throws IOException {
		return read(file, new MatFileFilter(), policy);
	}

	/**
	 * Reads the content of a MAT-file and returns the mapped content.
	 * 
	 * Because of java bug #4724038, which disables releasing the memory mapped
	 * resource, additional allocation modes are available:
	 * <ul>
	 * <li>{@link #MEMORY_MAPPED_FILE} - a memory mapped file</li>
	 * <li>{@link #DIRECT_BYTE_BUFFER} - uses the
	 * {@link ByteBuffer#allocateDirect(int)} method to read in
	 * the file contents</li>
	 * <li>{@link #HEAP_BYTE_BUFFER} - uses the
	 * {@link ByteBuffer#allocate(int)} method to read in the
	 * file contents</li>
	 * </ul>
	 * 
	 * Note: the memory mapped file policy will try to invoke nasty code to
	 * release its resources.
	 * 
	 * @param file
	 *            a valid MAT-file file to be read
	 * @param filter
	 *            the array filter applied during reading
	 * @param policy
	 *            the file memory allocation policy
	 * @return the same as {@link #getContent()}
	 * @see MatFileFilter
	 * @throws IOException
	 *             if error occurs during file processing
	 */
	public synchronized Map read(File file, MatFileFilter filter, int policy) throws IOException {
		RandomAccessFile raFile = new RandomAccessFile(file, "r");
		try {
			return read(raFile, filter, policy);
		} finally {
			// The delegate closes the file on its normal paths; closing again has no
			// effect then, but guarantees the handle is released if the delegate
			// fails before it reaches its own cleanup.
			raFile.close();
		}
	}

	/**
	 * Size in bytes (32 MiB) of the temporary direct buffer used to read a file
	 * block by block under the {@link #HEAP_BYTE_BUFFER} policy.
	 */
	private static final int DIRECT_BUFFER_LIMIT = 1 << 25;

	/**
	 * Reads the content of a MAT-file from an already opened random access file
	 * and returns the mapped content.
	 * 
	 * Results of a previous read are discarded first. The channel obtained from
	 * raFile and raFile itself are closed before this method returns, whether
	 * or not parsing succeeded. For the {@link #MEMORY_MAPPED_FILE} policy the
	 * mapping is released explicitly; if that fails, garbage collection is
	 * requested for at most one second.
	 * 
	 * @param raFile
	 *            the opened MAT-file; closed by this method
	 * @param filter
	 *            the array filter applied during reading
	 * @param policy
	 *            the file memory allocation policy: one of
	 *            {@link #MEMORY_MAPPED_FILE}, {@link #DIRECT_BYTE_BUFFER} or
	 *            {@link #HEAP_BYTE_BUFFER}
	 * @return the same as {@link #getContent()}
	 * @throws IOException
	 *             if error occurs during file processing, or if the file is
	 *             larger than Integer.MAX_VALUE bytes
	 * @throws IllegalArgumentException
	 *             if policy is not one of the known allocation policies
	 */
	public synchronized Map read(RandomAccessFile raFile, MatFileFilter filter, int policy) throws IOException {
		this.filter = filter;

		// Clear the previous results in one go. Removing the keys one by one while
		// iterating over keySet() throws ConcurrentModificationException as soon as
		// the map holds more than one entry.
		data.clear();

		FileChannel roChannel = null;
		ByteBuffer buf = null;
		WeakReference<MappedByteBuffer> bufferWeakRef = null;
		try {
			roChannel = raFile.getChannel();

			// Every policy addresses the content through an int-sized buffer. Fail
			// loudly instead of letting an (int) cast wrap around or truncate.
			final long channelSize = roChannel.size();
			if (channelSize > Integer.MAX_VALUE) {
				throw new IOException("MAT-file is too large to be read: " + channelSize + " bytes");
			}
			final int filesize = (int) channelSize;

			// Java bug #4715154 prevents re-opening a memory mapped file for writing
			// or deleting it while it is mapped, hence the alternative policies.
			switch (policy) {
			case DIRECT_BYTE_BUFFER:
				buf = ByteBuffer.allocateDirect(filesize);
				roChannel.read(buf, 0);
				buf.rewind();
				break;
			case HEAP_BYTE_BUFFER:
				System.gc();
				buf = ByteBuffer.allocate(filesize);

				// The following two methods couldn't be used (at least under MS Windows)
				// since they are implemented in a suboptimal way. Each of them
				// allocates its own _direct_ buffer of exactly the same size,
				// the buffer passed as parameter has, reads data into it and
				// only afterwards moves data into the buffer passed as parameter.
				// roChannel.read(buf, 0);        // ends up in outOfMemory
				// raFile.readFully(buf.array()); // ends up in outOfMemory
				int numberOfBlocks = filesize / DIRECT_BUFFER_LIMIT + ((filesize % DIRECT_BUFFER_LIMIT) > 0 ? 1 : 0);
				if (numberOfBlocks > 1) {
					// Large file: copy it block by block through one bounded direct buffer.
					ByteBuffer tempByteBuffer = ByteBuffer.allocateDirect(DIRECT_BUFFER_LIMIT);
					for (long block = 0; block < numberOfBlocks; block++) {
						tempByteBuffer.clear();
						roChannel.read(tempByteBuffer, block * DIRECT_BUFFER_LIMIT);
						tempByteBuffer.flip();
						buf.put(tempByteBuffer);
					}
				} else {
					roChannel.read(buf, 0);
				}

				buf.rewind();
				break;
			case MEMORY_MAPPED_FILE:
				MappedByteBuffer mapped = roChannel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
				buf = mapped;
				// Only weakly referenced so the cleanup below can observe its collection.
				bufferWeakRef = new WeakReference<MappedByteBuffer>(mapped);
				break;
			default:
				throw new IllegalArgumentException("Unknown file allocation policy");
			}
			// Do the actual work.
			parseData(buf);

			return getContent();
		} finally {
			if (roChannel != null) {
				roChannel.close();
			}
			if (raFile != null) {
				raFile.close();
			}
			if (buf != null && bufferWeakRef != null && policy == MEMORY_MAPPED_FILE) {
				try {
					clean(buf);
				} catch (Exception e) {
					// Explicit unmapping failed: drop our reference and nudge the GC
					// for a bounded amount of time instead.
					int GC_TIMEOUT_MS = 1000;
					buf = null;
					long start = System.currentTimeMillis();
					while (bufferWeakRef.get() != null) {
						if (System.currentTimeMillis() - start > GC_TIMEOUT_MS) {
							break; // cannot be unmapped now - hopefully GC will
									// do its job later
						}
						System.gc();
						Thread.yield();
					}
				}
			}
		}
	}

	/**
	 * Parses a complete MAT-file image: the header first, then data elements
	 * until the buffer is exhausted. If an MCOS variable was found while
	 * reading, the MCOS data is parsed afterwards and the array holding the
	 * raw MCOS data is removed from the results.
	 * 
	 * @param buf
	 *            buffer positioned at the start of the MAT-file content
	 * @throws IOException
	 *             if error occurs during processing
	 */
	private void parseData(ByteBuffer buf) throws IOException {
		// MCOS bookkeeping is per parse. Without this reset a reader that is reused
		// after an MCOS file would still see haveMCOS == true (and a null mcosData)
		// for a file that contains no MCOS data at all.
		haveMCOS = false;
		mcosData = null;

		//read in file header
		readHeader(buf);

		while (buf.remaining() > 0) {
			readData(buf);
		}
		if (haveMCOS) {
			parseMCOS(mcosData);
			if (data.get("@") == mcosData) {
				data.remove("@");
			}
			// Drop the first remaining entry that is the very same (identity, not
			// equals) array. Iterating keys as Object keeps this loop valid whether
			// or not the data field carries type arguments; the break right after
			// the removal means the iterator is never advanced again.
			for (Object key : data.keySet()) {
				if (data.get(key) == mcosData) {
					data.remove(key);
					break;
				}
			}
		}
		mcosData = null;
	}

	private void parseMCOS(MLUInt8 mcosData) throws IOException {
		// First, parse back out the mcosData.
		ByteBuffer buffer = mcosData.getRealByteBuffer();
		ByteBufferInputStream dataStream = new ByteBufferInputStream(buffer, buffer.limit());

		Map mcosContent;

		MatFileReader matFile = new MatFileReader(dataStream, MatFileType.ReducedHeader);
		mcosContent = matFile.getContent();
		MLCell mcosInfo = (MLCell) ((MLStructure) mcosContent.get("@0")).getField("MCOS");
		ByteBuffer mcosDataBuf = ((MLUInt8) mcosInfo.get(0)).getRealByteBuffer();
		// This bytebuffer needs to be read in the byte order of the MAT file order.  Thus fix.
		mcosDataBuf.order(matFile.getMatFileHeader().getByteOrder());

		// Parse out the data buffer.  First get version information.  Should always equal 2.
		int version = mcosDataBuf.getInt();
		if (version != 2) {
			throw new IllegalStateException("MAT file's MCOS data has a different version(?).  Got: " + version + ", wanted 2.");
		}

		// Get the string count + define the string array.
		int strCount = mcosDataBuf.getInt();
		String[] strs = new String[strCount];

		// Get the segment indexes.
		int segmentIndexes[] = new int[6];
		for (int i = 0; i < segmentIndexes.length; ++i) {
			segmentIndexes[i] = mcosDataBuf.getInt();
		}

		// There should now be 8 0 bytes.  Make sure this is true to avoid object format changes.
		if (mcosDataBuf.getLong() != 0) {
			throw new IllegalStateException("MAT file's MCOS data has different byte values for unknown fields!  Aborting!");
		}

		// Finally, read in each string.  Java doesn't provide an easy way to do this in bulk, so just use a stupid formula for now.
		for (int i = 0; i < strCount; ++i) {
			StringBuilder sb = new StringBuilder();
			for (char next = (char) mcosDataBuf.get(); next != '\0'; next = (char) mcosDataBuf.get()) {
				sb.append(next);
			}
			strs[i] = sb.toString();
		}

		// Sanity check, next 8 byte aligned position in the buffer should equal the start of the first segment!
		if (((mcosDataBuf.position() + 0x07) & ~0x07) != segmentIndexes[0]) {
			throw new IllegalStateException("Data from the strings section was not all read!");
		}

		// First segment, class information.  Really just need the class names.
		List classNamesList = new ArrayList();
		mcosDataBuf.position(segmentIndexes[0]);
		// There are 16 unknown bytes.  Ensure they are 0.
		if (mcosDataBuf.getLong() != 0 || mcosDataBuf.getLong() != 0) {
			throw new IllegalStateException("MAT file's MCOS data has different byte values for unknown fields!  Aborting!");
		}
		while (mcosDataBuf.position() < segmentIndexes[1]) {
			mcosDataBuf.getInt(); // packageNameIndex - Unused for now.
			int classNameIndex = mcosDataBuf.getInt();
			String className = strs[classNameIndex - 1];
			classNamesList.add(className);
			if (mcosDataBuf.getLong() != 0) {
				throw new IllegalStateException("MAT file's MCOS data has different byte values for unknown fields!  Aborting!");
			}
		}

		// Sanity check, position in the buffer should equal the start of the second segment!
		if (mcosDataBuf.position() != segmentIndexes[1]) {
			throw new IllegalStateException("Data from the class section was not all read!");
		}

		// @todo: Second segment, Object properties containing other properties.  Not used yet, thus ignored.
		mcosDataBuf.position(segmentIndexes[2]);

		// Third segment.  Contains all the useful per-object information.
		Map objectInfoList = new HashMap();
		// There are 24 unknown bytes.  Ensure they are 0.
		if (mcosDataBuf.getLong() != 0 || mcosDataBuf.getLong() != 0 || mcosDataBuf.getLong() != 0) {
			throw new IllegalStateException("MAT file's MCOS data has different byte values for unknown fields!  Aborting!");
		}
		int objectCount = 1;
		while (mcosDataBuf.position() < segmentIndexes[3]) {
			// First fetch the data.
			int classIndex = mcosDataBuf.getInt();
			if (mcosDataBuf.getLong() != 0) {
				throw new IllegalStateException("MAT file's MCOS data has different byte values for unknown fields!  Aborting!");
			}
			int segment2Index = mcosDataBuf.getInt();
			int segment4Index = mcosDataBuf.getInt();
			mcosDataBuf.getInt(); // This value is random.  But we need to move the buffer forward, so read it without a check.
			int objectId = objectCount++; // It would appear that the "objectId" is in fact some other MATLAB value.  Thus ignore,
			// and use the index into this segment as the id instead.

			// Then parse it into the form needed for the object.
			objectInfoList.put(objectId - 1, new MatMCOSObjectInformation(classNamesList.get(classIndex - 1), classIndex, objectId, segment2Index, segment4Index));
		}

		// Sanity check, position in the buffer should equal the start of the fourth segment!
		if (mcosDataBuf.position() != segmentIndexes[3]) {
			throw new IllegalStateException("Data from the object section was not all read!  At: " + mcosDataBuf.position() + ", wanted: " + segmentIndexes[3]);
		}

		// Fourth segment.  Contains the regular properties for objects.
		// There are 8 unknown bytes.  Ensure they are 0.
		if (mcosDataBuf.getLong() != 0) {
			throw new IllegalStateException("MAT file's MCOS data has different byte values for unknown fields!  Aborting!");
		}
		List> segment4Properties = new ArrayList>();
		while (mcosDataBuf.position() < segmentIndexes[4]) {
			Map properties = new HashMap();
			int propertiesCount = mcosDataBuf.getInt();
			for (int i = 0; i < propertiesCount; ++i) {
				int nameIndex = mcosDataBuf.getInt();
				int flag = mcosDataBuf.getInt();
				int heapIndex = mcosDataBuf.getInt();

				String propertyName = strs[nameIndex - 1];
				MLArray property;
				switch (flag) {
				case 0:
					property = new MLChar(propertyName, strs[heapIndex - 1]);
					break;
				case 1:
					property = mcosInfo.get(heapIndex + 2);
					break;
				case 2:
					// @todo: Handle a boolean.
					throw new UnsupportedOperationException("Mat file parsing does not yet support booleans!");
				default:
					throw new UnsupportedOperationException("Don't yet support parameter type: " + flag + "!");
				}
				if (property instanceof MLUInt32) {
					int[][] data = ((MLUInt32) property).getArray();
					if (data[0][0] == 0xdd000000 && data[1][0] == 0x02) {
						MLObjectPlaceholder objHolder = new MLObjectPlaceholder(propertyName, "", data);
						property = processMCOS(objHolder, classNamesList, objectInfoList);
					}
				}
				properties.put(propertyName, property);
			}
			segment4Properties.add(properties);
			mcosDataBuf.position((mcosDataBuf.position() + 0x07) & ~0x07);
		}

		// Sanity check, position in the buffer should equal the start of the fifth segment!
		if (mcosDataBuf.position() != segmentIndexes[4]) {
			throw new IllegalStateException("Data from the properties section (2) was not all read!  At: " + mcosDataBuf.position() + ", wanted: " + segmentIndexes[4]);
		}

		// Now merge in the properties from segment 4 into object.
		for (MatMCOSObjectInformation it : objectInfoList.values()) {
			Map objAttributes = it.structure;
			if (it.segment4PropertiesIndex > 0) {
				for (Map.Entry attribute : segment4Properties.get(it.segment4PropertiesIndex - 1).entrySet()) {
					objAttributes.put(attribute.getKey(), attribute.getValue());
				}
			} else {
				throw new IllegalStateException("Properties are not found!  Not sure where to look ...");
			}
		}

		// Finally, merge in attributes from the global grab bag.
		MLCell attribBag = (MLCell) mcosInfo.get(mcosInfo.getSize() - 1); // Get the grab bag.
		for (MatMCOSObjectInformation it : objectInfoList.values()) {
			MLStructure attributes = (MLStructure) attribBag.get(it.classId);
			Collection attributeNames = attributes.getFieldNames();
			Map objAttributes = it.structure;
			for (String attributeName : attributeNames) {
				if (objAttributes.get(attributeName) == null) {
					objAttributes.put(attributeName, attributes.getField(attributeName));
				}
			}
		}

		for (Map.Entry it : data.entrySet()) {
			if (it.getValue() instanceof MLObjectPlaceholder) {
				MLObjectPlaceholder objHolder = (MLObjectPlaceholder) it.getValue();
				it.setValue(processMCOS(objHolder, classNamesList, objectInfoList));
			}
		}
	}

	/**
	 * Builds the MLObject that a placeholder stands for.
	 * 
	 * The object takes its name and dimensions from the placeholder and its
	 * class name from classNamesList. Each element's fields are taken from the
	 * object information registered under the placeholder's object id.
	 * 
	 * @param objHolder
	 *            the placeholder to resolve
	 * @param classNamesList
	 *            class names, looked up by the placeholder's 1-based class id
	 * @param objectInfoList
	 *            per-object information, keyed by object id minus one
	 * @return the populated object
	 * @throws IllegalStateException
	 *             if an element's object information belongs to a class other
	 *             than the placeholder's class
	 */
	private MLObject processMCOS(MLObjectPlaceholder objHolder, List<String> classNamesList, Map<Integer, MatMCOSObjectInformation> objectInfoList) {
		int classId = objHolder.classId;
		MLObject obj = new MLObject(objHolder.name, classNamesList.get(classId - 1), objHolder.getDimensions(), 0);
		for (int i = 0; i < obj.getSize(); ++i) {
			MatMCOSObjectInformation objectInformation = objectInfoList.get(objHolder.objectIds[i] - 1);
			// All elements of one object array must share the array's class.
			if (classId != objectInformation.classId) {
				throw new IllegalStateException("Found an object in array with a different class id! Actual: " + objectInformation.classId + ", expected: " + classId + "!");
			}
			obj.setFields(i, objectInformation.structure);
		}
		return obj;
	}

	/**
	 * Reads a MAT-file from a stream and returns the mapped content.
	 * 
	 * Results of a previous read are discarded first. The stream is copied
	 * into memory completely before any parsing takes place.
	 * 
	 * @param stream
	 *            a valid MAT-file stream to be read
	 * @param filter
	 *            the array filter applied during reading
	 * 
	 * @return the same as {@link #getContent()}
	 * @see MatFileFilter
	 * @throws IOException
	 *             if error occurs during file processing
	 */
	public synchronized Map read(InputStream stream, MatFileFilter filter) throws IOException {
		this.filter = filter;
		data.clear();

		// Buffer the whole stream; the parser works on a ByteBuffer.
		final ByteArrayOutputStream2 buffered = new ByteArrayOutputStream2();
		copy(stream, buffered);

		// Wrap only the bytes actually written, not the whole backing array.
		parseData(ByteBuffer.wrap(buffered.getBuf(), 0, buffered.getCount()));

		return getContent();
	}

	/**
	 * Copies everything that can be read from stream into output, 4 KiB at a
	 * time, until the stream reports its end. Neither stream is closed.
	 * 
	 * @param stream
	 *            the source to drain
	 * @param output
	 *            the destination the bytes are appended to
	 * @throws IOException
	 *             if reading or writing fails
	 */
	private void copy(InputStream stream, ByteArrayOutputStream2 output) throws IOException {
		final byte[] chunk = new byte[1024 * 4];
		for (int count = stream.read(chunk); count != -1; count = stream.read(chunk)) {
			output.write(chunk, 0, count);
		}
	}

	/**
	 * Workaround taken from bug #4724038
	 * to release the memory mapped byte buffer.
	 * 
	 * Little quote from SUN: This is highly inadvisable, to put it mildly.
	 * It is exceedingly dangerous to forcibly unmap a mapped byte buffer that's
	 * visible to Java code. Doing so risks both the security and stability of
	 * the system
	 * 

	 * Since the memory byte buffer used to map the file is not exposed to the
	 * outside world, maybe it's save to use it without being cursed by the SUN.
	 * Since there is no other solution this will do (don't trust voodoo GC
	 * invocation)
	 * 
	 * @param buffer
	 *            the buffer to be unmapped
	 * @throws Exception
	 *             all kind of evil stuff
	 */
	private void clean(final Object buffer) throws Exception {
		AccessController.doPrivileged(new PrivilegedAction