/*
 * de.intarsys.tools.reader.ReaderTools -- source listing from the "isrt"
 * artifact (available via Maven / Gradle / Ivy).
 *
 * The basic runtime tools and interfaces for intarsys components.
 * Note: there is a newer version of this artifact: 4.11
 */
package de.intarsys.tools.reader;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import de.intarsys.tools.collection.Entry;

/**
 * Tool class for common {@link Reader} related tasks.
 * 
 */
public class ReaderTools {

	/** Number of bytes that may be read ahead while probing for a BOM. */
	private static final int BOM_READ_LIMIT = 4;

	/** Maximum number of characters of a meta data line that can be undone. */
	private static final int META_READ_LIMIT = 1024;

	/**
	 * Replacement table for the character following a backslash inside a
	 * quoted string. A key that is mapped to <code>null</code> means "drop the
	 * escaped character".
	 */
	private static final Map<Character, String> escapes = new HashMap<Character, String>();

	static {
		// escape the escape
		escapes.put('\\', "\\"); //$NON-NLS-1$
		// escape the quote
		escapes.put('"', "\""); //$NON-NLS-1$
		// escape to insert whitespace
		escapes.put('n', "\n"); //$NON-NLS-1$
		escapes.put('r', "\r"); //$NON-NLS-1$
		escapes.put('t', "\t"); //$NON-NLS-1$
		// escape to remove whitespace
		escapes.put('\n', null);
		escapes.put('\r', null);
		escapes.put('\t', null);
		escapes.put(' ', null);
	}

	/**
	 * A {@link Reader} that hands out the bytes of an {@link InputStream}
	 * unchanged, one byte per <code>read()</code> call. Only the single
	 * character <code>read()</code> and the mark/reset operations are
	 * functional; this is all the meta data scanner needs.
	 */
	private static final class SingleByteReader extends Reader {

		private final InputStream in;

		SingleByteReader(InputStream in) {
			this.in = in;
		}

		@Override
		public void close() throws IOException {
			// intentionally empty, the wrapped stream is not closed here
		}

		@Override
		public void mark(int readlimit) throws IOException {
			in.mark(readlimit);
		}

		@Override
		public boolean markSupported() {
			return in.markSupported();
		}

		@Override
		public int read() throws IOException {
			return in.read();
		}

		@Override
		public int read(char[] cbuf, int off, int len) throws IOException {
			// not used
			return -1;
		}

		@Override
		public void reset() throws IOException {
			in.reset();
		}
	}

	/**
	 * Append the replacement for the escaped character <code>c</code> (the
	 * character that followed a backslash) to <code>sb</code>.
	 * <ul>
	 * <li>end of input (<code>-1</code>): nothing is appended</li>
	 * <li>known escape with a replacement: the replacement is appended</li>
	 * <li>known escape mapped to <code>null</code>: the character is dropped</li>
	 * <li>unknown escape: the character itself is appended</li>
	 * </ul>
	 * Appending the looked up value unconditionally would insert the literal
	 * text "null" for the last two cases.
	 */
	private static void appendEscaped(StringBuilder sb, int c) {
		if (c == -1) {
			return;
		}
		Character escaped = Character.valueOf((char) c);
		if (escapes.containsKey(escaped)) {
			String replacement = escapes.get(escaped);
			if (replacement != null) {
				sb.append(replacement);
			}
		} else {
			sb.append((char) c);
		}
	}

	/**
	 * Read up to three characters and report if all of them are '$'. Reading
	 * stops at the first character that does not match.
	 */
	private static boolean consumeMetaPrefix(Reader reader) throws IOException {
		return reader.read() == '$' && reader.read() == '$'
				&& reader.read() == '$';
	}

	/**
	 * Try to detect the unicode transformation format (UTF encoding) from the
	 * BOM. If no BOM is detected, null is returned and the input stream is
	 * reset to the position it had on entry. If a BOM is detected, the stream
	 * is positioned immediately behind the BOM.
	 * <p>
	 * The {@link InputStream} must support the mark operation!
	 * <p>
	 * For BOM marker bytes, see http://unicode.org/faq/utf_bom.html
	 * 
	 * <pre>
	 * Bytes 				Encoding Form 
	 * 00 00 FE FF 			UTF-32, big-endian 
	 * FF FE 00 00 			UTF-32, little-endian 
	 * FE FF 				UTF-16, big-endian 
	 * FF FE 				UTF-16, little-endian 
	 * EF BB BF 			UTF-8
	 * </pre>
	 * 
	 * @param is
	 *            The stream to inspect.
	 * @return An {@link InputStreamReader} with the correct encoding or null
	 *         if no BOM was found
	 * @throws IOException
	 *             If reading or resetting the stream fails.
	 */
	public static InputStreamReader createReaderScanBom(InputStream is)
			throws IOException {
		String encoding = null;
		is.mark(BOM_READ_LIMIT);
		// the short-circuit evaluation stops reading at the first mismatch
		switch (is.read()) {
		case 0x00:
			if (is.read() == 0x00 && is.read() == 0xfe && is.read() == 0xff) {
				// UTF-32, big endian
				encoding = "UTF-32BE";
			}
			break;
		case 0xff:
			if (is.read() == 0xfe) {
				// remember the position behind "FF FE" so we can fall back
				// to UTF-16 if this is not the UTF-32 marker
				is.mark(BOM_READ_LIMIT);
				if (is.read() == 0x00 && is.read() == 0x00) {
					// UTF-32, little endian
					encoding = "UTF-32LE";
				} else {
					is.reset();
					// UTF-16, little endian
					encoding = "UTF-16LE";
				}
			}
			break;
		case 0xfe:
			if (is.read() == 0xff) {
				// UTF-16, big endian
				encoding = "UTF-16BE";
			}
			break;
		case 0xef:
			if (is.read() == 0xbb && is.read() == 0xbf) {
				// UTF-8
				encoding = "UTF-8";
			}
			break;
		default:
			break;
		}
		if (encoding == null) {
			is.reset();
			return null;
		}
		return new InputStreamReader(is, encoding);
	}

	/**
	 * Try to detect the input stream encoding from the meta tags "$$$" embedded
	 * in the stream. If no encoding is detected, null is returned. In any case
	 * the input is reset to the position it had on entry, so the meta data
	 * line is still available to the returned reader.
	 * <p>
	 * The {@link InputStream} must support the mark operation!
	 * 
	 * @param is
	 *            The stream to inspect.
	 * @return An {@link InputStreamReader} with the correct encoding or null
	 *         if no encoding meta tag was found
	 * @throws IOException
	 *             If reading or resetting the stream fails.
	 */
	public static InputStreamReader createReaderScanMeta(final InputStream is)
			throws IOException {
		// use single byte encoding and read encoding from meta data
		String encoding = readMetaEncoding(new SingleByteReader(is));
		if (encoding == null) {
			return null;
		}
		return new InputStreamReader(is, encoding);
	}

	/**
	 * Create a {@link TaggedReader} and automatically detect the encoding from
	 * different heuristics. First, the BOM markers are checked, then embedded
	 * meta information is scanned.
	 * <p>
	 * If no encoding can be guessed, either the defaultCharsetName or the
	 * platform encoding is used.
	 * <p>
	 * Meta information tags (lines starting with '$$$') are scanned.
	 * 
	 * @param is
	 *            The stream to read from.
	 * @param defaultCharsetName
	 *            The charset name to use if no encoding is detected; if null
	 *            the platform encoding is used.
	 * @param size
	 *            Passed unchanged to the {@link TaggedReader} constructor.
	 * @return A {@link TaggedReader} with the correct encoding
	 * @throws IOException
	 *             If reading or resetting the stream fails.
	 */
	public static TaggedReader createTaggedReader(InputStream is,
			String defaultCharsetName, int size) throws IOException {
		// try to detect utf byte order marker
		InputStreamReader reader = createReaderScanBom(is);
		if (reader == null) {
			// no BOM, look for an "encoding" meta tag
			reader = createReaderScanMeta(is);
		}
		if (reader == null) {
			// nothing detected, fall back to the defaults
			reader = defaultCharsetName == null ? new InputStreamReader(is)
					: new InputStreamReader(is, defaultCharsetName);
		}
		// now we must scan meta tags...
		return new TaggedReader(reader, size);
	}

	/**
	 * Build the result of {@link #readEntry(Reader, char)} once the end of the
	 * entry (delimiter or end of input) is reached.
	 * 
	 * @param sb
	 *            The characters collected since the last state change.
	 * @param readKey
	 *            true if no '=' was seen, sb then holds the key.
	 * @param key
	 *            The key parsed so far (only relevant if readKey is false).
	 * @param eof
	 *            true if the entry was terminated by the end of input.
	 * @return The entry; for a completely empty entry null at end of input
	 *         and an entry with null key and value otherwise.
	 */
	private static Map.Entry<String, String> finishEntry(StringBuilder sb,
			boolean readKey, String key, boolean eof) {
		String value;
		if (readKey) {
			key = sb.toString().trim();
			value = ""; //$NON-NLS-1$
		} else {
			value = sb.toString();
		}
		if (key.length() > 0 || value.length() > 0) {
			return newEntry(key, value);
		}
		return eof ? null : newEntry(null, null);
	}

	/**
	 * Create the {@link Entry} result object. The instantiation is kept raw so
	 * this compiles regardless of how {@link Entry} declares its type
	 * parameters.
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	private static Map.Entry<String, String> newEntry(String key, String value) {
		return new Entry(key, value);
	}

	/**
	 * Read a Map.Entry object from r. The end of the entry is marked by
	 * delimiter.
	 * <p>
	 * The syntax for an entry is
	 * 
	 * <pre>
	 * ws* key ws* '=' value [delimiter | EOF]
	 * value = string | quoted_string
	 * quoted_string = '"' [ char | escape ]* '"'
	 * </pre>
	 * 
	 * Characters following the closing quote of a quoted string are ignored up
	 * to the end of the entry. If delimiter is '\n', unquoted '\r' characters
	 * are skipped.
	 * 
	 * @param reader
	 *            The reader to read from.
	 * @param delimiter
	 *            The character terminating the entry.
	 * @return A single map entry read from the reader. If the entry is empty,
	 *         null is returned at the end of input and an entry with a null
	 *         key and a null value if terminated by the delimiter.
	 * @throws IOException
	 *             If reading fails.
	 */
	public static Map.Entry<String, String> readEntry(Reader reader,
			char delimiter) throws IOException {
		String key = ""; //$NON-NLS-1$
		StringBuilder sb = new StringBuilder();
		boolean readKey = true;
		boolean quoted = false;
		boolean ignore = false;
		while (true) {
			int c = reader.read();
			if (c == '=' && readKey) {
				// key complete, switch to value
				key = sb.toString().trim();
				sb.setLength(0);
				readKey = false;
			} else if (c == '\\' && quoted) {
				appendEscaped(sb, reader.read());
			} else if (c == '"' && !readKey) {
				if (quoted) {
					// closing quote, skip everything that follows
					quoted = false;
					ignore = true;
				} else if (sb.length() == 0) {
					// opening quote
					quoted = true;
				} else if (!ignore) {
					// quote within an unquoted value is a plain character
					sb.append((char) c);
				}
			} else if (c == -1) {
				return finishEntry(sb, readKey, key, true);
			} else if (c == '\r' && !quoted && delimiter == '\n') {
				// swallow the '\r' of a "\r\n" line end
				continue;
			} else if (c == delimiter && !quoted) {
				return finishEntry(sb, readKey, key, false);
			} else if (!ignore) {
				sb.append((char) c);
			}
		}
	}

	/**
	 * Try to detect meta data embedded in the input.
	 * <p>
	 * Meta data lines start with a '$$$' immediately at the line beginning and
	 * end at the line end. Meta data lines are scanned until a line without
	 * meta data is found. Meta data is encoded as entries (as provided in
	 * readEntry method).
	 * <p>
	 * The maximum length for a meta data line is 1024.
	 * <p>
	 * After execution reader is positioned after the last meta tag. The reader
	 * instance must support the "mark/reset" sequence.
	 * 
	 * @param reader
	 *            The reader to read from.
	 * @return All meta data in the reader as a {@link Map}
	 * @throws IOException
	 *             If reading or resetting the reader fails.
	 */
	public static Map<String, String> readMetaData(Reader reader)
			throws IOException {
		Map<String, String> meta = new HashMap<String, String>();
		while (true) {
			reader.mark(META_READ_LIMIT);
			if (!consumeMetaPrefix(reader)) {
				// not a meta data line, undo the look ahead
				reader.reset();
				return meta;
			}
			// meta data found...
			Map.Entry<String, String> entry = readEntry(reader, '\n');
			// entry is null if the input ends right behind the "$$$"
			if (entry != null && entry.getKey() != null) {
				meta.put(entry.getKey(), entry.getValue());
			}
		}
	}

	/**
	 * Try to detect encoding specific meta data embedded in the input. The
	 * first meta data line is read and the value is returned if the meta
	 * information key is "encoding".
	 * <p>
	 * After execution reader is always reset to the position it had on entry,
	 * regardless of the result. The reader instance must support the
	 * "mark/reset" sequence.
	 * <p>
	 * For more information on meta data see readMetaData.
	 * 
	 * @param reader
	 *            The reader to read from.
	 * @return The meta data for reader defining the encoding or null if the
	 *         first line is not an "encoding" meta tag
	 * @throws IOException
	 *             If reading or resetting the reader fails.
	 */
	public static String readMetaEncoding(Reader reader) throws IOException {
		String encoding = null;
		reader.mark(META_READ_LIMIT);
		if (consumeMetaPrefix(reader)) {
			// meta data found...
			Map.Entry<String, String> entry = readEntry(reader, '\n');
			// entry is null if the input ends right behind the "$$$"
			if (entry != null && "encoding".equals(entry.getKey())) {
				encoding = entry.getValue();
			}
		}
		reader.reset();
		return encoding;
	}

	/**
	 * Read a string token from r. The end of the token is marked by delimiter.
	 * The syntax for a token is
	 * 
	 * <pre>
	 * value [delimiter | EOF]
	 * value = string | quoted_string
	 * quoted_string = '"' [ char | escape ]* '"'
	 * </pre>
	 * 
	 * Characters following the closing quote of a quoted string are ignored up
	 * to the end of the token. If delimiter is '\n', unquoted '\r' characters
	 * are skipped.
	 * 
	 * @param reader
	 *            The reader to read from.
	 * @param delimiter
	 *            The character terminating the token.
	 * @return A single token or null if the end of input is reached without
	 *         reading any token content.
	 * @throws IOException
	 *             If reading fails.
	 */
	public static String readToken(Reader reader, char delimiter)
			throws IOException {
		StringBuilder sb = new StringBuilder();
		boolean quoted = false;
		boolean ignore = false;
		while (true) {
			int c = reader.read();
			if (c == '\\' && quoted) {
				appendEscaped(sb, reader.read());
			} else if (c == '\r' && !quoted && delimiter == '\n') {
				// swallow the '\r' of a "\r\n" line end
				continue;
			} else if (c == '"') {
				if (quoted) {
					// closing quote, skip everything that follows
					quoted = false;
					ignore = true;
				} else if (sb.length() == 0) {
					// opening quote
					quoted = true;
				} else if (!ignore) {
					// quote within an unquoted token is a plain character
					sb.append((char) c);
				}
			} else if (c == -1) {
				return sb.length() == 0 ? null : sb.toString();
			} else if (c == delimiter && !quoted) {
				return sb.toString();
			} else if (!ignore) {
				sb.append((char) c);
			}
		}
	}

}