All downloads are free. The search and download functionality uses the official Maven repository.

de.intarsys.tools.reader.ReaderTools Maven / Gradle / Ivy

There is a newer version: 4.11
Show newest version
package de.intarsys.tools.reader;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import de.intarsys.tools.collection.Entry;

/**
 * Tool class for common {@link Reader} related tasks.
 * 
 */
public class ReaderTools {

	private static final Map escapes = new HashMap();

	static {
		// escape the escape
		escapes.put(new Character('\\'), "\\"); //$NON-NLS-1$
		// escape the escape
		escapes.put(new Character('"'), "\""); //$NON-NLS-1$
		// escape to insert whitespace
		escapes.put(new Character('n'), "\n"); //$NON-NLS-1$
		escapes.put(new Character('r'), "\r"); //$NON-NLS-1$
		escapes.put(new Character('t'), "\t"); //$NON-NLS-1$
		// escape to remove whitespace
		escapes.put(new Character('\n'), null);
		escapes.put(new Character('\r'), null);
		escapes.put(new Character('\t'), null);
		escapes.put(new Character(' '), null);
	}

	/**
	 * Try to detect the unicode transformation format (UTF encoding) from the
	 * BOM. If no BOM is detected, null is returned and the input buffer is
	 * reset.
	 * 

* The {@link InputStream} is must support the mark operation! * * For BOM marker bytes, see http://unicode.org/faq/utf_bom.html * *

	 * Bytes 				Encoding Form 
	 * 00 00 FE FF 			UTF-32, big-endian 
	 * FF FE 00 00 			UTF-32, little-endian 
	 * FE FF 				UTF-16, big-endian 
	 * FF FE 				UTF-16, little-endian 
	 * EF BB BF 			UTF-8
	 * 
* * @param is * @return An {@link InputStreamReader} with the correct encoding * @throws IOException */ static public InputStreamReader createReaderScanBom(InputStream is) throws IOException { String encoding = null; int i; is.mark(4); i = is.read(); if (i == 0x00) { i = is.read(); if (i == 0x00) { i = is.read(); if (i == 0xfe) { i = is.read(); if (i == 0xff) { // UTF-32, big endian encoding = "UTF-32BE"; } } } } else if (i == 0xff) { i = is.read(); if (i == 0xfe) { is.mark(4); i = is.read(); if (i == 0x00) { i = is.read(); if (i == 0x00) { // UTF-32, little endian encoding = "UTF-32LE"; } } if (encoding == null) { is.reset(); // UTF-16, little endian encoding = "UTF-16LE"; } } } else if (i == 0xfe) { i = is.read(); if (i == 0xff) { // UTF-16, big endian encoding = "UTF-16BE"; } } else if (i == 0xef) { i = is.read(); if (i == 0xbb) { i = is.read(); if (i == 0xbf) { // UTF-8 encoding = "UTF-8"; } } } if (encoding == null) { is.reset(); return null; } return new InputStreamReader(is, encoding); } /** * Try to detect the input stream encoding from the meta tags "$$$" embedded * in the stream. If no encoding is detected, the input is reset and null is * returned. *

* The {@link InputStream} is must support the mark operation! * * @param is * @return An {@link InputStreamReader} with the correct encoding * @throws IOException */ static public InputStreamReader createReaderScanMeta(final InputStream is) throws IOException { // use single byte encoding and read encoding from meta data Reader tempReader = new Reader() { @Override public void close() throws IOException { } @Override public void mark(int readlimit) throws IOException { is.mark(readlimit); } @Override public boolean markSupported() { return is.markSupported(); } @Override public int read() throws IOException { return is.read(); }; @Override public int read(char[] cbuf, int off, int len) throws IOException { // not used return -1; }; @Override public void reset() throws IOException { is.reset(); }; }; String encoding = ReaderTools.readMetaEncoding(tempReader); if (encoding != null) { return new InputStreamReader(is, encoding); } return null; } /** * Create a {@link TaggedReader} and automatically detect the encoding from * different heuristics. First, the BOM markers are checked, then embedded * meta information is scanned. *

* If no encoding can be guessed, either the defaultCharsetName or the * platform encoding is used. *

* Meta information tags (lines starting with '$$$') are scanned. * * @param is * @param defaultCharsetName * @return A {@link TaggedReader} with the correct encoding * @throws IOException */ public static TaggedReader createTaggedReader(InputStream is, String defaultCharsetName, int size) throws IOException { // try to detect utf byte order marker InputStreamReader reader = createReaderScanBom(is); if (reader == null) { reader = createReaderScanMeta(is); if (reader == null) { if (defaultCharsetName == null) { reader = new InputStreamReader(is); } else { reader = new InputStreamReader(is, defaultCharsetName); } } } // now we must scan meta tags... TaggedReader tagged = new TaggedReader(reader, size); return tagged; } /** * Read a Map.Entry object from r. The end of the entry is marked by * delimiter. *

* The syntax for an entry is * *

	 * ws* key ws* '=' value [delimiter | EOF]
	 * value = string | quoted_string
	 * quoted_string = '"' [ char | escape ]* '"'
	 * 
* * @param reader * @param delimiter * @return A single map entry read from the reader. * @throws IOException */ public static Map.Entry readEntry(Reader reader, char delimiter) throws IOException { String key = ""; //$NON-NLS-1$ String value = ""; //$NON-NLS-1$ StringBuffer sb = new StringBuffer(); boolean readKey = true; boolean quoted = false; boolean ignore = false; while (true) { int i = reader.read(); if (i == '=' && readKey) { key = sb.toString().trim(); sb.setLength(0); readKey = false; } else if (i == '\\' && quoted) { i = reader.read(); if (i != -1) { String mapped = escapes.get(new Character((char) i)); sb.append(mapped); } } else if (i == '"' && !readKey) { if (quoted) { quoted = false; ignore = true; } else { if (sb.length() == 0) { quoted = true; } else { if (!ignore) { sb.append((char) i); } } } } else if (i == -1) { if (readKey) { key = sb.toString().trim(); value = ""; //$NON-NLS-1$ } else { value = sb.toString(); } if (key.length() > 0) { return new Entry(key, value); } else if (value.length() > 0) { return new Entry(key, value); } else { return null; } } else if (i == '\r' && !quoted && delimiter == '\n') { continue; } else if (i == delimiter && !quoted) { if (readKey) { key = sb.toString().trim(); value = ""; //$NON-NLS-1$ } else { value = sb.toString(); } if (key.length() > 0) { return new Entry(key, value); } else if (value.length() > 0) { return new Entry(key, value); } else { return new Entry(null, null); } } else { if (!ignore) { sb.append((char) i); } } } } /** * Try to detect meta data embedded in the input. *

* Meta data lines start with a '$$$' immediately at the line beginning and * end at the line end. Meta data lines are scanned until a line without * meta data is found. Meta data is encoded as entries (as provided in * readEntry method). *

* The maximum length for a meta data line is 1024. *

* After execution reader is either positioned after the last meta tag. The * reader instance must support the "mark/reset" sequence. * * @param reader * @return All meta data in the reader as a {@link Map} * @throws IOException */ public static Map readMetaData(Reader reader) throws IOException { int i; Map meta = new HashMap(); while (true) { reader.mark(1024); i = reader.read(); if (i == '$') { i = reader.read(); if (i == '$') { i = reader.read(); if (i == '$') { // meta data found... Map.Entry entry = ReaderTools .readEntry(reader, '\n'); if (entry.getKey() != null) { meta.put(entry.getKey(), entry.getValue()); } continue; } } } reader.reset(); break; } return meta; } /** * Try to detect encoding specific meta data embedded in the input. The * first meta data line is read and the value is returned if the meta * information key is "encoding". *

* After execution reader is either positioned at the start or after the * "encoding" meta tag. The reader instance must support the "mark/reset" * sequence. * *

* For more information on meta data see readMetaData. * * @param reader * @return The meta data for reader defining the encoding * @throws IOException */ public static String readMetaEncoding(Reader reader) throws IOException { int i; reader.mark(1024); i = reader.read(); if (i == '$') { i = reader.read(); if (i == '$') { i = reader.read(); if (i == '$') { // meta data found... Map.Entry entry = ReaderTools.readEntry( reader, '\n'); if ("encoding".equals(entry.getKey())) { reader.reset(); return entry.getValue(); } } } } reader.reset(); return null; } /** * Read a string token from r. The end of the token is marked by delimiter. * The syntax for a token is * *

	 * value [delimiter | EOF]
	 * value = string | quoted_string
	 * quoted_string = '"' [ char | escape ]* '"'
	 * 
* * @param reader * @param delimiter * @return A single token. * @throws IOException */ public static String readToken(Reader reader, char delimiter) throws IOException { StringBuffer sb = new StringBuffer(); boolean quoted = false; boolean ignore = false; while (true) { int i = reader.read(); if (i == '\\' && quoted) { i = reader.read(); if (i != -1) { String mapped = escapes.get(new Character((char) i)); sb.append(mapped); } } else if (i == '\r' && !quoted && delimiter == '\n') { continue; } else if (i == '"') { if (quoted) { quoted = false; ignore = true; } else { if (sb.length() == 0) { quoted = true; } else { if (!ignore) { sb.append((char) i); } } } } else if (i == -1) { if (sb.length() == 0) { return null; } return sb.toString(); } else if (i == delimiter && !quoted) { return sb.toString(); } else { if (!ignore) { sb.append((char) i); } } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy