net.sf.jmatchparser.util.charset.UTFBOMCharsetsProvider Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jMatchParser-charset Show documentation

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!

/*
 * Copyright (c) 2010 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Charset provider that provides a {@code UTF-BOM.}<i>charset</i> charset
 * for every other supported charset, and a <i>charset</i>{@code -BOM}
 * charset for each UTF charset.
 *
 * <p>
 * The {@code UTF-BOM.} charsets will try to detect a byte order mark of
 * UTF-16LE, UTF-16BE or UTF-8. If no byte order mark could be detected, they
 * fall back to the charset given at the end of the charset name.
 *
 * <p>
 * This provider also provides the charsets {@code UTF-8-BOM},
 * {@code UTF-16LE-BOM} and {@code UTF-16BE-BOM}, which act like their
 * counterparts without {@code -BOM}, but will add a byte order mark when
 * encoding and strip it when decoding (if present).
 *
 * <p>
 * Two additional charsets, {@code UTF-8-Binary} and {@code UTF-8-Binary-PUA},
 * are supersets of UTF-8 that will be binary safe on decoding (i.e. every
 * byte sequence will remain intact if decoded and encoded again). The first
 * mentioned charset will use unpaired surrogates in the range {@code U+DC80}
 * to {@code U+DCFF}, as suggested in the UTF-8 Wikipedia article; the second
 * one uses code points {@code U+E980} to {@code U+E9FF} from the Private Use
 * Area, escaping those code points (and the escape character) with a
 * {@code U+E97F} character if needed.
 *
 * <p>
 * This class is loaded automatically via SPI when it is in the class path.
 */
public class UTFBOMCharsetsProvider extends CharsetProvider {

	/**
	 * Reentrance checker for {@link #charsets()}: non-null while the current
	 * thread is already enumerating charsets through this provider.
	 */
	private static final ThreadLocal<Boolean> insideCharsets = new ThreadLocal<Boolean>();

	/**
	 * Looks up one of the charsets supplied by this provider.
	 *
	 * @param charsetName
	 *            the requested charset name
	 * @return the matching charset, or {@code null} if the name is not one
	 *         handled by this provider or the fallback charset named after
	 *         the {@code UTF-BOM.} prefix cannot be resolved
	 */
	@Override
	public Charset charsetForName(String charsetName) {
		if (charsetName.equals("UTF-8-BOM") || charsetName.equals("UTF-16LE-BOM") || charsetName.equals("UTF-16BE-BOM")) {
			// Strip the trailing "-BOM" (4 characters) to get the wrapped UTF charset.
			Charset cs = Charset.forName(charsetName.substring(0, charsetName.length() - 4));
			return new AddBOMCharset(cs);
		}
		if (charsetName.equals(UTF8BinaryCharset.SURROGATE_VARIANT)) {
			return new UTF8BinaryCharset(false);
		}
		if (charsetName.equals(UTF8BinaryCharset.PUA_VARIANT)) {
			return new UTF8BinaryCharset(true);
		}
		if (charsetName.startsWith(UTFBOMCharset.PREFIX)) {
			try {
				Charset cs = Charset.forName(charsetName.substring(UTFBOMCharset.PREFIX.length()));
				return new UTFBOMCharset(cs);
			} catch (UnsupportedCharsetException ignored) {
				// Unknown fallback charset: report "not provided" by returning null.
			} catch (IllegalCharsetNameException ignored) {
				// Malformed fallback name (for example an empty suffix): a provider
				// signals "not provided" with null rather than by throwing.
			}
		}
		return null;
	}

	/**
	 * Enumerates the charsets supplied by this provider: the two binary-safe
	 * UTF-8 variants, the three {@code -BOM} charsets and one
	 * {@code UTF-BOM.}-prefixed charset per available charset.
	 *
	 * @return an iterator over the provided charsets
	 */
	@Override
	public Iterator<Charset> charsets() {
		List<Charset> result = new ArrayList<Charset>();
		result.add(charsetForName(UTF8BinaryCharset.SURROGATE_VARIANT));
		result.add(charsetForName(UTF8BinaryCharset.PUA_VARIANT));
		result.add(charsetForName("UTF-8-BOM"));
		result.add(charsetForName("UTF-16LE-BOM"));
		result.add(charsetForName("UTF-16BE-BOM"));
		// A nested call on this thread (triggered by the availableCharsets()
		// call below) only reports the fixed charsets, which ends the recursion.
		if (insideCharsets.get() != null) {
			return result.iterator();
		}
		insideCharsets.set(Boolean.TRUE);
		try {
			for (String name : Charset.availableCharsets().keySet()) {
				Charset cs = charsetForName(UTFBOMCharset.PREFIX + name);
				// Never hand out null entries; skip names that could not be wrapped.
				if (cs != null) {
					result.add(cs);
				}
			}
		} finally {
			// remove() clears the flag without leaving an entry behind for this thread.
			insideCharsets.remove();
		}
		return result.iterator();
	}
}