All downloads are free. The search and download functionality uses the official Maven repository.

net.sf.jmatchparser.util.charset.UTFBOMCharsetsProvider Maven / Gradle / Ivy

Go to download

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!
/*
 * Copyright (c) 2010 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Charset provider that provides an UTF-BOM.charset charset
 * for every other supported charset, and a charset-BOM
 * charset for each UTF charset.
 * 
 * 

* The UTF-BOM. charsets will try to detect a byte order mark of * UTF-16LE, UTF-16BE or UTF-8. * *

* If no byte order mark could be detected, it falls back to the charset given * at the end of the charset name. * *

* This provider also provides charsets UTF-8-BOM, * UTF-16LE-BOM and UTF-16BE-BOM, which act like their * counterparts without -BOM, but will add a byte order mark when * encoding and strip it when decoding (if present). * *

* Two additional charsets, UTF-8-Binary and UTF-8-Binary-PUA * are supersets of UTF-8 that will be binary safe on decoding (i. e. every byte * sequence will remain intact if decoded and encoded again). The first * mentioned charset will use unpaired surrogates in the range U+DC80 * to U+DCFF, as suggested in the UTF-8 Wikipedia article; the second one uses codepoints U+E980 * to U+E9FF from the Private Use Area, escaping those code points (and * the escape character) with a U+E97F character if needed. * *

* This class is loaded automatically via SPI when it is in the class path. */ public class UTFBOMCharsetsProvider extends CharsetProvider { @Override public Charset charsetForName(String charsetName) { if (charsetName.equals("UTF-8-BOM") || charsetName.equals("UTF-16LE-BOM") || charsetName.equals("UTF-16BE-BOM")) { Charset cs = Charset.forName(charsetName.substring(0, charsetName.length() - 4)); return new AddBOMCharset(cs); } if (charsetName.equals(UTF8BinaryCharset.SURROGATE_VARIANT)) { return new UTF8BinaryCharset(false); } if (charsetName.equals(UTF8BinaryCharset.PUA_VARIANT)) { return new UTF8BinaryCharset(true); } if (charsetName.startsWith(UTFBOMCharset.PREFIX)) { try { Charset cs = Charset.forName(charsetName.substring(UTFBOMCharset.PREFIX.length())); return new UTFBOMCharset(cs); } catch (UnsupportedCharsetException ex) { } } return null; } // reentrance checker private static ThreadLocal in = new ThreadLocal(); @Override public Iterator charsets() { List l = new ArrayList(); l.add(charsetForName(UTF8BinaryCharset.SURROGATE_VARIANT)); l.add(charsetForName(UTF8BinaryCharset.PUA_VARIANT)); l.add(charsetForName("UTF-8-BOM")); l.add(charsetForName("UTF-16LE-BOM")); l.add(charsetForName("UTF-16BE-BOM")); if (in.get() != null) return l.iterator(); in.set(true); try { for (String cs : Charset.availableCharsets().keySet()) { l.add(charsetForName(UTFBOMCharset.PREFIX + cs)); } } finally { in.set(null); } return l.iterator(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy