net.sf.jmatchparser.util.charset.ExtraCharsetsProvider Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jMatchParser-charset Show documentation

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!

/*
 * Copyright (c) 2009 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.charset.Charset;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Charset provider that provides over 150 extra character sets.
 * 
 * 
 * It also provides more aliases for existing charsets, based on the IANA Character Set
 * Database.
 * 
 * <p>
 * This class is loaded automatically via SPI when it is in the class path.
 * 
 * <p>
 * Character sets added by this provider:
 * 
 * From RFC 1345
 * 
 * ISO_646.basic:1983
 * INVARIANT
 * BS_4730
 * NATS-SEFI
 * NATS-SEFI-ADD
 * NATS-DANO
 * NATS-DANO-ADD
 * SEN_850200_B
 * SEN_850200_C
 * JIS_C6220-1969-jp
 * JIS_C6220-1969-ro
 * IT
 * PT
 * ES
 * greek7-old
 * latin-greek
 * DIN_66003
 * iso-ir-25
 * Latin-greek-1
 * ISO_5427
 * BS_viewdata
 * INIS
 * INIS-8
 * INIS-cyrillic
 * ISO_5427:1981
 * ISO_5428:1980
 * GB_1988-80
 * NS_4551-1
 * NS_4551-2
 * NF_Z_62-010
 * PT2
 * ES2
 * MSZ_7795.3
 * greek7
 * ASMO_449
 * JIS_C6229-1984-a
 * JIS_C6229-1984-b
 * JIS_C6229-1984-b-add
 * JIS_C6229-1984-hand
 * JIS_C6229-1984-hand-add
 * JIS_C6229-1984-kana
 * ISO_2033-1983
 * T.61-7bit
 * ECMA-cyrillic
 * CSA_Z243.4-1985-1
 * CSA_Z243.4-1985-2
 * CSA_Z243.4-1985-gr
 * CSN_369103
 * JUS_I.B1.002
 * IEC_P27-1
 * JUS_I.B1.003-serb
 * JUS_I.B1.003-mac
 * greek-ccitt
 * NC_NC00-10:81
 * ISO_6937-2-25
 * ISO_8859-supp
 * ISO_10367-box
 * latin-lap
 * DS_2089
 * KSC5636
 * DEC-MCS
 * hp-roman8
 * macintosh
 * IBM038
 * IBM274
 * IBM275
 * IBM281
 * IBM290
 * IBM423
 * IBM851
 * IBM880
 * IBM891
 * IBM903
 * IBM904
 * IBM905
 * EBCDIC-AT-DE
 * EBCDIC-AT-DE-A
 * EBCDIC-CA-FR
 * EBCDIC-DK-NO
 * EBCDIC-DK-NO-A
 * EBCDIC-FI-SE
 * EBCDIC-FI-SE-A
 * EBCDIC-FR
 * EBCDIC-IT
 * EBCDIC-PT
 * EBCDIC-ES
 * EBCDIC-ES-A
 * EBCDIC-ES-S
 * EBCDIC-UK
 * EBCDIC-US
 * videotex-suppl
 * iso-ir-90
 * ANSI_X3.110-1983
 * T.61-8bit
 * T.101-G2
 * ISO_6937-2-add
 * us-dk
 * dk-us
 * 
 * 
 * From Unicode.org
 * 
 * ISO-8859-1:1998
 * ISO-8859-10:1998
 * ISO-8859-11:2001
 * ISO-8859-13:1998
 * ISO-8859-14:1998
 * ISO-8859-15:1999
 * ISO-8859-16:2001
 * ISO-8859-2:1999
 * ISO-8859-3:1998
 * ISO-8859-4:1998
 * ISO-8859-5:1999
 * ISO-8859-6:1999
 * ISO-8859-7:1987a
 * ISO-8859-7:1987b
 * ISO-8859-7:2003
 * ISO-8859-8:1999
 * ISO-8859-9:1999
 * ISO-8859-10
 * ISO-8859-14
 * ISO-8859-16
 * MacCeltic
 * MacCenteuro
 * Apple-MacCroatian
 * Apple-MacCyrillic
 * MacDingbats
 * MacGaelic
 * Apple-MacGreek
 * MacIcelandic
 * MacInuit
 * Apple-MacRoman
 * MacRomanian
 * Apple-MacTurkish
 * Microsoft-MacIcelandic
 * Microsoft-MacLatin2
 * AtariST
 * KZ-1048
 * US-ASCII-QUOTES
 * NextStep
 * Adobe-Standard-Encoding
 * Adobe-Symbol-Encoding
 * Adobe-Zapf-Dingbats-Encoding
 * windows-1250-bestfit
 * windows-1251-bestfit
 * windows-1252-bestfit
 * windows-1253-bestfit
 * windows-1254-bestfit
 * windows-1255-bestfit
 * windows-1256-bestfit
 * windows-1257-bestfit
 * windows-1258-bestfit
 * windows-874-bestfit
 * 
 * 
 * Bestfit charset derived from the charsets above
 * 
 * US-ASCII-bestfit
 * 
 * 
 * Bestfit charset derived from the charsets above and from The Unicode
 * Database, see {@link ToAsciiMapping}
 * 
 * US-ASCII-bestfit-2
 * 
 */
public class ExtraCharsetsProvider extends CharsetProvider {

	/**
	 * Lookup table from lower-cased charset name or alias to the charset
	 * itself. Stays {@code null} until the first lookup of a name that is
	 * contained in the name list; only completely filled maps are ever
	 * stored in this field.
	 */
	private static volatile Map<String, Charset> allCharsets = null;

	/**
	 * Contents of {@code CharsetNameList.CHARSET_NAMES} (expected to be lower
	 * case, since lookups use the lower-cased name). It is consulted before
	 * {@code CharsetList.ALL_CHARSETS} is accessed for the first time and is
	 * dropped again once {@link #allCharsets} has been built.
	 */
	private static volatile Set<String> allCharsetNames = null;

	/**
	 * Looks up one of the extra charsets by its name or by one of its
	 * aliases, ignoring case.
	 * 
	 * <p>
	 * Both lookup tables are created lazily. Each table is filled completely
	 * before it is stored in its static field, so concurrent callers never
	 * observe a partially filled table; at worst, threads racing on the very
	 * first lookups build the same table more than once.
	 * 
	 * @param charsetName
	 *            the charset name or alias to look up
	 * @return the matching charset, or {@code null} if this provider does not
	 *         know the name
	 */
	@Override
	public Charset charsetForName(String charsetName) {
		// Locale.ROOT keeps the key independent of the default locale (with a
		// Turkish default locale, "I" would otherwise become a dotless "ı").
		String key = charsetName.toLowerCase(Locale.ROOT);

		// Work on local snapshots: another thread may reset the static
		// fields at any time.
		Map<String, Charset> charsets = allCharsets;
		if (charsets == null) {
			Set<String> names = allCharsetNames;
			if (names == null) {
				names = new HashSet<String>(Arrays.<String>asList(CharsetNameList.CHARSET_NAMES));
				allCharsetNames = names;
			}
			if (!names.contains(key)) {
				// unknown name: answer without accessing CharsetList
				return null;
			}
			charsets = new HashMap<String, Charset>();
			for (Charset cs : CharsetList.ALL_CHARSETS) {
				charsets.put(cs.name().toLowerCase(Locale.ROOT), cs);
				for (String alias : cs.aliases()) {
					charsets.put(alias.toLowerCase(Locale.ROOT), cs);
				}
			}
			// publish the finished map first, then release the name set
			allCharsets = charsets;
			allCharsetNames = null;
		}
		return charsets.get(key);
	}

	/**
	 * Returns an iterator over all charsets in
	 * {@code CharsetList.ALL_CHARSETS}.
	 * 
	 * @return an iterator over the charsets of this provider
	 */
	@Override
	public Iterator<Charset> charsets() {
		return Arrays.<Charset>asList(CharsetList.ALL_CHARSETS).iterator();
	}
}

net.sf.jmatchparser.util.charset.ExtraCharsetsProvider Maven / Gradle / Ivy

Character sets added by this provider

From RFC 1345

From Unicode.org

Bestfit charset derived from the charsets above

Bestfit charset derived from the charsets above and from The Unicode Database, see {@link ToAsciiMapping}