net.sf.jmatchparser.util.charset.ExtraCharsetsProvider Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jMatchParser-charset Show documentation
Show all versions of jMatchParser-charset Show documentation
A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.
The newest version!
/*
* Copyright (c) 2009 - 2011 Michael Schierl
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package net.sf.jmatchparser.util.charset;
import java.nio.charset.Charset;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
* Charset provider that provides over 150 extra character sets.
*
*
* It also provides more aliases for existing charsets, based on the IANA Character Set
* Database.
*
*
* This class is loaded automatically via SPI when it is in the class path.
*
*
* Character sets added by this provider:
*
* From RFC 1345
*
* - ISO_646.basic:1983
* - INVARIANT
* - BS_4730
* - NATS-SEFI
* - NATS-SEFI-ADD
* - NATS-DANO
* - NATS-DANO-ADD
* - SEN_850200_B
* - SEN_850200_C
* - JIS_C6220-1969-jp
* - JIS_C6220-1969-ro
* - IT
* - PT
* - ES
* - greek7-old
* - latin-greek
* - DIN_66003
* - iso-ir-25
* - Latin-greek-1
* - ISO_5427
* - BS_viewdata
* - INIS
* - INIS-8
* - INIS-cyrillic
* - ISO_5427:1981
* - ISO_5428:1980
* - GB_1988-80
* - NS_4551-1
* - NS_4551-2
* - NF_Z_62-010
* - PT2
* - ES2
* - MSZ_7795.3
* - greek7
* - ASMO_449
* - JIS_C6229-1984-a
* - JIS_C6229-1984-b
* - JIS_C6229-1984-b-add
* - JIS_C6229-1984-hand
* - JIS_C6229-1984-hand-add
* - JIS_C6229-1984-kana
* - ISO_2033-1983
* - T.61-7bit
* - ECMA-cyrillic
* - CSA_Z243.4-1985-1
* - CSA_Z243.4-1985-2
* - CSA_Z243.4-1985-gr
* - CSN_369103
* - JUS_I.B1.002
* - IEC_P27-1
* - JUS_I.B1.003-serb
* - JUS_I.B1.003-mac
* - greek-ccitt
* - NC_NC00-10:81
* - ISO_6937-2-25
* - ISO_8859-supp
* - ISO_10367-box
* - latin-lap
* - DS_2089
* - KSC5636
* - DEC-MCS
* - hp-roman8
* - macintosh
* - IBM038
* - IBM274
* - IBM275
* - IBM281
* - IBM290
* - IBM423
* - IBM851
* - IBM880
* - IBM891
* - IBM903
* - IBM904
* - IBM905
* - EBCDIC-AT-DE
* - EBCDIC-AT-DE-A
* - EBCDIC-CA-FR
* - EBCDIC-DK-NO
* - EBCDIC-DK-NO-A
* - EBCDIC-FI-SE
* - EBCDIC-FI-SE-A
* - EBCDIC-FR
* - EBCDIC-IT
* - EBCDIC-PT
* - EBCDIC-ES
* - EBCDIC-ES-A
* - EBCDIC-ES-S
* - EBCDIC-UK
* - EBCDIC-US
* - videotex-suppl
* - iso-ir-90
* - ANSI_X3.110-1983
* - T.61-8bit
* - T.101-G2
* - ISO_6937-2-add
* - us-dk
* - dk-us
*
*
* From Unicode.org
*
* - ISO-8859-1:1998
* - ISO-8859-10:1998
* - ISO-8859-11:2001
* - ISO-8859-13:1998
* - ISO-8859-14:1998
* - ISO-8859-15:1999
* - ISO-8859-16:2001
* - ISO-8859-2:1999
* - ISO-8859-3:1998
* - ISO-8859-4:1998
* - ISO-8859-5:1999
* - ISO-8859-6:1999
* - ISO-8859-7:1987a
* - ISO-8859-7:1987b
* - ISO-8859-7:2003
* - ISO-8859-8:1999
* - ISO-8859-9:1999
* - ISO-8859-10
* - ISO-8859-14
* - ISO-8859-16
* - MacCeltic
* - MacCenteuro
* - Apple-MacCroatian
* - Apple-MacCyrillic
* - MacDingbats
* - MacGaelic
* - Apple-MacGreek
* - MacIcelandic
* - MacInuit
* - Apple-MacRoman
* - MacRomanian
* - Apple-MacTurkish
* - Microsoft-MacIcelandic
* - Microsoft-MacLatin2
* - AtariST
* - KZ-1048
* - US-ASCII-QUOTES
* - NextStep
* - Adobe-Standard-Encoding
* - Adobe-Symbol-Encoding
* - Adobe-Zapf-Dingbats-Encoding
* - windows-1250-bestfit
* - windows-1251-bestfit
* - windows-1252-bestfit
* - windows-1253-bestfit
* - windows-1254-bestfit
* - windows-1255-bestfit
* - windows-1256-bestfit
* - windows-1257-bestfit
* - windows-1258-bestfit
* - windows-874-bestfit
*
*
* Bestfit charset derived from the charsets above
*
* - US-ASCII-bestfit
*
*
* Bestfit charset derived from the charsets above and from The Unicode
* Database, see {@link ToAsciiMapping}
*
* - US-ASCII-bestfit-2
*
*/
public class ExtraCharsetsProvider extends CharsetProvider {

    /**
     * Lookup table from lower-cased canonical names and aliases to charsets.
     * Stays {@code null} until the first request for a name listed in
     * {@code CharsetNameList.CHARSET_NAMES}; it is only ever assigned a
     * completely populated map, so readers never observe a partial table.
     */
    private static volatile Map<String, Charset> allCharsets = null;

    /**
     * Names this provider can resolve, used to reject unknown names without
     * touching {@code CharsetList}. Dropped again (set to {@code null}) once
     * {@link #allCharsets} has been built, since the map then answers every
     * lookup.
     */
    private static volatile Set<String> allCharsetNames = null;

    /**
     * Returns the charset registered under the given name or alias.
     *
     * The lookup is case-insensitive and uses {@link Locale#ROOT} for case
     * folding, so the result does not depend on the default locale.
     *
     * @param charsetName
     *            canonical name or alias of the requested charset
     * @return the matching charset, or {@code null} if this provider does
     *         not know the name
     */
    @Override
    public Charset charsetForName(String charsetName) {
        String key = charsetName.toLowerCase(Locale.ROOT);

        // Work on local snapshots of the volatile fields: another thread may
        // publish the map or clear the name set at any moment.
        Map<String, Charset> charsets = allCharsets;
        if (charsets == null) {
            Set<String> names = allCharsetNames;
            if (names == null) {
                // Build completely before publishing. Entries are used as
                // they are, i.e. CHARSET_NAMES is presumably already
                // lower-case -- TODO confirm against CharsetNameList.
                names = new HashSet<String>(Arrays.asList(CharsetNameList.CHARSET_NAMES));
                allCharsetNames = names;
            }
            if (!names.contains(key)) {
                // Unknown name: answer without referencing CharsetList.
                return null;
            }

            charsets = new HashMap<String, Charset>();
            for (Charset cs : CharsetList.ALL_CHARSETS) {
                charsets.put(cs.name().toLowerCase(Locale.ROOT), cs);
                for (String alias : cs.aliases()) {
                    charsets.put(alias.toLowerCase(Locale.ROOT), cs);
                }
            }
            // Publish only the finished map. Concurrent first calls may each
            // build their own copy; the copies have identical content, so
            // whichever assignment wins is fine.
            allCharsets = charsets;
            allCharsetNames = null;
        }
        return charsets.get(key);
    }

    /**
     * Returns an iterator over all charsets in
     * {@code CharsetList.ALL_CHARSETS}.
     *
     * @return a new iterator over the charsets of this provider
     */
    @Override
    public Iterator<Charset> charsets() {
        return Arrays.<Charset>asList(CharsetList.ALL_CHARSETS).iterator();
    }
}