All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.jmatchparser.util.charset.ExtraCharsetsProvider Maven / Gradle / Ivy

Go to download

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!
/*
 * Copyright (c) 2009 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.charset.Charset;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

/**
 * Charset provider that provides over 150 extra character sets.
 * 
 * 

 * <p>
 * It also provides more aliases for existing charsets, based on the IANA
 * Character Set Database.

 * <p>
 * This class is loaded automatically via SPI when it is in the class path.

Character sets added by this provider

* *

From RFC 1345

*
    *
  • ISO_646.basic:1983
  • *
  • INVARIANT
  • *
  • BS_4730
  • *
  • NATS-SEFI
  • *
  • NATS-SEFI-ADD
  • *
  • NATS-DANO
  • *
  • NATS-DANO-ADD
  • *
  • SEN_850200_B
  • *
  • SEN_850200_C
  • *
  • JIS_C6220-1969-jp
  • *
  • JIS_C6220-1969-ro
  • *
  • IT
  • *
  • PT
  • *
  • ES
  • *
  • greek7-old
  • *
  • latin-greek
  • *
  • DIN_66003
  • *
  • iso-ir-25
  • *
  • Latin-greek-1
  • *
  • ISO_5427
  • *
  • BS_viewdata
  • *
  • INIS
  • *
  • INIS-8
  • *
  • INIS-cyrillic
  • *
  • ISO_5427:1981
  • *
  • ISO_5428:1980
  • *
  • GB_1988-80
  • *
  • NS_4551-1
  • *
  • NS_4551-2
  • *
  • NF_Z_62-010
  • *
  • PT2
  • *
  • ES2
  • *
  • MSZ_7795.3
  • *
  • greek7
  • *
  • ASMO_449
  • *
  • JIS_C6229-1984-a
  • *
  • JIS_C6229-1984-b
  • *
  • JIS_C6229-1984-b-add
  • *
  • JIS_C6229-1984-hand
  • *
  • JIS_C6229-1984-hand-add
  • *
  • JIS_C6229-1984-kana
  • *
  • ISO_2033-1983
  • *
  • T.61-7bit
  • *
  • ECMA-cyrillic
  • *
  • CSA_Z243.4-1985-1
  • *
  • CSA_Z243.4-1985-2
  • *
  • CSA_Z243.4-1985-gr
  • *
  • CSN_369103
  • *
  • JUS_I.B1.002
  • *
  • IEC_P27-1
  • *
  • JUS_I.B1.003-serb
  • *
  • JUS_I.B1.003-mac
  • *
  • greek-ccitt
  • *
  • NC_NC00-10:81
  • *
  • ISO_6937-2-25
  • *
  • ISO_8859-supp
  • *
  • ISO_10367-box
  • *
  • latin-lap
  • *
  • DS_2089
  • *
  • KSC5636
  • *
  • DEC-MCS
  • *
  • hp-roman8
  • *
  • macintosh
  • *
  • IBM038
  • *
  • IBM274
  • *
  • IBM275
  • *
  • IBM281
  • *
  • IBM290
  • *
  • IBM423
  • *
  • IBM851
  • *
  • IBM880
  • *
  • IBM891
  • *
  • IBM903
  • *
  • IBM904
  • *
  • IBM905
  • *
  • EBCDIC-AT-DE
  • *
  • EBCDIC-AT-DE-A
  • *
  • EBCDIC-CA-FR
  • *
  • EBCDIC-DK-NO
  • *
  • EBCDIC-DK-NO-A
  • *
  • EBCDIC-FI-SE
  • *
  • EBCDIC-FI-SE-A
  • *
  • EBCDIC-FR
  • *
  • EBCDIC-IT
  • *
  • EBCDIC-PT
  • *
  • EBCDIC-ES
  • *
  • EBCDIC-ES-A
  • *
  • EBCDIC-ES-S
  • *
  • EBCDIC-UK
  • *
  • EBCDIC-US
  • *
  • videotex-suppl
  • *
  • iso-ir-90
  • *
  • ANSI_X3.110-1983
  • *
  • T.61-8bit
  • *
  • T.101-G2
  • *
  • ISO_6937-2-add
  • *
  • us-dk
  • *
  • dk-us
  • *
* *

From Unicode.org

*
    *
  • ISO-8859-1:1998
  • *
  • ISO-8859-10:1998
  • *
  • ISO-8859-11:2001
  • *
  • ISO-8859-13:1998
  • *
  • ISO-8859-14:1998
  • *
  • ISO-8859-15:1999
  • *
  • ISO-8859-16:2001
  • *
  • ISO-8859-2:1999
  • *
  • ISO-8859-3:1998
  • *
  • ISO-8859-4:1998
  • *
  • ISO-8859-5:1999
  • *
  • ISO-8859-6:1999
  • *
  • ISO-8859-7:1987a
  • *
  • ISO-8859-7:1987b
  • *
  • ISO-8859-7:2003
  • *
  • ISO-8859-8:1999
  • *
  • ISO-8859-9:1999
  • *
  • ISO-8859-10
  • *
  • ISO-8859-14
  • *
  • ISO-8859-16
  • *
  • MacCeltic
  • *
  • MacCenteuro
  • *
  • Apple-MacCroatian
  • *
  • Apple-MacCyrillic
  • *
  • MacDingbats
  • *
  • MacGaelic
  • *
  • Apple-MacGreek
  • *
  • MacIcelandic
  • *
  • MacInuit
  • *
  • Apple-MacRoman
  • *
  • MacRomanian
  • *
  • Apple-MacTurkish
  • *
  • Microsoft-MacIcelandic
  • *
  • Microsoft-MacLatin2
  • *
  • AtariST
  • *
  • KZ-1048
  • *
  • US-ASCII-QUOTES
  • *
  • NextStep
  • *
  • Adobe-Standard-Encoding
  • *
  • Adobe-Symbol-Encoding
  • *
  • Adobe-Zapf-Dingbats-Encoding
  • *
  • windows-1250-bestfit
  • *
  • windows-1251-bestfit
  • *
  • windows-1252-bestfit
  • *
  • windows-1253-bestfit
  • *
  • windows-1254-bestfit
  • *
  • windows-1255-bestfit
  • *
  • windows-1256-bestfit
  • *
  • windows-1257-bestfit
  • *
  • windows-1258-bestfit
  • *
  • windows-874-bestfit
  • *
* *

Bestfit charset derived from the charsets above

*
    *
  • US-ASCII-bestfit
  • *
* *

 * Bestfit charset derived from the charsets above and from The Unicode
 * Database, see {@link ToAsciiMapping}

*
    *
  • US-ASCII-bestfit-2
  • *
*/
public class ExtraCharsetsProvider extends CharsetProvider {

    /**
     * Lookup table from lower-cased canonical names and aliases to their
     * charset. Stays {@code null} until the first lookup of a name that is
     * contained in {@link #allCharsetNames}; it is only ever assigned a
     * completely filled map.
     */
    private static volatile Map<String, Charset> allCharsets = null;

    /**
     * Names taken from {@code CharsetNameList.CHARSET_NAMES}. Used to reject
     * unknown names without touching {@code CharsetList}; dropped again once
     * {@link #allCharsets} has been built.
     */
    private static volatile Set<String> allCharsetNames = null;

    /**
     * Looks up a charset provided by this provider.
     *
     * @param charsetName
     *            canonical name or alias of the charset; matched
     *            case-insensitively
     * @return the matching charset, or {@code null} if this provider does not
     *         know the name
     */
    @Override
    public Charset charsetForName(String charsetName) {
        // Charset names are ASCII; Locale.ROOT keeps the lower-casing
        // independent of the default locale (e.g. Turkish dotless i).
        String key = charsetName.toLowerCase(Locale.ROOT);

        // Read each volatile field exactly once into a local so that a
        // concurrent caller resetting the field cannot invalidate our view.
        Map<String, Charset> charsets = allCharsets;
        if (charsets == null) {
            Set<String> names = allCharsetNames;
            if (names == null) {
                names = new HashSet<String>(Arrays.asList(CharsetNameList.CHARSET_NAMES));
                allCharsetNames = names;
            }
            // Unknown names are answered from the name list alone, so
            // CharsetList is not referenced for them.
            if (!names.contains(key)) {
                return null;
            }
            // Fill a local map completely before publishing it; other threads
            // must never observe a partially populated table.
            charsets = new HashMap<String, Charset>();
            for (Charset cs : CharsetList.ALL_CHARSETS) {
                charsets.put(cs.name().toLowerCase(Locale.ROOT), cs);
                for (String alias : cs.aliases()) {
                    charsets.put(alias.toLowerCase(Locale.ROOT), cs);
                }
            }
            allCharsets = charsets;
            // The name list is no longer needed once the full table exists.
            allCharsetNames = null;
        }
        return charsets.get(key);
    }

    /**
     * Returns an iterator over all charsets in
     * {@code CharsetList.ALL_CHARSETS}.
     *
     * @return a new iterator over the charsets of this provider
     */
    @Override
    public Iterator<Charset> charsets() {
        return Arrays.asList(CharsetList.ALL_CHARSETS).iterator();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy