
org.enhydra.xml.io.MkEncodingsTable Maven / Gradle / Ivy
The newest version!
/*
* Enhydra Java Application Server Project
*
* The contents of this file are subject to the Enhydra Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License on
* the Enhydra web site ( http://www.enhydra.org/ ).
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific terms governing rights and limitations
* under the License.
*
* The Initial Developer of the Enhydra Application Server is Lutris
* Technologies, Inc. The Enhydra Application Server and portions created
* by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
* All Rights Reserved.
*
* Contributor(s):
*
* $Id: MkEncodingsTable.java,v 1.2 2005/01/26 08:29:24 jkjome Exp $
*/
package org.enhydra.xml.io;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashSet;
//FIXME: Next step is to convert the generated file to XML.
/**
 * Generates a table of character encodings by parsing the IANA Charset
 * Registry, obtained from:
 * <pre>
 *   ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets
 * </pre>
 * This is a development-time tool, with special hacks to make up for
 * various Java encoding names that are not in the registry.
 * Each line of the resulting file has the format:
 * <pre>
 *   name bits mime-name alias1 alias2 ...
 * </pre>
 * where bits is 7, 8 or 16 and mime-name is "null" if there is none
 * defined.  This file will be converted to XML in a future release.
 */
public final class MkEncodingsTable {
    /**
     * Name of the IANA Charset Registry file that is read.
     */
    private static final String CHAR_SET_REGISTRY = "character-sets";

    /**
     * Name of the character set table file that is created.
     */
    private static final String CHAR_SET_TABLE = "character-sets.tbl";

    /**
     * Labels in registry file.
     */
    private static final String REG_NAME_FIELD = "Name:";
    private static final String REG_ALIAS_FIELD = "Alias:";

    //FIXME: these encodings lists are not complete..
    /**
     * 7 bit encoding names.
     */
    private static final String[] ENCODINGS_7BIT = {
        "ANSI_X3.4-1968",
        "T.61-7bit"
    };

    /**
     * 8 bit encoding names.
     */
    private static final String[] ENCODINGS_8BIT = {
        "T.61-8bit",
        "UNKNOWN-8BIT",
        "PC8-Danish-Norwegian",
        "PC8-Turkish",
        "ISO_8859-1:1987",
        "ISO_8859-2:1987",
        "ISO_8859-3:1988",
        "ISO_8859-4:1988",
        "ISO_8859-6:1987",
        "ISO_8859-6-E",
        "ISO_8859-6-I",
        "ISO_8859-7:1987",
        "ISO_8859-8:1988",
        "ISO_8859-8-E",
        "ISO_8859-8-I",
        "ISO_8859-5:1988",
        "ISO_8859-9:1989",
        "ISO_8859-supp",
        "ISO-8859-10",
        "ISO-8859-15",
        "ISO-8859-1-Windows-3.0-Latin-1",
        "ISO-8859-1-Windows-3.1-Latin-1",
        "ISO-8859-2-Windows-Latin-2",
        "ISO-8859-9-Windows-Latin-5",
        "latin-greek",
        "Latin-greek-1"
    };

    /**
     * Tables of known 7 and 8 bit encodings, filled by the class initializer.
     */
    private static final HashSet f7BitEncodings = new HashSet();
    private static final HashSet f8BitEncodings = new HashSet();

    /**
     * Pattern indicating the preferred MIME name.
     */
    private static final String REG_MIME_PREFERRED = "preferred MIME name";

    /**
     * Table of aliases to add.  The first element of each pair is the
     * name/alias to match, the second is the alias that is added.
     */
    private static final String[][] HACKED_ALIASES = {
        {"UTF-8", "UTF8"},
        {"ANSI_X3.4-1968", "646"},
    };

    /**
     * Table of prefix conversions.  An alias is created for names/aliases
     * that start with the first prefix, with the second substituted.
     */
    private static final String[][] HACKED_PREFIXES = {
        {"windows-", "Cp"},
        {"ISO_8859-", "ISO8859-"},
        {"ISO-8859-", "ISO8859_"}
    };

    /**
     * Extra entries to output verbatim, with no other hacks available to
     * get them.
     */
    private static final String[] EXTRA_ENTRIES = {
        "UnicodeBig 16 null",
        "UnicodeBigUnmarked 16 null",
        "UnicodeLittle 16 null",
        "UnicodeLittleUnmarked 16 null",
        "UTF-16 16 null UTF16"
    };

    /**
     * Class initializer; loads the 7 and 8 bit encoding lookup sets.
     */
    static {
        for (int idx = 0; idx < ENCODINGS_7BIT.length; idx++) {
            f7BitEncodings.add(ENCODINGS_7BIT[idx]);
        }
        for (int idx = 0; idx < ENCODINGS_8BIT.length; idx++) {
            f8BitEncodings.add(ENCODINGS_8BIT[idx]);
        }
    }

    /**
     * Generate an error about parsing a line in the registry.
     * This method never returns normally.
     *
     * @param msg description of the problem.
     * @param line the registry line being parsed.
     * @throws XMLIOError always.
     */
    private void ianaParseError(String msg,
                                String line) {
        throw new XMLIOError(msg + "; parsing line in " + CHAR_SET_REGISTRY
                             + ": \"" + line + "\"");
    }

    /**
     * Get the encoding size, returning 7, 8, or 16.  This makes a guess
     * based on the hard-coded encoding lists.  If not known, returns 16.
     *
     * @param encoding registry name of the encoding.
     * @return 7, 8 or 16.
     */
    private int getCharSize(String encoding) {
        if (f7BitEncodings.contains(encoding)) {
            return 7;
        } else if (f8BitEncodings.contains(encoding)) {
            return 8;
        } else {
            return 16;
        }
    }

    /**
     * Extract an encoding name out of a Name: or Alias: line.  The name
     * is the first whitespace-delimited token following the first colon.
     *
     * @param line the registry line.
     * @return the name, or null if nothing follows the colon.
     * @throws XMLIOError if the line contains no colon.
     */
    private String parseName(String line) {
        int len = line.length();

        // Start just past the field label.
        int startIdx = line.indexOf(':');
        if (startIdx < 0) {
            ianaParseError("no `:' found", line);
        }
        startIdx++;

        // Skip leading whitespace.  Tabs are treated like spaces so that
        // they can never become part of a name in the whitespace-delimited
        // output table.
        while ((startIdx < len)
               && Character.isWhitespace(line.charAt(startIdx))) {
            startIdx++;
        }

        // Find end of the token.
        int endIdx = startIdx;
        while ((endIdx < len)
               && !Character.isWhitespace(line.charAt(endIdx))) {
            endIdx++;
        }
        if (endIdx <= startIdx) {
            return null;
        } else {
            return line.substring(startIdx, endIdx).intern();
        }
    }

    /**
     * Determine if a line is marked as containing the preferred MIME name.
     */
    private boolean isMimePreferredEntry(String line) {
        return (line.indexOf(REG_MIME_PREFERRED) >= 0);
    }

    /**
     * Add an alias to the list of aliases, if it's not null and not
     * already there.
     */
    private void addAlias(ArrayList aliases,
                          String alias) {
        if ((alias != null) && !aliases.contains(alias)) {
            aliases.add(alias);
        }
    }

    /**
     * Do special hacked mapping of a name/alias to other aliases.  This
     * handles aliases that are not in the registry.
     *
     * @param aliases list the generated aliases are added to.
     * @param name the name or alias to derive extra aliases from.
     */
    private void makeHackedAliases(ArrayList aliases,
                                   String name) {
        // Hacks based on an exact name match.
        for (int idx = 0; idx < HACKED_ALIASES.length; idx++) {
            String[] mapping = HACKED_ALIASES[idx];
            if (name.equals(mapping[0])) {
                addAlias(aliases, mapping[1]);
            }
        }
        // Hacks based on prefix substitution.
        for (int idx = 0; idx < HACKED_PREFIXES.length; idx++) {
            String[] mapping = HACKED_PREFIXES[idx];
            if (name.startsWith(mapping[0])) {
                addAlias(aliases,
                         mapping[1] + name.substring(mapping[0].length()));
            }
        }
    }

    /**
     * Scan the input stream for the next encoding entry, parse that
     * entry and write a record for it.
     *
     * @param in reader positioned in the registry file.
     * @param out writer the table record is printed to.
     * @return true if an entry was written, false if EOF was reached
     *  before another Name: line was found.
     * @throws IOException if reading the registry fails.
     * @throws XMLIOError if a Name: line contains no name.
     */
    private boolean parseCharSetEntry(BufferedReader in,
                                      PrintWriter out) throws IOException {
        ArrayList aliases = new ArrayList();
        String mimePreferred = null;
        String line = null;

        // Scan for next Name: entry
        while ((line = in.readLine()) != null) {
            if (line.startsWith(REG_NAME_FIELD)) {
                break;
            }
        }
        if (line == null) {
            return false;  // EOF
        }

        String name = parseName(line);
        if (name == null) {
            ianaParseError("no name parsed", line);
        }
        if (isMimePreferredEntry(line)) {
            mimePreferred = name;
        }

        // Handle stuff missing from registry
        makeHackedAliases(aliases, name);

        // Parse Alias: entries, scanning until a blank line or EOF.
        while (((line = in.readLine()) != null)
               && (line.trim().length() > 0)) {
            if (line.startsWith(REG_ALIAS_FIELD)) {
                String alias = parseName(line);
                if (alias != null) {
                    addAlias(aliases, alias);
                    makeHackedAliases(aliases, alias);
                    if (isMimePreferredEntry(line)) {
                        mimePreferred = alias;
                    }
                }
            }
        }

        // Output entry; a null mimePreferred is printed as "null".
        out.print(name);
        out.print(' ');
        out.print(getCharSize(name));
        out.print(' ');
        out.print(mimePreferred);
        int len = aliases.size();
        for (int idx = 0; idx < len; idx++) {
            out.print(' ');
            out.print(aliases.get(idx));
        }
        out.println();
        return true;
    }

    /**
     * Parse every entry of the registry, writing one record per entry.
     *
     * @param in reader for the registry file.
     * @param out writer for the table file.
     * @throws IOException if reading the registry fails.
     */
    private void parseIanaRegistry(BufferedReader in,
                                   PrintWriter out) throws IOException {
        while (parseCharSetEntry(in, out)) {
            // Looping till eof
        }
    }

    /**
     * Read the registry file and create the table file, appending the
     * extra hard-coded entries after the parsed ones.  Both files are
     * closed even if parsing fails.
     *
     * @throws IOException if a file cannot be opened, reading fails, or
     *  writing the table fails.
     */
    private void parseIanaRegistry() throws IOException {
        BufferedReader in
            = new BufferedReader(new FileReader(CHAR_SET_REGISTRY));
        try {
            PrintWriter out = new PrintWriter(
                new BufferedWriter(new FileWriter(CHAR_SET_TABLE)));
            try {
                parseIanaRegistry(in, out);
                for (int i = 0; i < EXTRA_ENTRIES.length; i++) {
                    out.println(EXTRA_ENTRIES[i]);
                }
            } finally {
                out.close();
            }
            // PrintWriter never throws on write/flush/close failures; it
            // only records them, so the error state must be checked
            // explicitly to avoid silently leaving a truncated table.
            if (out.checkError()) {
                throw new IOException("error writing " + CHAR_SET_TABLE);
            }
        } finally {
            in.close();
        }
    }

    /**
     * Entry point; reads the registry from and writes the table to the
     * current working directory.  Command line arguments are ignored.
     */
    public static void main(String[] args) throws IOException {
        new MkEncodingsTable().parseIanaRegistry();
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy