All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.marc4j.converter.impl.ReverseCodeTable Maven / Gradle / Ivy


package org.marc4j.converter.impl;

import java.util.Hashtable;

import org.xml.sax.helpers.DefaultHandler;

/**
 * 

* ReverseCodeTable is a set of methods to facilitate Unicode to MARC-8 character conversion, it tracks * the current charset encodings that are in use, and defines abstract methods isCombining() and getCharTable()which * must be overridden in a sub-class to actually implement the Unicode to MARC8 character conversion. There are two * defined subclasses: ReverseCodeTableHash which reads in and parses a large XML file at runtime, and * ReverseCodeTableGenerated which consists of a couple of switch statements that implement the same two methods. The * code for the second of these two sub-classes, ReverseCodeTableGenerated, is generated when the marc4j jar file is * created using the same XML file that ReverseCodeTableHash uses. *

* * @author Robert Haschart * @author Corey Keith */ public abstract class ReverseCodeTable { static final byte G0 = 0; static final byte G1 = 1; /** * Abstract method that must be defined in a sub-class, used in the conversion of Unicode to MARC-8. For a given * Unicode character, determine whether that character is a combining character (an accent mark or diacritic) * * @param c - the UCS/Unicode character to look up * @return boolean - true if character is a combining character */ abstract public boolean isCombining(Character c); /** * Abstract method that must be defined in a sub-class, used in the conversion of Unicode to MARC-8. For a given * Unicode character, return ALL of the possible MARC-8 representations of that character. These are represented * in a Hashtable where the key is the ISOcode of the character set in MARC-8 that the Unicode character appears * in, and the value is the MARC-8 character value in that character set that encodes the given Unicode character. * * @param c - the UCS/Unicode character to look up * @return Hashtable - contains all of the possible MARC-8 representations of that Unicode character */ abstract public Hashtable getCharTable(Character c); protected Character lastLookupKey = null; protected Hashtable lastLookupValue = null; protected byte g[]; protected String charsetsUsed; /** * Default constructor for the abstract class, allocates and initializes the structures that are used to track the * current character sets in use. */ public ReverseCodeTable() { g = new byte[2]; init(); } /** * Initializes the ReverseCodeTable state to the default value for encoding a field. */ public void init() { g[0] = 0x42; g[1] = 0x45; charsetsUsed = "BE"; } /** * Routine used for tracking which character set is currently in use for characters less than 0x80 * * @return byte - the current G0 character set in use. */ public byte getPreviousG0() { return g[G0]; } /** * Routine used for tracking which character set is currently in use for characters greater than 0x80 * * @return byte - the current G1 character set in use. */ public byte getPreviousG1() { return g[G1]; } /** * Routine used for changing which character set is currently in use for characters less than 0x80 * * @param table - the current G0 character set to be used. */ public void setPreviousG0(final int table) { g[G0] = (byte) table; } /** * Routine used for changing which character set is currently in use for characters greater than 0x80 * * @param table - the current G1 character set to be used. */ public void setPreviousG1(final int table) { g[G1] = (byte) table; } /** * Performs a lookup of the MARC8 translation of a given Unicode character. Caches the results in lastLookupKey * and lastLookupValue so that subsequent lookups made in processing the same character will proceed more quickly. * * @param c - the UCS/Unicode character to look up * @return Hashtable - contains all of the possible MARC-8 representations of that Unicode character */ public Hashtable codeTableHash(final Character c) { if (lastLookupKey != null && c.equals(lastLookupKey)) { return lastLookupValue; } lastLookupKey = c; lastLookupValue = getCharTable(c); return lastLookupValue; } /** * Checks whether a MARC8 translation of a given Unicode character exists. * * @param c - the UCS/Unicode character to look up * @return boolean - true if there is one or more MARC-8 representation of the given Unicode character. */ public boolean charHasMatch(final Character c) { return codeTableHash(c) != null; } /** * Checks whether a MARC8 translation of a given Unicode character exists in the character set currently loaded as * the G0 character set. * * @param c - the UCS/Unicode character to look up * @return boolean - true if there is a MARC-8 representation of the given Unicode character in the current G0 * character set */ public boolean inPreviousG0CharEntry(final Character c) { final Hashtable chars = codeTableHash(c); if (chars != null && chars.get((int) getPreviousG0()) != null) { return true; } return false; } /** * Checks whether a MARC8 translation of a given Unicode character exists in the character set currently loaded as * the G1 character set. * * @param c - the UCS/Unicode character to look up * @return boolean - true if there is a MARC-8 representation of the given Unicode character in the current G1 * character set */ public boolean inPreviousG1CharEntry(final Character c) { final Hashtable chars = codeTableHash(c); if (chars != null && chars.get((int) getPreviousG1()) != null) { return true; } return false; } /** * Returns the MARC8 translation of a given Unicode character from the character set currently loaded as the G0 * character set. * * @param c - the UCS/Unicode character to look up * @return boolean - true if there is a MARC-8 representation of the given Unicode character in the current G0 * character set */ public char[] getCurrentG0CharEntry(final Character c) { return getCharEntry(c, getPreviousG0()); } /** * Returns the MARC8 translation of a given Unicode character from the character set currently loaded as the G0 * character set. * * @param c - the UCS/Unicode character to look up * @return boolean - true if there is a MARC-8 representation of the given Unicode character in the current G0 * character set */ public char[] getCurrentG1CharEntry(final Character c) { return getCharEntry(c, getPreviousG1()); } /** * Returns the MARC8 translation of a given Unicode character from the character set currently loaded as either * the G0 or the G1 character set, as specified by the second parameter. * * @param c - the UCS/Unicode character to look up * @param charset - whether to use the current G0 charset of the current G1 charset to perform the lookup * @return boolean - true if there is a MARC-8 representation of the given Unicode character in the current G0 * character set */ public char[] getCharEntry(final Character c, final int charset) { final Hashtable chars = codeTableHash(c); if (chars == null) { return new char[0]; } return chars.get(charset); } /** * Lookups up the MARC8 translation of a given Unicode character and determines which of the MARC-8 character sets * that have a translation for that Unicode character is the best one to use. If one one charset has a * translation, that one will be returned. If more than one charset has a translation then return the first one * listed. * * @param c - the UCS/Unicode character to look up * @return boolean - true if there is a MARC-8 representation of the given Unicode character in the current G0 * character set */ public char getBestCharSet(final Character c) { final Hashtable chars = codeTableHash(c); final char returnVal; if (chars.keySet().size() == 1) { return (char) chars.keySet().iterator().next().intValue(); } for (int i = 0; i < charsetsUsed.length(); i++) { final char toUse = charsetsUsed.charAt(i); if (chars.containsKey((int) toUse)) { return toUse; } } if (chars.containsKey('S')) { returnVal = 'S'; } else { returnVal = (char) chars.keySet().iterator().next().intValue(); } charsetsUsed = charsetsUsed + returnVal; return returnVal; } /** * Utility function for translating a String consisting of one or more two character hex string of the character * values into a character array containing those characters * * @param str - A string containing the two-character hex strings of characters to decode * @return char[] - an array of characters represented by the */ public static char[] deHexify(final String str) { char result[] = null; if (str.length() == 2) { result = new char[1]; result[0] = (char) Integer.parseInt(str, 16); } else if (str.length() == 6) { result = new char[3]; result[0] = (char) Integer.parseInt(str.substring(0, 2), 16); result[1] = (char) Integer.parseInt(str.substring(2, 4), 16); result[2] = (char) Integer.parseInt(str.substring(4, 6), 16); } return result; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy