All downloads are free. The search and download functionality uses the official Maven repository.

info.freelibrary.marc4j.converter.impl.AnselToUnicode Maven / Gradle / Ivy

/**
 * Copyright (C) 2002 Bas Peters ([email protected])
 *
 * This file is part of MARC4J
 *
 * MARC4J is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * MARC4J is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with MARC4J; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package info.freelibrary.marc4j.converter.impl;

import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.marc4j.ErrorHandler;
import org.marc4j.MarcException;
import org.marc4j.converter.CharConverter;

/**
 * 

* A utility to convert MARC-8 data to non-precomposed UCS/Unicode. *

*

* The MARC-8 to Unicode mapping used is the version with the March 2005 revisions. *

* * @author Bas Peters * @author Corey Keith */ public class AnselToUnicode extends CharConverter { class Queue extends Vector { private static final long serialVersionUID = 1L; /** * Puts an item into the queue. * * @param item the item to be put into the queue. */ public Object put(final Character item) { addElement(item); return item; } /** * Gets an item from the front of the queue. */ public Object get() { Object obj; obj = peek(); removeElementAt(0); return obj; } /** * Peeks at the front of the queue. */ public Object peek() { return elementAt(0); } /** * Returns true if the queue is empty. */ public boolean empty() { return size() == 0; } } class CodeTracker { int offset; int g0; int g1; boolean multibyte; @Override public String toString() { return "Offset: " + offset + " G0: " + Integer.toHexString(g0) + " G1: " + Integer.toHexString(g1) + " Multibyte: " + multibyte; } } protected CodeTableInterface ct; protected boolean loadedMultibyte = false; // flag that indicates whether Numeric Character References of the form &#XXXX; should be translated to the // unicode code point specified by the 4 hexidecimal digits. As described on this page // http://www.loc.gov/marc/specifications/speccharconversion.html#lossless protected boolean translateNCR = false; /** * Returns true if should translate to NCR. * * @return True if should translate to NCR */ public boolean shouldTranslateNCR() { return translateNCR; } /** * Sets whether we should translate to NCR. 
* * @param translateNCR True if we should translate to NCR; else, false */ public void setTranslateNCR(final boolean translateNCR) { this.translateNCR = translateNCR; } /** * Should return true if the CharConverter outputs Unicode encoded characters * * @return boolean whether the CharConverter returns Unicode encoded characters */ @Override public boolean outputsUnicode() { return (true); } protected ErrorHandler errorList = null; /** * Creates a new instance and loads the MARC4J supplied conversion tables based on the official LC tables. */ public AnselToUnicode() { ct = loadGeneratedTable(false); } /** * Creates a new instance and loads the MARC4J supplied conversion tables based on the official LC tables. */ public AnselToUnicode(final boolean loadMultibyte) { ct = loadGeneratedTable(loadMultibyte); } /** * Creates a new instance and loads the MARC4J supplied conversion tables based on the official LC tables. */ public AnselToUnicode(final ErrorHandler errorList) { ct = loadGeneratedTable(false); this.errorList = errorList; } /** * Creates a new instance and loads the MARC4J supplied conversion tables based on the official LC tables. 
*/ public AnselToUnicode(final ErrorHandler errorList, final boolean loadMultibyte) { ct = loadGeneratedTable(loadMultibyte); this.errorList = errorList; } private CodeTableInterface loadGeneratedTable(final boolean loadMultibyte) { try { final Class generated = Class.forName("org.marc4j.converter.impl.CodeTableGenerated"); final Constructor cons = generated.getConstructor(); final Object ct = cons.newInstance(); loadedMultibyte = true; return ((CodeTableInterface) ct); } catch (final Exception e) { CodeTableInterface ct; if (loadMultibyte) { ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetables.xml")); } else { ct = new CodeTable(AnselToUnicode.class.getResourceAsStream("resources/codetablesnocjk.xml")); } loadedMultibyte = loadMultibyte; return (ct); } } /** * Constructs an instance with the specified pathname. Use this constructor to create an instance with a * customized code table mapping. The mapping file should follow the structure of LC's XML MARC-8 to Unicode * mapping (see: http://www.loc.gov/marc/specifications/codetables.xml). */ public AnselToUnicode(final String pathname) { ct = new CodeTable(pathname); loadedMultibyte = true; } /** * Constructs an instance with the specified input stream. Use this constructor to create an instance with a * customized code table mapping. The mapping file should follow the structure of LC's XML MARC-8 to Unicode * mapping (see: http://www.loc.gov/marc/specifications/codetables.xml). */ public AnselToUnicode(final InputStream in) { ct = new CodeTable(in); loadedMultibyte = true; } /** * Loads the entire mapping (including multibyte characters) from the Library of Congress. 
*/ private void loadMultibyte() { ct = new CodeTable(getClass().getResourceAsStream("resources/codetables.xml")); } private void checkMode(final char[] data, final CodeTracker cdt) { int extra = 0; int extra2 = 0; while (cdt.offset + extra + extra2 < data.length && isEscape(data[cdt.offset])) { if (cdt.offset + extra + extra2 + 1 == data.length) { cdt.offset += 1; if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Escape character found at end of field, discarding it."); } else { throw new MarcException("Escape character found at end of field"); } break; } switch (data[cdt.offset + 1 + extra]) { case 0x28: // '(' case 0x2c: // ',' set_cdt(cdt, 0, data, 2 + extra, false); break; case 0x29: // ')' case 0x2d: // '-' set_cdt(cdt, 1, data, 2 + extra, false); break; case 0x24: // '$' if (!loadedMultibyte) { loadMultibyte(); loadedMultibyte = true; } switch (data[cdt.offset + 2 + extra + extra2]) { case 0x29: // ')' case 0x2d: // '-' set_cdt(cdt, 1, data, 3 + extra + extra2, true); break; case 0x2c: // ',' set_cdt(cdt, 0, data, 3 + extra + extra2, true); break; case 0x31: // '1' cdt.g0 = data[cdt.offset + 2 + extra + extra2]; cdt.offset += 3 + extra + extra2; cdt.multibyte = true; break; case 0x20: // ' ' // space found in escape code: look ahead and try to proceed extra2++; break; default: // unknown code character found: discard escape sequence and return cdt.offset += 1; if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. 
" + "Discarding escape character."); } else { throw new MarcException( "Unknown character set code found following escape character."); } break; } break; case 0x67: // 'g' case 0x62: // 'b' case 0x70: // 'p' cdt.g0 = data[cdt.offset + 1 + extra]; cdt.offset += 2 + extra; cdt.multibyte = false; break; case 0x73: // 's' cdt.g0 = 0x42; cdt.offset += 2 + extra; cdt.multibyte = false; break; case 0x20: // ' ' // space found in escape code: look ahead and try to proceed if (errorList == null) { throw new MarcException( "Extraneous space character found within MARC8 character set escape sequence"); } extra++; break; default: // unknown code character found: discard escape sequence and return cdt.offset += 1; if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding " + "escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } break; } } if (errorList != null && (extra != 0 || extra2 != 0)) { errorList.addError(ErrorHandler.ERROR_TYPO, "" + (extra + extra2) + " extraneous space characters found within MARC8 character set escape sequence"); } } private void set_cdt(final CodeTracker cdt, final int g0_or_g1, final char[] data, final int aAddnlOffset, final boolean multibyte) { int addnlOffset = aAddnlOffset; if (data[cdt.offset + addnlOffset] == '!' && data[cdt.offset + addnlOffset + 1] == 'E') { addnlOffset++; } else if (data[cdt.offset + addnlOffset] == ' ') { if (errorList != null) { errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space character found within MARC8 character set escape sequence. 
" + "Skipping over space."); } else { throw new MarcException( "Extraneous space character found within MARC8 character set escape sequence"); } addnlOffset++; } else if ("(,)-$!".indexOf(data[cdt.offset + addnlOffset]) != -1) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Extraneaous intermediate character found following escape character. Discarding " + "intermediate character."); } else { throw new MarcException("Extraneaous intermediate character found following escape character."); } addnlOffset++; } if ("34BE1NQS2".indexOf(data[cdt.offset + addnlOffset]) == -1) { cdt.offset += 1; cdt.multibyte = false; if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown character set code found following escape character. Discarding escape character."); } else { throw new MarcException("Unknown character set code found following escape character."); } } else { // All is well, proceed normally if (g0_or_g1 == 0) { cdt.g0 = data[cdt.offset + addnlOffset]; } else { cdt.g1 = data[cdt.offset + addnlOffset]; } cdt.offset += 1 + addnlOffset; cdt.multibyte = multibyte; } } /** *

* Converts MARC-8 data to UCS/Unicode. *

* * @param data - the MARC-8 data in an array of char * @return String - the UCS/Unicode data */ @Override public String convert(final char data[]) { final StringBuffer sb = new StringBuffer(); final int len = data.length; final CodeTracker cdt = new CodeTracker(); cdt.g0 = 0x42; cdt.g1 = 0x45; cdt.multibyte = false; cdt.offset = 0; checkMode(data, cdt); final Queue diacritics = new Queue(); while (cdt.offset < data.length) { if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) { while (cdt.offset < len && ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1) && hasNext(cdt.offset, len)) { final char c = getCharCDT(data, cdt); if (c != 0) { diacritics.put(new Character(c)); } checkMode(data, cdt); } if (cdt.offset >= len) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Diacritic found at the end of field, without the character that it is " + "supposed to decorate"); break; } } final char c2 = getCharCDT(data, cdt); checkMode(data, cdt); if (c2 != 0) { sb.append(c2); } while (!diacritics.isEmpty()) { final char c1 = ((Character) diacritics.get()).charValue(); sb.append(c1); } } else if (cdt.multibyte) { final String mbstr = convertMultibyte(cdt, data); sb.append(mbstr); } else { final int offset = cdt.offset; final char cdtchar = data[offset]; char c = getCharCDT(data, cdt); boolean greekErrorFixed = false; if (errorList != null && cdt.g0 == 0x53 && data[offset] > 0x20 && data[offset] < 0x40) { if (c == 0 && data[offset] > 0x20 && data[offset] < 0x40) { errorList.addError(ErrorHandler.MINOR_ERROR, "Unknown punctuation mark found in Greek character set, inserting change " + "to default character set"); cdt.g0 = 0x42; // change to default character set c = getChar(data[offset], cdt.g0, cdt.g1); if (c != 0) { sb.append(c); greekErrorFixed = true; } } else if (offset + 1 < data.length && data[offset] >= '0' && data[offset] <= '9' && data[offset + 1] >= '0' && data[offset + 1] <= '9') { 
errorList.addError(ErrorHandler.MINOR_ERROR, "Unlikely sequence of punctuation mark found in Greek character set, it " + "likely a number, inserting change to default character set"); cdt.g0 = 0x42; // change to default character set final char c1 = getChar(data[offset], cdt.g0, cdt.g1); if (c1 != 0) { sb.append(c1); greekErrorFixed = true; } } } if (!greekErrorFixed && c != 0) { sb.append(c); } else if (!greekErrorFixed && c == 0) { final String val = "0000" + Integer.toHexString((cdtchar)); sb.append(""); } } if (hasNext(cdt.offset, len)) { checkMode(data, cdt); } } String dataElement = sb.toString(); if (translateNCR && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];.*")) { final Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]);"); final Matcher matcher = pattern.matcher(dataElement); final StringBuffer newElement = new StringBuffer(); int prevEnd = 0; while (matcher.find()) { newElement.append(dataElement.substring(prevEnd, matcher.start())); newElement.append(getCharFromCodePoint(matcher.group(1))); prevEnd = matcher.end(); } newElement.append(dataElement.substring(prevEnd)); dataElement = newElement.toString(); } return (dataElement); } private String convertMultibyte(final CodeTracker cdt, final char[] data) { final StringBuffer sb = new StringBuffer(); int offset = cdt.offset; while (offset < data.length && data[offset] != 0x1b) { final int length = getRawMBLength(data, offset); final int spaces = getNumSpacesInMBLength(data, offset); boolean errorsPresent = false; if ((length - spaces) % 3 != 0) { errorsPresent = true; } // if a 0x20 byte occurs amidst a sequence of multibyte characters // skip over it and output a space. 
if (data[offset] == 0x20) { sb.append(' '); offset++; } else if (data[offset] >= 0x80) { final char c2 = getChar(data[offset], cdt.g0, cdt.g1); sb.append(c2); offset += 1; } else if (errorList == null) { if (offset + 3 <= data.length) { final char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (c != 0) { sb.append(c); offset += 3; } else { sb.append(data[offset]); sb.append(data[offset + 1]); sb.append(data[offset + 2]); offset += 3; } } else { while (offset < data.length) { sb.append(data[offset++]); } } } else if (errorsPresent == false && offset + 3 <= data.length && (errorList == null || data[offset + 1] != 0x20 && data[offset + 2] != 0x20) && getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])) != 0) { final char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (errorList == null || c != 0) { sb.append(c); offset += 3; } } else if (offset + 6 < data.length && noneEquals(data, offset, offset + 3, ' ') && (getMBCharAlias(data, offset, new int[] { 0, 1, 2 }) == 0 || getMBCharAlias(data, offset, new int[] { 3, 4, 5 }) == 0) && getMBCharAlias(data, offset, new int[] { 2, 3, 4 }) != 0 && noneEquals(data, offset, offset + 5, 0x1b) && noneInRange(data, offset, offset + 5, 0x80, 0xFF) && !nextEscIsMB(data, offset, data.length)) { final String mbstr = getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1])) + getMBCharStr( makeMultibyte(data[offset], ']', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '[')) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']')); if (mbstr.length() == 1) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting " + "one to create the only valid option"); } sb.append(mbstr); offset += 2; } else if (mbstr.length() > 1) { if (errorList != null) { errorList.addError(ErrorHandler.MAJOR_ERROR, "Missing square brace 
character in MARC8 multibyte character, inserting " + "one to create a randomly chosen valid option"); } sb.append(mbstr.subSequence(0, 1)); offset += 2; } else if (mbstr.length() == 0) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and " + "continuing reading Multibyte characters"); } sb.append("[?]"); offset += 2; } } else if (offset + 7 < data.length && noneEquals(data, offset, offset + 3, ' ') && (getMBCharAlias(data, offset, new int[] { 0, 1, 2 }) == 0 || getMBCharAlias(data, offset, new int[] { 3, 4, 5 }) == 0) && getMBCharAlias(data, offset, new int[] { 2, 3, 4 }) != 0 && noneEquals(data, offset, offset + 6, 0x1b) && noneInRange(data, offset, offset + 6, 0x80, 0xFF) && !nextEscIsMB(data, offset, data.length)) { final String mbstr = getMBCharStr(makeMultibyte(data[offset], '[', data[offset + 1])) + getMBCharStr( makeMultibyte(data[offset], ']', data[offset + 1])) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], '[')) + getMBCharStr(makeMultibyte(data[offset], data[offset + 1], ']')); if (mbstr.length() == 1) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting " + "one to create the only valid option"); } sb.append(mbstr); offset += 2; } else if (mbstr.length() > 1) { if (errorList != null) { errorList.addError(ErrorHandler.MAJOR_ERROR, "Missing square brace character in MARC8 multibyte character, inserting " + "one to create a randomly chosen valid option"); } sb.append(mbstr.subSequence(0, 1)); offset += 2; } else if (mbstr.length() == 0) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, Discarding bad character and " + "continuing reading Multibyte characters"); } sb.append("[?]"); offset += 2; } } else if (offset + 4 <= data.length && data[offset] > 0x7f && getMBChar(makeMultibyte(data[offset + 1], 
data[offset + 2], data[offset + 3])) != 0) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous character in MARC8 multibyte character, Copying bad character " + "and continuing reading Multibyte characters"); // FIXME: brackets right here? sb.append(getChar(data[offset], 0x42, 0x45)); offset += 1; } } else if (errorList != null && offset + 4 <= data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) { final int multiByte = makeMultibyte(data[offset], ((data[offset + 1] != 0x20) ? data[offset + 1] : data[offset + 2]), data[offset + 3]); final char c = getMBChar(multiByte); if (c != 0) { if (errorList != null) { errorList.addError(ErrorHandler.ERROR_TYPO, "Extraneous space found within MARC8 multibyte character"); } sb.append(c); sb.append(' '); offset += 4; } else { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } } else if (offset + 3 > data.length || offset + 3 == data.length && (data[offset + 1] == 0x20 || data[offset + 2] == 0x20)) { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Partial MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } else if (offset + 3 <= data.length && getMBChar(makeMultibyte(data[offset + 0], data[offset + 1], data[offset + 2])) != 0) { final char c = getMBChar(makeMultibyte(data[offset], data[offset + 1], data[offset + 2])); if (errorList == null || c != 0) { sb.append(c); offset += 3; } } else { if (errorList != null) { errorList.addError(ErrorHandler.MINOR_ERROR, "Erroneous MARC8 multibyte character, inserting change to default character set"); } cdt.multibyte = false; cdt.g0 = 0x42; cdt.g1 = 0x45; break; } } cdt.offset = offset; return (sb.toString()); } private boolean nextEscIsMB(final char[] data, final int start, 
final int length) { for (int offset = start; offset < length - 1; offset++) { if (data[offset] == (char) 0x1b) { if (data[offset + 1] == '$') { return (true); } else { break; } } } return false; } private boolean noneEquals(final char[] data, final int start, final int end, final int val) { for (int offset = start; offset <= end; offset++) { if (data[offset] == (char) val) { return (false); } } return (true); } private boolean noneInRange(final char[] data, final int start, final int end, final int val1, final int val2) { for (int offset = start; offset <= end; offset++) { if (data[offset] >= (char) val1 && data[offset] <= (char) val2) { return (false); } } return (true); } /** * Alias function for:
     * makeMultibyte(data[cdt.offset + 3], data[cdt.offset + 4], data[cdt.offset + 5])
     * 
* * @param data * @param offset * @param bumps * @return */ private char getMBCharAlias(final char[] data, final int offset, final int[] bumps) { return getMBChar(makeMultibyte(data[offset + bumps[0]], data[offset + bumps[1]], data[offset + bumps[2]])); } private int getRawMBLength(final char[] aDataArray, final int aOffset) { int offset = aOffset; int length = 0; while (offset < aDataArray.length && aDataArray[offset] != 0x1b) { offset++; length++; } return length; } private int getNumSpacesInMBLength(final char[] data, final int aOffset) { int offset = aOffset; int cnt = 0; while (offset < data.length && data[offset] != 0x1b) { if (data[offset] == ' ') { cnt++; } offset++; } return cnt; } private char getCharCDT(final char[] data, final CodeTracker cdt) { char c = getChar(data[cdt.offset], cdt.g0, cdt.g1); if (translateNCR && c == '&' && data.length > cdt.offset + 8) { final String tmp = new String(data, cdt.offset, 8); if (tmp.matches("&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f];")) { c = getCharFromCodePoint(tmp.substring(3, 7)); cdt.offset += 8; } else { cdt.offset++; } } else { cdt.offset++; } return (c); } private char getCharFromCodePoint(final String charCodePoint) { final int charNum = Integer.parseInt(charCodePoint, 16); return ((char) charNum); } /** * Makes a multibyte. * * @param c1 Character one * @param c2 Character two * @param c3 Character three * @return A multibyte */ public int makeMultibyte(final char c1, final char c2, final char c3) { final int[] chars = new int[3]; chars[0] = c1 << 16; chars[1] = c2 << 8; chars[2] = c3; return chars[0] | chars[1] | chars[2]; } private char getChar(final int ch, final int g0, final int g1) { if (ch <= 0x7E) { return ct.getChar(ch, g0); } else { return ct.getChar(ch, g1); } } /** * Gets the multibyte character. 
* * @param ch The int from which to get the multibyte character * @return The multibyte character */ public char getMBChar(final int ch) { return ct.getChar(ch, 0x31); } /** * Gets the multibyte character string. * * @param ch The int from which to get the multibyte character * @return The multibyte character string */ public String getMBCharStr(final int ch) { final char c = ct.getChar(ch, 0x31); if (c == 0) { return (""); } else { return "" + c; } } private static boolean hasNext(final int pos, final int len) { if (pos < (len - 1)) { return true; } return false; } private static boolean isEscape(final int i) { if (i == 0x1B) { return true; } return false; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy