org.marc4j.converter.impl.UnimarcToUnicode Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of marc4j Show documentation

An easy to use Application Programming Interface (API) for working with MARC and MARCXML in Java.

There is a newer version: 2.9.5

/**
 * Copyright (C) 2018
 *
 * This file is part of MARC4J
 *
 * MARC4J is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * MARC4J is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with MARC4J; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.marc4j.converter.impl;

import org.marc4j.ConverterErrorHandler;
import org.marc4j.MarcError;
import org.marc4j.MarcException;
import org.marc4j.converter.CharConverter;

import java.lang.reflect.Constructor;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Vector;

/**
 * A utility to convert UNIMARC data to non-precomposed UCS/Unicode.
 * <p>
 * This is based on the AnselToUnicode class, with modifications for UNIMARC conversion.
 *
 * @author SirsiDynix from Bas Peters
 */
public class UnimarcToUnicode extends CharConverter implements UnimarcConstants {

    class Queue extends Vector {

        /**
         * Appends an item to the back of the queue.
         *
         * @param item the item to enqueue
         * @return the item that was just enqueued
         */
        public Object put(Object item) {
            addElement(item);
            return item;
        }

        /**
         * Removes the item at the front of the queue and returns it.
         *
         * @return the item that was at the front of the queue
         * @throws ArrayIndexOutOfBoundsException if the queue is empty
         */
        public Object get() {
            final Object head = elementAt(0);
            removeElementAt(0);
            return head;
        }

        /**
         * Returns the item at the front of the queue without removing it.
         *
         * @return the item currently at the front of the queue
         * @throws ArrayIndexOutOfBoundsException if the queue is empty
         */
        public Object peek() {
            return elementAt(0);
        }

        /**
         * Reports whether the queue holds no items.
         *
         * @return true if the queue is empty
         */
        public boolean empty() {
            return isEmpty();
        }
    }

    /**
     * Mutable bookkeeping for one pass over a data array: the current read position, the
     * character set codes currently used for lookups (g0/g1), the set codes most recently
     * designated for each of G0..G3 (workingG0..workingG3), and whether each designated set
     * is flagged as multibyte.
     */
    class CodeTracker {
        int offset = 0;

        int g0 = DEFAULT_G0;            // 02-07
        int g1 = DEFAULT_G1;            // 0A-0F
        int workingG0 = DEFAULT_G0;
        int workingG1 = DEFAULT_G1;
        int workingG2 = DEFAULT_G2;
        int workingG3 = DEFAULT_G3;

        boolean multibyte = false;
        boolean isG0multibyte = false;
        boolean isG1multibyte = false;
        boolean isG2multibyte = false;
        boolean isG3multibyte = false;

        /**
         * Creates a tracker positioned at offset 0 with the default character sets.
         */
        CodeTracker() {
        }

        /**
         * Creates a tracker that starts at offset 0 and takes its character set state from
         * another tracker. The offset of the template is deliberately not copied.
         *
         * @param tracker the template to copy set state from; when null the defaults are kept
         */
        CodeTracker(CodeTracker tracker) {
            if (tracker == null) {
                return;
            }

            // Sets currently in use for lookups.
            g0 = tracker.g0;
            g1 = tracker.g1;

            // Designated sets.
            workingG0 = tracker.workingG0;
            workingG1 = tracker.workingG1;
            workingG2 = tracker.workingG2;
            workingG3 = tracker.workingG3;

            // Multibyte flags.
            multibyte = tracker.multibyte;
            isG0multibyte = tracker.isG0multibyte;
            isG1multibyte = tracker.isG1multibyte;
            isG2multibyte = tracker.isG2multibyte;
            isG3multibyte = tracker.isG3multibyte;
        }

        /**
         * Renders the offset, the g0/g1 set codes (in hexadecimal) and the multibyte flag.
         *
         * @return a short diagnostic description of this tracker
         */
        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder();
            text.append("Offset: ").append(offset);
            text.append(" G0: ").append(Integer.toHexString(g0));
            text.append(" G1: ").append(Integer.toHexString(g1));
            text.append(" Multibyte: ").append(multibyte);
            return text.toString();
        }
    }

    // Code table consulted by convert() to look up characters and to test whether a
    // character is combining for the current G0/G1 sets.
    protected CodeTableInterface ct;

    // Optional template of default character sets installed by setDefaultGX(); when null,
    // each conversion starts from the built-in defaults.
    protected CodeTracker altCodeTracker = null;

    // Flag that indicates whether Numeric Character References of the form
    // &#XXXX; (Marc-8 NCR) should be translated to the Unicode code point
    // specified by the 4 hexadecimal digits. Marc-8 NCR is as described on
    // this page http://www.loc.gov/marc/specifications/speccharconversion.html#lossless
    // Note: Also translates (optional) "<U+XXXX>" (Unicode BNF) format character references.
    protected boolean translateNCR = false;

    // Flag that indicates we should normalize the results of the convert method (i.e. compose any
    // decomposed Unicode characters).  Default false.
    protected boolean composeUnicode = false;


    /**
     * Reports whether Numeric Character References found in the converted data are
     * translated to the Unicode characters they denote.
     *
     * @return true if NCR sequences are translated to Unicode; false if they are left as-is
     */
    public boolean shouldTranslateNCR() {
        return this.translateNCR;
    }

    /**
     * Sets whether Numeric Character References (i.e. {@code &#XXXX;} sequences) should be
     * translated to Unicode. If {@link #shouldComposeUnicode()} is also true, the NCR
     * translation happens before the compose step.
     * <p>
     * Note: this also translates any (optional) {@code <U+XXXX>} (Unicode BNF) sequences
     * to Unicode.
     *
     * @param translateNCR true to translate NCR sequences to Unicode; false to leave them alone
     */
    public void setTranslateNCR(boolean translateNCR) {
        this.translateNCR = translateNCR;
    }

    /**
     * Reports whether decomposed Unicode characters in the conversion result should be
     * composed.
     *
     * @return true if decomposed characters should be composed; false to leave them alone
     */
    public boolean shouldComposeUnicode() {
        return this.composeUnicode;
    }

    /**
     * Sets whether decomposed Unicode characters should be composed. The default is false.
     *
     * @param composeUnicode true to compose decomposed Unicode characters; false to leave
     *                       them alone
     */
    public void setComposeUnicode(final boolean composeUnicode) {
        this.composeUnicode = composeUnicode;
    }

    // Optional sink for problems found while converting. When null, the same problems are
    // reported by throwing a MarcException instead.
    protected ConverterErrorHandler errorHandler = null;

    /**
     * Creates a converter with no error handler registered, so conversion problems are
     * reported by throwing exceptions.
     */
    public UnimarcToUnicode() {
        this.ct = loadGeneratedTable();
    }

    /**
     * Creates a converter and registers a class that handles its own errors. When one is
     * set, this class records errors with the handler rather than throwing exceptions,
     * letting the handler decide what to do with them.
     *
     * @param errorHandler the handler used to record errors detected while translating
     *                     field data
     */
    public UnimarcToUnicode(final ConverterErrorHandler errorHandler) {
        this.errorHandler = errorHandler;
        this.ct = loadGeneratedTable();
    }

    /**
     * Obtains the code table used for character lookups. The pre-generated table class is
     * instantiated reflectively when it can be; if that fails for any reason, the table is
     * built from the bundled {@code resources/unimarc.xml} resource instead.
     *
     * @return the code table to use for conversion
     */
    private CodeTableInterface loadGeneratedTable() {
        CodeTableInterface table;
        try {
            final Class<?> generatedClass =
                    Class.forName("org.marc4j.converter.impl.UnimarcCodeTableGenerated");
            final Constructor<?> noArgConstructor = generatedClass.getConstructor();
            table = (CodeTableInterface) noArgConstructor.newInstance();
        } catch (final Exception ignored) {
            // Deliberate best-effort: any failure to use the generated class falls back to
            // the XML-defined table.
            table = new CodeTable(getClass().getResourceAsStream("resources/unimarc.xml"));
        }
        return table;
    }

    /**
     * Consumes the run of character set switching sequences (LS0, LS1 and escape sequences)
     * that starts at {@code cdt.offset}, updating the tracker for each one, and stops at the
     * first character that does not start such a sequence or at the end of the data.
     * <p>
     * A malformed escape sequence is handled by skipping the escape character only: it is
     * recorded with the error handler when one is registered, otherwise a
     * {@link MarcException} is thrown.
     *
     * @param data the data being converted
     * @param cdt  the tracker holding the read offset and character set state; modified in place
     * @throws MarcException if an escape sequence is incomplete or unknown and no error
     *                       handler is registered
     */
    private void checkMode(char[] data, CodeTracker cdt) {
        while (cdt.offset < data.length) {
            // NOTE(review): the flag is cleared on every pass, so it is only true on return
            // when a shift sequence consumed in this call selected a multibyte set - confirm
            // that is intended.
            cdt.multibyte = false;
            final char current = data[cdt.offset];
            if (current == LS0) {
                cdt.g0 = cdt.workingG0;
                cdt.offset += 1;
                cdt.multibyte = cdt.isG0multibyte;
            } else if (current == LS1) {
                cdt.g0 = cdt.workingG1;
                cdt.offset += 1;
                cdt.multibyte = cdt.isG1multibyte;
            } else if (isEscape(current)) {
                applyEscapeSequence(data, cdt);
            } else {
                break;
            }
        }
    }

    /**
     * Applies the escape sequence whose escape character sits at {@code cdt.offset}: either
     * a shift that selects one of the designated sets, or a designation that stores a new
     * set code for G0..G3. On success the offset is moved past the whole sequence.
     */
    private void applyEscapeSequence(char[] data, CodeTracker cdt) {
        // An escape as the very last character has nothing after it to inspect.
        if (discardEscapeIfIncomplete(data, cdt, 1)) {
            return;
        }

        // NOTE(review): as in the original code, every length check requires one character
        // to follow the complete sequence, so a sequence that ends exactly at the end of the
        // data is reported as incomplete - confirm that is intended.
        switch (data[cdt.offset + 1]) {
            case LS1R:
                if (discardEscapeIfIncomplete(data, cdt, 2)) {
                    break;
                }
                cdt.g1 = cdt.workingG1;
                cdt.offset += 2;
                cdt.multibyte = cdt.isG1multibyte;
                break;
            case LS2:
                if (discardEscapeIfIncomplete(data, cdt, 2)) {
                    break;
                }
                cdt.g0 = cdt.workingG2;
                cdt.offset += 2;
                cdt.multibyte = cdt.isG2multibyte;
                break;
            case LS2R:
                if (discardEscapeIfIncomplete(data, cdt, 2)) {
                    break;
                }
                cdt.g1 = cdt.workingG2;
                cdt.offset += 2;
                cdt.multibyte = cdt.isG2multibyte;
                break;
            case LS3:
                if (discardEscapeIfIncomplete(data, cdt, 2)) {
                    break;
                }
                cdt.g0 = cdt.workingG3;
                cdt.offset += 2;
                cdt.multibyte = cdt.isG3multibyte;
                break;
            case LS3R:
                if (discardEscapeIfIncomplete(data, cdt, 2)) {
                    break;
                }
                cdt.g1 = cdt.workingG3;
                cdt.offset += 2;
                cdt.multibyte = cdt.isG3multibyte;
                break;
            case 0x28:
            case 0x2C:
                if (discardEscapeIfIncomplete(data, cdt, 3)) {
                    break;
                }
                cdt.workingG0 = data[cdt.offset + 2];
                cdt.offset += 3;
                cdt.isG0multibyte = false;
                break;
            case 0x29:
            case 0x2D:
                if (discardEscapeIfIncomplete(data, cdt, 3)) {
                    break;
                }
                cdt.workingG1 = data[cdt.offset + 2];
                cdt.offset += 3;
                cdt.isG1multibyte = false;
                break;
            case 0x2A:
            case 0x2E:
                if (discardEscapeIfIncomplete(data, cdt, 3)) {
                    break;
                }
                cdt.workingG2 = data[cdt.offset + 2];
                cdt.offset += 3;
                cdt.isG2multibyte = false;
                break;
            case 0x2B:
            case 0x2F:
                if (discardEscapeIfIncomplete(data, cdt, 3)) {
                    break;
                }
                cdt.workingG3 = data[cdt.offset + 2];
                cdt.offset += 3;
                cdt.isG3multibyte = false;
                break;
            case 0x24:
                applyMultibyteDesignation(data, cdt);
                // Must not fall through: a handled designation is not an unknown code.
                break;
            default:
                // Unknown code character found: discard the escape character and carry on
                // (when an error handler is registered).
                discardEscape("Unknown character set code found following escape character.", data, cdt);
                break;
        }
    }

    /**
     * Applies an escape sequence whose second character is 0x24, which stores a set code
     * flagged as multibyte for one of G0..G3. The escape character sits at {@code cdt.offset}.
     */
    private void applyMultibyteDesignation(char[] data, CodeTracker cdt) {
        if (discardEscapeIfIncomplete(data, cdt, 2)) {
            return;
        }

        switch (data[cdt.offset + 2]) {
            case 0x2C:
                if (discardEscapeIfIncomplete(data, cdt, 4)) {
                    return;
                }
                cdt.workingG0 = data[cdt.offset + 3];
                cdt.offset += 4;
                cdt.isG0multibyte = true;
                break;
            case 0x29:
            case 0x2D:
                if (discardEscapeIfIncomplete(data, cdt, 4)) {
                    return;
                }
                cdt.workingG1 = data[cdt.offset + 3];
                cdt.offset += 4;
                cdt.isG1multibyte = true;
                break;
            case 0x2A:
            case 0x2E:
                if (discardEscapeIfIncomplete(data, cdt, 4)) {
                    return;
                }
                cdt.workingG2 = data[cdt.offset + 3];
                cdt.offset += 4;
                cdt.isG2multibyte = true;
                break;
            case 0x2B:
            case 0x2F:
                if (discardEscapeIfIncomplete(data, cdt, 4)) {
                    return;
                }
                cdt.workingG3 = data[cdt.offset + 3];
                cdt.offset += 4;
                cdt.isG3multibyte = true;
                break;
            default:
                // Three-character form: the character right after 0x24 is the G0 set code.
                if (discardEscapeIfIncomplete(data, cdt, 3)) {
                    return;
                }
                cdt.workingG0 = data[cdt.offset + 2];
                cdt.offset += 3;
                cdt.isG0multibyte = true;
                break;
        }
    }

    /**
     * Checks that {@code cdt.offset + lookahead} is still inside the data. When it is not,
     * the escape character is discarded and the problem reported.
     *
     * @return true if the sequence was incomplete and the escape character was discarded,
     *         in which case the caller must not apply the sequence
     */
    private boolean discardEscapeIfIncomplete(char[] data, CodeTracker cdt, int lookahead) {
        if (cdt.offset + lookahead < data.length) {
            return false;
        }
        discardEscape("Incomplete character set code found following escape character.", data, cdt);
        return true;
    }

    /**
     * Skips the escape character at {@code cdt.offset} and reports the problem: recorded as
     * a minor error when an error handler is registered, otherwise thrown as a
     * {@link MarcException}. The reported offset is the position after the escape character.
     */
    private void discardEscape(String problem, char[] data, CodeTracker cdt) {
        cdt.offset += 1;
        final String location = " At offset " + cdt.offset + ":" + Arrays.toString(data);
        if (errorHandler != null) {
            errorHandler.addError(MarcError.MINOR_ERROR, problem + " Discarding escape character." + location);
        } else {
            throw new MarcException(problem + location);
        }
    }

    /**
     * Discards any default character sets installed through {@code setDefaultGX}, so that
     * later conversions start from the built-in default sets again.
     */
    public void resetDefaultGX() {
        this.altCodeTracker = null;
    }

    /**
     * Allows the caller to set the default G0/G1/G2/G3 character sets used as the starting
     * state of later conversions. Each code is resolved with
     * {@code UnimarcCommon.determineCharSet}; a result that is not greater than zero leaves
     * the built-in default for that set untouched. G0 and G1 also become the sets in use at
     * the start of a conversion, while G2 and G3 are only recorded as designated sets.
     *
     * @param altG0Code string pulled from 100 $a/26-27
     * @param altG1Code string pulled from 100 $a/28-29
     * @param altG2Code string pulled from 100 $a/30-31
     * @param altG3Code string pulled from 100 $a/32-33
     */
    public void setDefaultGX(String altG0Code, String altG1Code, String altG2Code, String altG3Code) {
        final CodeTracker defaults = new CodeTracker();
        altCodeTracker = defaults;

        final int g0Set = UnimarcCommon.determineCharSet(altG0Code);
        if (g0Set > 0) {
            defaults.workingG0 = g0Set;
            defaults.g0 = g0Set;
            defaults.isG0multibyte = false;
        }

        final int g1Set = UnimarcCommon.determineCharSet(altG1Code);
        if (g1Set > 0) {
            defaults.workingG1 = g1Set;
            defaults.g1 = g1Set;
            defaults.isG1multibyte = false;
        }

        final int g2Set = UnimarcCommon.determineCharSet(altG2Code);
        if (g2Set > 0) {
            defaults.workingG2 = g2Set;
            defaults.isG2multibyte = false;
        }

        final int g3Set = UnimarcCommon.determineCharSet(altG3Code);
        if (g3Set > 0) {
            defaults.workingG3 = g3Set;
            defaults.isG3multibyte = false;
        }
    }

    /**
     * 

     * Converts UNIMARC data to UCS/Unicode.
     * 
     *
     * @param data the UNIMARC data
     * @return String - the UCS/Unicode data
     */
    public String convert(char[] data) {
        StringBuilder sb = new StringBuilder();
        int len = data.length;

        CodeTracker cdt = new CodeTracker(altCodeTracker);

        checkMode(data, cdt);

        Queue diacritics = new Queue();

        boolean unrecognizedUnicode = false;

        while (cdt.offset < data.length) {
            if (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
                && hasNext(cdt.offset, len)) {

                while (ct.isCombining(data[cdt.offset], cdt.g0, cdt.g1)
                    && hasNext(cdt.offset, len)) {
                    diacritics.put(getChar(data[cdt.offset], cdt.g0, cdt.g1));
                    cdt.offset++;
                    checkMode(data, cdt);
                }

                char c2 = getChar(data[cdt.offset], cdt.g0, cdt.g1);
                cdt.offset++;
                checkMode(data, cdt);
                sb.append(c2);

                while (!diacritics.isEmpty()) {
                    char c1 = (Character) diacritics.get();
                    sb.append(c1);
                }
            } else if (cdt.multibyte) {
                sb.append(ct.getChar(
                    makeMultibyte(new String(data).substring(cdt.offset, cdt.offset + 4).toCharArray()), cdt.g0));
                cdt.offset += 3;
            } else {
                char c = getChar(data[cdt.offset], cdt.g0, cdt.g1);
                if (c != 0) {
                    sb.append(c);
                } else {
                    // Uh oh.  c == 0.  Don't know what to do with this character.  No Unicode equivalent.  Encode it in the output
                    // as , where XXXX is the Marc character.
                    // Note this is odd:  Normally a  represents a real Unicode character.  In this case, it
                    // represents the original MARC character that can't be converted to Unicode.  So, it's misleading
                    // in the result :(  But - fixing it now could break some other client code.
                    String val = UnicodeUtils.convertUnicodeToUnicodeBNF(data[cdt.offset]);
                    if (translateNCR) {
                        // if translateNCR, then Add it as "XXXX>" so that the normal convertNCRToUnicode won't immediately change it back to a unicode
                        // value.  After the 'translateNCR' takes place, we'll fix this up.
                        val = val.substring(0,3) + '>' + val.substring(3);
                        unrecognizedUnicode = true;
                    }
                    sb.append(val);
                }
                cdt.offset += 1;
            }
            if (hasNext(cdt.offset, len)) {
                checkMode(data, cdt);
            }
        }

        if (translateNCR) {
            UnicodeUtils.convertNCRToUnicode(sb);
            FixDoubleWidth.removeInvalidSecondHalf(sb);
        }
        String dataElement = sb.toString();
        if (unrecognizedUnicode) {
            // Replace "XXXX>" with ""
            dataElement = dataElement.replaceAll("", "

    

    

    
            
    
            

    
        
            
                Related Artifacts
                
                     mysql-connector-java mysql
 facebook-messenger com.github.codedrinker
 selenium-java org.seleniumhq.selenium
 instagram-java com.github.sola92
 gson com.google.code.gson
 poi org.apache.poi
 httpclient org.apache.httpcomponents
 json org.json
 facebook-java-api com.google.code.facebook-java-api
 poi-ooxml org.apache.poi
 jackson-databind com.fasterxml.jackson.core
 junit junit
 primefaces org.primefaces
 ojdbc7 com.github.noraui
 jfoenix com.jfoenix
 testng org.testng
 json-simple com.googlecode.json-simple
 selenium-server org.seleniumhq.selenium
 itextpdf com.itextpdf
 spring-core org.springframework
                
            
        
        
            
                Related Groups
                
                     org.springframework
 org.apache.poi
 org.hibernate
 org.springframework.boot
 com.fasterxml.jackson.core
 com.itextpdf
 org.seleniumhq.selenium
 mysql
 org.finos.legend.engine
 org.apache.httpcomponents
 org.apache.logging.log4j
 org.openjfx
 org.apache.commons
 org.json
 com.google.guava
 com.google.zxing
 net.sf.jasperreports
 javax.xml.bind
 ojdbc
 com.google.code.facebook-java-api