
/**
 * Copyright (C) 2004 Bas Peters
 *
 * This file is part of MARC4J
 *
 * MARC4J is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * MARC4J is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with MARC4J; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.marc4j;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.marc4j.converter.CharConverter;
import org.marc4j.marc.ControlField;
import org.marc4j.marc.DataField;
import org.marc4j.marc.Leader;
import org.marc4j.marc.MarcFactory;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.marc.VariableField;
import org.marc4j.util.Normalizer;

import info.freelibrary.marc4j.converter.impl.AnselToUnicode;
import info.freelibrary.marc4j.converter.impl.Iso5426ToUnicode;

/**
 * An iterator over a collection of MARC records in ISO 2709 format that is designed to handle MARC records
 * containing errors in their structure or their encoding. If the permissive flag is set in the call to the
 * constructor, or if an ErrorHandler object is passed in as a parameter to the constructor, this reader will do its
 * best to detect and recover from a number of structural or encoding errors that can occur in a MARC record. Note
 * that if this reader is not set to read permissively, it will operate almost identically to the
 * MarcStreamReader class. Note that no attempt is made to validate the contents of the record at a semantic level.
 * This reader does not know and does not care whether the record has a 245 field, or if the 008 field is the right
 * length, but if the record claims to be UTF-8 or MARC8 encoded and you are seeing gibberish in the output, or if the
 * reader is throwing an exception while trying to read a record, then this reader may be able to produce a usable
 * record from the bad data you have. The ability to translate the record directly to UTF-8 as it is being read in is
 * useful in cases where the UTF-8 version of the record will be used directly by the program that is reading the
 * MARC data, for instance if the MARC records are to be indexed into a Solr search engine. Previously the MARC
 * record could only be translated to UTF-8 as it was being written out via a MarcStreamWriter or a MarcXmlWriter.
 *
 * <p>
 * Example usage:
 * </p>
 *
 * <pre>
 * InputStream input = new FileInputStream("file.mrc");
 * MarcReader reader = new MarcPermissiveStreamReader(input, true, true);
 * while (reader.hasNext()) {
 *     Record record = reader.next();
 *     // Process record
 * }
 * </pre>
 *
 * <p>
 * Check the {@link org.marc4j.marc} package for examples about the use of the {@link org.marc4j.marc.Record} object
 * model. Check the file org.marc4j.samples.PermissiveReaderExample.java for an example of using the
 * MarcPermissiveStreamReader in conjunction with the ErrorHandler class to report errors encountered while
 * processing records.
 * </p>
 *
 * <p>
 * When no encoding is given as a constructor argument, the parser tries to resolve the encoding by looking at the
 * character coding scheme (leader position 9) in MARC21 records. For UNIMARC records this position is not defined.
 * If the reader is operating in permissive mode and no encoding is given as a constructor argument, the reader will
 * look at the leader, and also at the data of the record, to determine to the best of its ability what character
 * encoding scheme has been used to encode the data in a particular MARC record.
 * </p>
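 *
 * <p>
 * A minimal sketch of error reporting in permissive mode is shown below. It assumes only the ErrorHandler
 * constructor described above and the hasErrors()/getErrors() accessors used by the sample program; see
 * org.marc4j.samples.PermissiveReaderExample.java for the complete treatment.
 * </p>
 *
 * <pre>
 * InputStream input = new FileInputStream("file.mrc");
 * ErrorHandler handler = new ErrorHandler();
 * MarcReader reader = new MarcPermissiveStreamReader(input, handler, true);
 * while (reader.hasNext()) {
 *     Record record = reader.next();
 *     if (handler.hasErrors()) {
 *         // Report the errors logged while decoding this record.
 *         for (Object error : handler.getErrors()) {
 *             System.err.println(error);
 *         }
 *     }
 * }
 * </pre>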
 *
 * @author Robert Haschart
 */
public class MarcPermissiveStreamReader implements MarcReader {

    private DataInputStream input = null;

    private Record record;

    private final MarcFactory factory;

    private String encoding = "ISO8859_1";

    // This represents the expected encoding of the data when a
    // MARC record does not have a 'a' in character 9 of the leader.
    private String defaultEncoding = "ISO8859_1";

    private boolean convertToUTF8 = false;

    private boolean permissive = false;

    private boolean translateLosslessUnicodeNumericCodeReferencesEnabled = true;

    private int marc_file_lookahead_buffer = 200000;

    private AnselToUnicode converterAnsel = null;

    private CharConverter converterUnimarc = null;

    // These are used to algorithmically determine what encoding scheme was
    // used to encode the data in the Marc record
    private String conversionCheck1 = null;

    private String conversionCheck2 = null;

    private String conversionCheck3 = null;

    private ErrorHandler errors;

    static String validSubfieldCodes = "abcdefghijklmnopqrstuvwxyz0123456789";

    static String upperCaseSubfieldsProperty = "org.marc4j.MarcPermissiveStreamReader.upperCaseSubfields";

    /**
     * Constructs an instance with the specified input stream with possible additional functionality being enabled
     * by setting permissive and/or convertToUTF8 to true. If permissive and convertToUTF8 are both set to false, it
     * functions almost identically to the MarcStreamReader class.
     */
    public MarcPermissiveStreamReader(final InputStream input, final boolean permissive,
            final boolean convertToUTF8) {
        this.permissive = permissive;
        this.input = new DataInputStream(new BufferedInputStream(input));
        factory = MarcFactory.newInstance();
        this.convertToUTF8 = convertToUTF8;
        errors = null;
        if (permissive) {
            errors = new ErrorHandler();
            defaultEncoding = "BESTGUESS";
        }
    }

    /**
     * Constructs an instance with the specified input stream with possible additional functionality being enabled
     * by passing in an ErrorHandler object and/or setting convertToUTF8 to true. If errors is null and convertToUTF8
     * is set to false, it functions almost identically to the MarcStreamReader class. If an ErrorHandler object is
     * passed in, that object will be used to log and track any errors in the records as the records are decoded.
     * After the next() function returns, you can query to determine whether any errors were detected in the decoding
     * process. See the file org.marc4j.samples.PermissiveReaderExample.java to see how this can be done.
     */
    public MarcPermissiveStreamReader(final InputStream input, final ErrorHandler errors,
            final boolean convertToUTF8) {
        if (errors != null) {
            permissive = true;
            defaultEncoding = "BESTGUESS";
        }
        this.input = new DataInputStream((input.markSupported()) ? input : new BufferedInputStream(input));
        factory = MarcFactory.newInstance();
        this.convertToUTF8 = convertToUTF8;
        this.errors = errors;
    }

    /**
     * Constructs an instance with the specified input stream with possible additional functionality being enabled
     * by setting permissive and/or convertToUTF8 to true. If permissive and convertToUTF8 are both set to false, it
     * functions almost identically to the MarcStreamReader class. The parameter defaultEncoding is used to specify
     * the character encoding that is used in the records that will be read from the input stream. If permissive is
     * set to true, you can specify "BESTGUESS" as the default encoding, and the reader will attempt to determine the
     * character encoding used in the records being read from the input stream. This is especially useful if you are
     * working with records downloaded from an external source and the encoding is either unknown or the encoding is
     * different from what the records claim to be.
     */
    public MarcPermissiveStreamReader(final InputStream input, final boolean permissive,
            final boolean convertToUTF8, final String defaultEncoding) {
        this.permissive = permissive;
        this.input = new DataInputStream((input.markSupported()) ? input : new BufferedInputStream(input));
        factory = MarcFactory.newInstance();
        this.convertToUTF8 = convertToUTF8;
        this.defaultEncoding = defaultEncoding;
        errors = null;
        if (permissive) {
            errors = new ErrorHandler();
        }
    }

    /**
     * Constructs an instance with the specified input stream with possible additional functionality being enabled
     * by setting permissive and/or convertToUTF8 to true. If errors is null and convertToUTF8 is set to false, it
     * functions almost identically to the MarcStreamReader class. The parameter defaultEncoding is used to specify
     * the character encoding that is used in the records that will be read from the input stream. If permissive is
     * set to true, you can specify "BESTGUESS" as the default encoding, and the reader will attempt to determine the
     * character encoding used in the records being read from the input stream. This is especially useful if you are
     * working with records downloaded from an external source and the encoding is either unknown or the encoding is
     * different from what the records claim to be. If an ErrorHandler object is passed in, that object will be used
     * to log and track any errors in the records as the records are decoded. After the next() function returns, you
     * can query to determine whether any errors were detected in the decoding process. See the file
     * org.marc4j.samples.PermissiveReaderExample.java to see how this can be done.
     */
    public MarcPermissiveStreamReader(final InputStream input, final ErrorHandler errors,
            final boolean convertToUTF8, final String defaultEncoding) {
        this.permissive = true;
        this.input = new DataInputStream(new BufferedInputStream(input));
        factory = MarcFactory.newInstance();
        this.convertToUTF8 = convertToUTF8;
        this.defaultEncoding = defaultEncoding;
        this.errors = errors;
    }

    /**
     * @return true if numeric character entities like &amp;#xXXXX; should be converted to their corresponding code
     *         point when converting to Unicode. Default is to convert.
     */
    public boolean isTranslateLosslessUnicodeNumericCodeReferencesEnabled() {
        return translateLosslessUnicodeNumericCodeReferencesEnabled;
    }

    /**
     * Enable conversion of numeric code references into their corresponding code points when converting to Unicode.
     *
     * @param translateLosslessUnicodeNumericCodeReferencesEnabled
     */
    public void setTranslateLosslessUnicodeNumericCodeReferencesEnabled(
            final boolean translateLosslessUnicodeNumericCodeReferencesEnabled) {
        this.translateLosslessUnicodeNumericCodeReferencesEnabled =
                translateLosslessUnicodeNumericCodeReferencesEnabled;
    }

    /**
     * Returns true if the iteration has more records, false otherwise.
     */
    @Override
    public boolean hasNext() {
        try {
            input.mark(10);
            int byteread = input.read();
            if (byteread == -1) {
                return false;
            }
            // byte[] recLengthBuf = new byte[5];
            int numBadBytes = 0;
            while (byteread < '0' || byteread > '9') {
                byteread = input.read();
                numBadBytes++;
                if (byteread == -1) {
                    return false;
                }
            }
            input.reset();
            while (numBadBytes > 0) {
                byteread = input.read();
                numBadBytes--;
            }
        } catch (final IOException e) {
            throw new MarcException(e.getMessage(), e);
        }
        return true;
    }

    /**
     * Returns the next record in the iteration.
* * @return Record - the record object */ @Override public Record next() { record = factory.newRecord(); if (errors != null) { errors.reset(); } try { final byte[] byteArray = new byte[24]; input.readFully(byteArray); int recordLength = parseRecordLength(byteArray); byte[] recordBuf = new byte[recordLength - 24]; if (permissive) { input.mark(marc_file_lookahead_buffer); input.readFully(recordBuf); if (recordBuf[recordBuf.length - 1] != Constants.RT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Record terminator character not found at end of record length"); recordBuf = rereadPermissively(input, recordBuf, recordLength); recordLength = recordBuf.length + 24; } } else { input.readFully(recordBuf); } // final String tmp = new String(recordBuf); parseRecord(record, byteArray, recordBuf, recordLength); if (this.convertToUTF8) { final Leader l = record.getLeader(); l.setCharCodingScheme('a'); record.setLeader(l); } return (record); } catch (final EOFException e) { throw new MarcException("Premature end of file encountered", e); } catch (final IOException e) { throw new MarcException("an error occured reading input", e); } } private byte[] rereadPermissively(final DataInputStream aInput, final byte[] aRecordBuf, final int aRecordLength) throws IOException { int loc = arrayContainsAt(aRecordBuf, Constants.RT); int recordLength = aRecordLength; byte[] recordBuf = aRecordBuf; if (loc != -1) { // stated record length is too long errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Record terminator appears before stated record length, using shorter record"); recordLength = loc + 24; input.reset(); recordBuf = new byte[recordLength - 24]; input.readFully(recordBuf); } else { // stated record length is too short read ahead loc = recordLength - 24; boolean done = false; while (!done) { int c = 0; do { c = input.read(); loc++; } while (loc < (marc_file_lookahead_buffer - 24) && c != Constants.RT && c != -1); if (c == Constants.RT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Record terminator appears after stated record length, reading extra bytes"); recordLength = loc + 24; input.reset(); recordBuf = new byte[recordLength - 24]; input.readFully(recordBuf); done = true; } else if (c == -1) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "No Record terminator found, end of file reached, Terminator appended"); recordLength = loc + 24; input.reset(); recordBuf = new byte[recordLength - 24 + 1]; input.readFully(recordBuf); recordBuf[recordBuf.length - 1] = Constants.RT; done = true; } else { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "No Record terminator found within " + marc_file_lookahead_buffer + " bytes of start of record, getting desperate."); input.reset(); marc_file_lookahead_buffer *= 2; input.mark(marc_file_lookahead_buffer); loc = 0; } } } return (recordBuf); } private void parseRecord(final Record record, final byte[] aByteArray, final byte[] aRecordBuf, final int recordLength) { int directoryLength = 0; byte[] byteArray = aByteArray; byte[] recordBuf = aRecordBuf; Leader ldr; ldr = factory.newLeader(); ldr.setRecordLength(recordLength); // These variables are used when the permissive reader is trying to make its best guess // as to what character encoding is actually used in the record being processed. 
conversionCheck1 = ""; conversionCheck2 = ""; conversionCheck3 = ""; try { parseLeader(ldr, byteArray); directoryLength = ldr.getBaseAddressOfData() - (24 + 1); } catch (final IOException e) { throw new MarcException("error parsing leader with data: " + new String(byteArray), e); } catch (final MarcException e) { if (permissive) { if (recordBuf[recordBuf.length - 1] == Constants.RT && recordBuf[recordBuf.length - 2] == Constants.FT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Error parsing leader, trying to re-read leader either shorter or longer"); // make an attempt to recover record. int offset = 0; while (offset < recordBuf.length) { if (recordBuf[offset] == Constants.FT) { break; } offset++; } if (offset % 12 == 1) { // move one byte from body to leader, make new leader, and try again errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Leader appears to be too short, moving one byte from record body to leader, " + "and trying again"); final byte oldBody[] = recordBuf; recordBuf = new byte[oldBody.length - 1]; System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length - 1); directoryLength = offset - 1; ldr.setIndicatorCount(2); ldr.setSubfieldCodeLength(2); ldr.setImplDefined1(("" + (char) byteArray[7] + " ").toCharArray()); ldr.setImplDefined2(("" + (char) byteArray[18] + (char) byteArray[19] + (char) byteArray[20]) .toCharArray()); ldr.setEntryMap("4500".toCharArray()); // if its ' ' or 'a' if (byteArray[10] == (byte) ' ' || byteArray[10] == (byte) 'a') { ldr.setCharCodingScheme((char) byteArray[10]); } } else if (offset % 12 == 11) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Leader appears to be too long, moving one byte from leader to record body, " + "and trying again"); final byte oldBody[] = recordBuf; recordBuf = new byte[oldBody.length + 1]; System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length); recordBuf[0] = (byte) '0'; directoryLength = offset + 1; ldr.setIndicatorCount(2); ldr.setSubfieldCodeLength(2); ldr.setImplDefined1(("" + (char) byteArray[7] + " ").toCharArray()); ldr.setImplDefined2(("" + (char) byteArray[16] + (char) byteArray[17] + (char) byteArray[18]) .toCharArray()); ldr.setEntryMap("4500".toCharArray()); // if its ' ' or 'a' if (byteArray[8] == (byte) ' ' || byteArray[8] == (byte) 'a') { ldr.setCharCodingScheme((char) byteArray[10]); } // if its ' ' or 'a' if (byteArray[10] == (byte) ' ' || byteArray[10] == (byte) 'a') { ldr.setCharCodingScheme((char) byteArray[10]); } } else { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "error parsing leader with data: " + new String(byteArray)); throw new MarcException("error parsing leader with data: " + new String(byteArray), e); } } } else { throw new MarcException("error parsing leader with data: " + new String(byteArray), e); } } final char tmp[] = ldr.getEntryMap(); if (permissive && !("" + tmp[0] + tmp[1] + tmp[2] + tmp[3]).equals("4500")) { if (tmp[0] >= '0' && tmp[0] <= '9' && tmp[1] >= '0' && tmp[1] <= '9' && tmp[2] >= '0' && tmp[2] <= '9' && tmp[3] >= '0' && tmp[3] <= '9') { errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, "Unusual character found at end of leader [ " + tmp[0] + tmp[1] + tmp[2] + tmp[3] + " ]"); } else { errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO, "Erroneous character found at end of leader [ " + tmp[0] + tmp[1] + tmp[2] + tmp[3] + " ]; changing them to the standard \"4500\""); ldr.setEntryMap("4500".toCharArray()); } } // if MARC 21 then check encoding switch 
(ldr.getCharCodingScheme()) { case 'a': encoding = "UTF8"; break; case ' ': if (convertToUTF8) { encoding = defaultEncoding; } else { encoding = "ISO8859_1"; } break; default: if (convertToUTF8) { if (permissive) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Record character encoding should be 'a' or ' ' in this record it is '" + ldr .getCharCodingScheme() + "'. Attempting to guess the correct encoding."); encoding = "BESTGUESS"; } else { encoding = defaultEncoding; } } else { encoding = "ISO8859_1"; } break; } String utfCheck; if (encoding.equalsIgnoreCase("BESTGUESS")) { try { final String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1"); // If record has MARC8 character set selection strings, it must be MARC8 encoded if (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1) { encoding = "MARC8"; } else { boolean hasHighBitChars = false; for (int i = 0; i < recordBuf.length; i++) { if (recordBuf[i] < 0) { // the high bit is set hasHighBitChars = true; break; } } if (!hasHighBitChars) { encoding = "ISO8859_1"; // You can choose any encoding you want here, the results will be the // same. } else { utfCheck = new String(recordBuf, "UTF-8"); final byte byteCheck[] = utfCheck.getBytes("UTF-8"); encoding = "UTF8"; if (recordBuf.length == byteCheck.length) { for (int i = 0; i < recordBuf.length; i++) { if (byteCheck[i] != recordBuf[i]) { encoding = "MARC8-Maybe"; break; } } } else { encoding = "MARC8-Maybe"; } } } } catch (final UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else if (permissive && encoding.equals("UTF8")) { try { utfCheck = new String(recordBuf, "UTF-8"); final byte byteCheck[] = utfCheck.getBytes("UTF-8"); if (recordBuf.length != byteCheck.length) { boolean foundESC = false; for (int i = 0; i < recordBuf.length; i++) { if (recordBuf[i] == 0x1B) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Record claims to be UTF-8, but its not. Its probably MARC8."); encoding = "MARC8-Maybe"; foundESC = true; break; } if (byteCheck[i] != recordBuf[i]) { encoding = "MARC8-Maybe"; } } if (!foundESC) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Record claims to be UTF-8, but its not. It may be MARC8, or maybe UNIMARC, " + "or maybe raw ISO-8859-1 "); } } if (utfCheck.contains("a$1!")) { encoding = "MARC8-Broken"; errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Record claims to be UTF-8, but its not. It seems to be MARC8-encoded but with " + "missing escape codes."); } } catch (final UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else if (permissive && !encoding.equals("UTF8") && convertToUTF8) { try { final String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1"); final boolean hasMarc8EscSeq = (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1); utfCheck = new String(recordBuf, "UTF-8"); final byte byteCheck[] = utfCheck.getBytes("UTF-8"); if (recordBuf.length == byteCheck.length) { for (int i = 0; i < recordBuf.length; i++) { // need to check for byte < 0 to see if the high bit is set, // because Java doesn't have unsigned types. 
if (recordBuf[i] < 0x00 || byteCheck[i] != recordBuf[i]) { // If record has MARC8 character set selection strings, it must be MARC8 encoded if (hasMarc8EscSeq) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Record has MARC8 escape sequences, " + "but also seem to have UTF8-encoded characters."); encoding = "MARC8-Maybe"; } else { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Record claims not to be UTF-8, but it seems to be."); encoding = "UTF8-Maybe"; } break; } } } } catch (final UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } } record.setLeader(ldr); boolean discardOneAtStartOfDirectory = false; boolean discardOneSomewhereInDirectory = false; if ((directoryLength % 12) != 0) { // which equals 99999 - (24 + 1) its a // BIG record (its directory is over // 100000 bytes) if (permissive && directoryLength == 99974 && recordLength > 200000) { directoryLength = 0; int tmpLength = 0; for (tmpLength = 0; tmpLength < recordLength; tmpLength += 12) { if (recordBuf[tmpLength] == Constants.FT) { directoryLength = tmpLength; break; } } if (directoryLength == 0) { throw new MarcException( "Directory is too big (> 99999 bytes) and it doesn't end with a field terminator " + "character, I give up. Unable to continue."); } } else if (permissive && directoryLength % 12 == 11 && recordBuf[1] != (byte) '0') { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Directory length is not a multiple of 12 bytes long. Prepending a zero and trying to " + "continue."); final byte oldBody[] = recordBuf; recordBuf = new byte[oldBody.length + 1]; System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length); recordBuf[0] = (byte) '0'; directoryLength = directoryLength + 1; } else { if (permissive && directoryLength % 12 == 1 && recordBuf[1] == (byte) '0' && recordBuf[2] == (byte) '0') { discardOneAtStartOfDirectory = true; errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Directory length is not a multiple of 12 bytes long. Discarding byte from start of " + "directory and trying to continue."); } else if (permissive && directoryLength % 12 == 1 && recordLength > 10000 && recordBuf[0] == (byte) '0' && recordBuf[1] == (byte) '0' && recordBuf[2] > (byte) '0' && recordBuf[2] <= (byte) '9') { discardOneSomewhereInDirectory = true; errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR, "Directory length is not a multiple of 12 bytes long. Will look for oversized field " + "and try to work around it."); } else { if (errors != null) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Directory length is not a multiple of 12 bytes long. Unable to continue."); } throw new MarcException( "Directory length is not a multiple of 12 bytes long. 
Unable to continue."); } } } final DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf)); final int size = directoryLength / 12; final String[] tags = new String[size]; final int[] lengths = new int[size]; final byte[] tag = new byte[3]; final byte[] length = new byte[4]; final byte[] start = new byte[5]; String tmpStr; try { if (discardOneAtStartOfDirectory) { inputrec.read(); } int totalOffset = 0; for (int i = 0; i < size; i++) { inputrec.readFully(tag); tmpStr = new String(tag); tags[i] = tmpStr; boolean proceedNormally = true; if (discardOneSomewhereInDirectory) { final byte lenCheck[] = new byte[10]; inputrec.mark(20); inputrec.readFully(lenCheck); // proceed normally if (byteCompare(lenCheck, 4, 5, totalOffset)) { proceedNormally = true; } else if (byteCompare(lenCheck, 5, 5, totalOffset)) { // field length is 5 bytes! Bad Marc record, // proceed normally discardOneSomewhereInDirectory = false; errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Field is longer than 9999 bytes. Writing this record out will result in a bad " + "record."); proceedNormally = false; } else { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Unable to reconcile problems in directory. Unable to continue."); throw new MarcException( "Directory length is not a multiple of 12 bytes long. Unable to continue."); } inputrec.reset(); } if (proceedNormally) { inputrec.readFully(length); tmpStr = new String(length); lengths[i] = Integer.parseInt(tmpStr); inputrec.readFully(start); } else { // length is 5 bytes long inputrec.readFully(start); tmpStr = new String(start); lengths[i] = Integer.parseInt(tmpStr); inputrec.readFully(start); } totalOffset += lengths[i]; } // If we still haven't found the extra byte, throw out the last byte and try to continue; if (discardOneSomewhereInDirectory) { inputrec.read(); } if (inputrec.read() != Constants.FT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Expected field terminator at end of directory. Unable to continue."); throw new MarcException("expected field terminator at end of directory"); } int numBadLengths = 0; int totalLength = 0; for (int i = 0; i < size; i++) { final int fieldLength = getFieldLength(inputrec); if (fieldLength + 1 != lengths[i] && permissive) { if (numBadLengths < 5 && (totalLength + fieldLength < recordLength + 26)) { inputrec.mark(9999); byteArray = new byte[lengths[i]]; inputrec.readFully(byteArray); inputrec.reset(); if (fieldLength + 1 < lengths[i] && byteArray[lengths[i] - 1] == Constants.FT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Field Terminator character found in the middle of a field."); } else { numBadLengths++; lengths[i] = fieldLength + 1; errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR, "Field length found in record different from length stated in the directory."); if (fieldLength + 1 > 9999) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Field length is greater than 9999, record cannot be represented as a " + "binary Marc record."); } } } } totalLength += lengths[i]; if (Constants.CF_TAG_PATTERN.matcher(tags[i]).find()) { byteArray = new byte[lengths[i] - 1]; inputrec.readFully(byteArray); if (inputrec.read() != Constants.FT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Expected field terminator at end of field. 
Unable to continue."); throw new MarcException("expected field terminator at end of field"); } final ControlField field = factory.newControlField(); field.setTag(tags[i]); field.setData(getDataAsString(byteArray)); record.addVariableField(field); } else { byteArray = new byte[lengths[i]]; inputrec.readFully(byteArray); try { record.addVariableField(parseDataField(tags[i], byteArray)); } catch (final IOException e) { throw new MarcException("error parsing data field for tag: " + tags[i] + " with data: " + new String(byteArray), e); } } } // We've determined that although the record says it is UTF-8, it is not. // Here we make an attempt to determine the actual encoding of the data in the record. if (permissive && conversionCheck1.length() > 1 && conversionCheck2.length() > 1 && conversionCheck3 .length() > 1) { guessAndSelectCorrectNonUTF8Encoding(); } if (inputrec.read() != Constants.RT) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Expected record terminator at end of record. Unable to continue."); throw new MarcException("expected record terminator"); } } catch (final IOException e) { errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL, "Error reading from data file. Unable to continue."); throw new MarcException("an error occured reading input", e); } } private boolean byteCompare(final byte[] lenCheck, final int offset, final int length, final int totalOffset) { int divisor = 1; for (int i = offset + length - 1; i >= offset; i--, divisor *= 10) { if (((totalOffset / divisor) % 10) + '0' != lenCheck[i]) { return (false); } } return true; } private void guessAndSelectCorrectNonUTF8Encoding() { int defaultPart = 0; if (record.getVariableField("245") == null) { defaultPart = 1; } int partToUse = 0; final int l1 = conversionCheck1.length(); final int l2 = conversionCheck2.length(); final int l3 = conversionCheck3.length(); int tst; if (l1 < l3 && l2 == l3 && defaultPart == 0) { errors.addError(ErrorHandler.INFO, "MARC8 translation shorter than ISO-8859-1, choosing MARC8."); partToUse = 0; } else if (l2 < l1 - 2 && l2 < l3 - 2) { errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it."); partToUse = 1; } else if ((tst = onlyOneStartsWithUpperCase(conversionCheck1, conversionCheck2, conversionCheck3)) != -1) { partToUse = tst; } else if (l2 < l1 && l2 < l3) { errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it."); partToUse = 1; } else if (conversionCheck2.equals(conversionCheck3) && !conversionCheck1.trim().contains(" ")) { errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations identical, choosing ISO-8859-1."); partToUse = 2; } else if (!specialCharIsBetweenLetters(conversionCheck1)) { errors.addError(ErrorHandler.INFO, "To few letters in translations, choosing " + (defaultPart == 0 ? "MARC8" : "Unimarc")); partToUse = defaultPart; } else if (l2 == l3 && defaultPart == 1) { errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations equal length, choosing ISO-8859-1."); partToUse = 2; } else { errors.addError(ErrorHandler.INFO, "No Determination made, defaulting to " + (defaultPart == 0 ? 
"MARC8" : "Unimarc")); partToUse = defaultPart; } final List fields = record.getVariableFields(); final Iterator iter = fields.iterator(); while (iter.hasNext()) { final VariableField field = iter.next(); if (field instanceof DataField) { final DataField df = (DataField) field; final List subf = df.getSubfields(); final Iterator sfiter = subf.iterator(); while (sfiter.hasNext()) { final Subfield sf = sfiter.next(); if (sf.getData().contains("%%@%%")) { final String parts[] = sf.getData().split("%%@%%", 3); sf.setData(parts[partToUse]); } } } } } private int onlyOneStartsWithUpperCase(final String conversionCheck12, final String conversionCheck22, final String conversionCheck32) { if (conversionCheck1.length() == 0 || conversionCheck2.length() == 0 || conversionCheck3.length() == 0) { return -1; } final String check1Parts[] = conversionCheck1.trim().split("[|]>"); final String check2Parts[] = conversionCheck2.trim().split("[|]>"); final String check3Parts[] = conversionCheck3.trim().split("[|]>"); for (int i = 1; i < check1Parts.length && i < check2Parts.length && i < check3Parts.length; i++) { final boolean tst1 = Character.isUpperCase(check1Parts[i].charAt(0)); final boolean tst2 = Character.isUpperCase(check2Parts[i].charAt(0)); final boolean tst3 = Character.isUpperCase(check3Parts[i].charAt(0)); if (tst1 && !tst2 && !tst3) { return (0); } if (!tst1 && tst2 && !tst3) { return (-1); } if (!tst1 && !tst2 && tst3) { return (2); } } return -1; } private boolean specialCharIsBetweenLetters(final String conversionCheck) { boolean bewteenLetters = true; for (int i = 0; i < conversionCheck.length(); i++) { final int charCode = (conversionCheck.charAt(i)); if (charCode > 0x7f) { bewteenLetters = false; if (i > 0 && Character.isLetter((int) (conversionCheck.charAt(i - 1))) || (i < conversionCheck .length() - 1 && Character.isLetter((int) (conversionCheck.charAt(i + 1))))) { bewteenLetters = true; break; } } } return (bewteenLetters); } private int arrayContainsAt(final byte[] byteArray, final int ft) { for (int i = 0; i < byteArray.length; i++) { if (byteArray[i] == (byte) ft) { return i; } } return (-1); } private DataField parseDataField(final String tag, final byte[] field) throws IOException { if (permissive) { errors.setRecordID(record.getControlNumber()); if (tag.equals("880")) { String fieldTag = new String(field); fieldTag = fieldTag.replaceFirst("^.*\\x1F6", "").replaceFirst("([-0-9]*).*", "$1"); errors.setCurrentField(tag + "(" + fieldTag + ")"); } else { errors.setCurrentField(tag); } errors.setCurrentSubfield("n/a"); cleanupBadFieldSeperators(field, errors); } final ByteArrayInputStream bais = new ByteArrayInputStream(field); final char ind1 = (char) bais.read(); final char ind2 = (char) bais.read(); final DataField dataField = factory.newDataField(); dataField.setTag(tag); dataField.setIndicator1(ind1); dataField.setIndicator2(ind2); int code; int size; int readByte; byte[] data; Subfield subfield; while (true) { readByte = bais.read(); if (readByte < 0) { break; } switch (readByte) { case Constants.US: code = bais.read(); if (code < 0) { throw new IOException("unexpected end of data field"); } if (code == Constants.FT) { break; } size = getSubfieldLength(bais); if (size == 0) { if (permissive) { errors.addError(ErrorHandler.MINOR_ERROR, "Subfield of zero length encountered, ignoring it."); continue; } throw new IOException("Subfield of zero length encountered"); } data = new byte[size]; bais.read(data); subfield = factory.newSubfield(); if (permissive) { errors.setCurrentSubfield("" 
+ (char) code); } String dataAsString = getDataAsString(data); if (permissive && code == Constants.US) { code = data[0]; dataAsString = dataAsString.substring(1); errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield tag is a subfield separator, using first character of field as " + "subfield tag."); } else if (permissive && validSubfieldCodes.indexOf(code) == -1) { if (code >= 'A' && code <= 'Z') { final String ucSubfields = System.getProperty(upperCaseSubfieldsProperty, "false"); if (Boolean.parseBoolean(ucSubfields) == false) { code = Character.toLowerCase(code); errors.addError(ErrorHandler.MINOR_ERROR, "Subfield tag is an invalid uppercase character, changing it to lower case."); } else { // the System Property org.marc4j.MarcPermissiveStreamReader.upperCaseSubfields is // defined to allow upperCaseSubfields // therefore do nothing and be happy } } else if (code > 0x7f) { code = data[0]; dataAsString = dataAsString.substring(1); errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield tag is an invalid character greater than 0x7f, using first character " + "of field as subfield tag."); } else if (code == '[' && tag.equals("245")) { code = 'h'; dataAsString = '[' + dataAsString; errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield tag is an open bracket, generating a code 'h' and pushing the bracket " + "to the data."); } else if (code == ' ') { errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield tag is a space which is an invalid character"); } else { errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield tag is an invalid character, [ " + ((char) code) + " ]"); } } subfield.setCode((char) code); subfield.setData(dataAsString); dataField.addSubfield(subfield); break; case Constants.FT: break; } } return dataField; } static AnselToUnicode conv = null; private static void cleanupBadFieldSeperators(final byte[] field, final ErrorHandler errors) { if (conv == null) { conv = new AnselToUnicode(true); } boolean hasEsc = false; boolean inMultiByte = false; boolean justCleaned = false; int mbOffset = 0; boolean inCyrillic = false; int flen = 0; for (int i = 0; i < field.length - 1; i++) { if (field[i] == 0x1B) { hasEsc = true; if ("(,)-'".indexOf((char) field[i + 1]) != -1) { inMultiByte = false; if (i + 2 < field.length && (char) field[i + 2] == 'N') { inCyrillic = true; } else { inCyrillic = false; } } else if (i + 2 < field.length && field[i + 1] == '$' && field[i + 2] == '1') { inMultiByte = true; mbOffset = 3; } else if (i + 3 < field.length && (field[i + 1] == '$' || field[i + 2] == '$') && (field[i + 2] == '1' || field[i + 3] == '1')) { inMultiByte = true; mbOffset = 4; } } else if (inMultiByte && (field[i] != 0x20 && field[i] >= 0)) { mbOffset = (mbOffset == 0) ? 2 : mbOffset - 1; } if (inMultiByte && mbOffset == 0 && i + 2 < field.length && field[i] > 0) { char c; final byte f1 = field[i]; final byte f2 = field[i + 1] == 0x20 ? field[i + 2] : field[i + 1]; final byte f3 = (field[i + 1] == 0x20 || field[i + 2] == 0x20) ? field[i + 3] : field[i + 2]; c = conv.getMBChar(conv.makeMultibyte((char) ((f1 == Constants.US) ? 0x7C : f1), (char) ((f2 == Constants.US) ? 0x7C : f2), (char) ((f3 == Constants.US) ? 0x7C : f3))); if (c == 0 && !justCleaned) { errors.addError(ErrorHandler.MAJOR_ERROR, "Bad Multibyte character found, reinterpreting data as non-multibyte data"); inMultiByte = false; } else if (c == 0 && justCleaned) { c = conv.getMBChar(conv.makeMultibyte('!', (char) ((f2 == Constants.US) ? 0x7C : f2), (char) ((f3 == Constants.US) ? 
0x7C : f3))); if (c == 0) { errors.addError(ErrorHandler.MAJOR_ERROR, "Bad Multibyte character found, reinterpreting data as non-multibyte data"); inMultiByte = false; } else { errors.addError(ErrorHandler.MAJOR_ERROR, "Character after restored vertical bar character makes bad multibyte character, " + "changing it to \"!\""); field[i] = '!'; } } } justCleaned = false; if (field[i] == Constants.US) { if (inMultiByte && mbOffset != 0) { field[i] = 0x7C; errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield separator found in middle of a multibyte character, changing it to a " + "vertical bar, and continuing"); if (field[i + 1] == '0') { if (field[i + 2] == '(' && field[i + 3] == 'B') { field[i + 1] = 0x1B; errors.addError(ErrorHandler.MAJOR_ERROR, "Character after restored vertical bar character makes bad multibyte character, " + "changing it to ESC"); } else { field[i + 1] = 0x21; errors.addError(ErrorHandler.MAJOR_ERROR, "Character after restored vertical bar character makes bad multibyte character, " + "changing it to \"!\""); } } justCleaned = true; } else if (hasEsc && inCyrillic) { final String prev = new String(field, i - (flen - 1), flen - 1); if (!(field[i + 1] >= 'a' && field[i + 1] <= 'z') || prev.equals("\u001b(N")) { errors.addError(ErrorHandler.MINOR_ERROR, "Subfield separator found in Cyrillic string, changing separator to a vertical bar, " + "and continuing"); field[i] = 0x7C; justCleaned = true; } } else if (hasEsc && !checkSubfieldByte(field[i + 1])) { errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield separator followed by invalid subfield tag, changing separator to a vertical " + "bar, and continuing"); field[i] = 0x7C; justCleaned = true; } else if (hasEsc && i < field.length - 3 && (field[i + 1] == '0' && field[i + 2] == '(' && field[i + 3] == 'B')) { errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield separator followed by invalid subfield tag, changing separator to a vertical " + "bar, and continuing"); field[i] = 0x7C; field[i + 1] = 0x1B; justCleaned = true; } else if (hasEsc && (field[i + 1] == '0')) { errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield separator followed by invalid subfield tag, changing separator to a vertical " + "bar, and continuing"); field[i] = 0x7C; field[i + 1] = 0x21; justCleaned = true; } else if (field[i + 1] == Constants.US && field[i + 2] == Constants.US) { errors.addError(ErrorHandler.MAJOR_ERROR, "Three consecutive subfield separators, changing first two to vertical bars."); field[i] = 0x7C; field[i + 1] = 0x7C; justCleaned = true; } } if (field[i] == Constants.US) { flen = 0; } else { flen++; } } } private static boolean checkSubfieldByte(final byte aByte) { return (aByte >= 'a' && aByte <= 'z') || (aByte >= '0' && aByte <= '9'); } private int getFieldLength(final DataInputStream bais) throws IOException { bais.mark(9999); int bytesRead = 0; while (true) { switch (bais.read()) { case Constants.FT: bais.reset(); return bytesRead; case -1: bais.reset(); if (permissive) { errors.addError(ErrorHandler.MINOR_ERROR, "Field not terminated trying to continue"); return (bytesRead); } else { throw new IOException("Field not terminated"); } case Constants.US: default: bytesRead++; } } } private int getSubfieldLength(final ByteArrayInputStream bais) throws IOException { bais.mark(9999); int bytesRead = 0; while (true) { switch (bais.read()) { case Constants.FT: bais.reset(); return bytesRead; case Constants.US: bais.reset(); return bytesRead; case -1: bais.reset(); if (permissive) { errors.addError(ErrorHandler.MINOR_ERROR, "Subfield not 
terminated trying to continue"); return (bytesRead); } else { throw new IOException("subfield not terminated"); } default: bytesRead++; } } } private int parseRecordLength(final byte[] leaderData) throws IOException { final InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(leaderData)); final char[] tmp = new char[5]; int length = -1; isr.read(tmp); try { length = Integer.parseInt(new String(tmp)); } catch (final NumberFormatException e) { errors.addError(ErrorHandler.FATAL, "Unable to parse record length, Unable to Continue"); throw new MarcException("unable to parse record length", e); } return (length); } private void parseLeader(final Leader ldr, final byte[] leaderData) throws IOException { final InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(leaderData), "ISO-8859-1"); char[] tmp = new char[5]; isr.read(tmp); // Skip over bytes for record length, If we get here, its already been computed. ldr.setRecordStatus((char) isr.read()); ldr.setTypeOfRecord((char) isr.read()); tmp = new char[2]; isr.read(tmp); ldr.setImplDefined1(tmp); ldr.setCharCodingScheme((char) isr.read()); final char indicatorCount = (char) isr.read(); final char subfieldCodeLength = (char) isr.read(); final char baseAddr[] = new char[5]; isr.read(baseAddr); tmp = new char[3]; isr.read(tmp); ldr.setImplDefined2(tmp); tmp = new char[4]; isr.read(tmp); ldr.setEntryMap(tmp); isr.close(); try { ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount))); } catch (final NumberFormatException e) { if (permissive) { // All Marc21 records should have indicatorCount '2' errors.addError(ErrorHandler.ERROR_TYPO, "bogus indicator count - byte value = " + Integer .toHexString(indicatorCount & 0xff)); ldr.setIndicatorCount(2); } else { throw new MarcException("unable to parse indicator count", e); } } try { ldr.setSubfieldCodeLength(Integer.parseInt(String.valueOf(subfieldCodeLength))); } catch (final NumberFormatException e) { if (permissive) { // All Marc21 records should have subfieldCodeLength '2' errors.addError(ErrorHandler.ERROR_TYPO, "bogus subfield count - byte value = " + Integer .toHexString(subfieldCodeLength & 0xff)); ldr.setSubfieldCodeLength(2); } else { throw new MarcException("unable to parse subfield code length", e); } } try { ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddr))); } catch (final NumberFormatException e) { throw new MarcException("unable to parse base address of data", e); } } private String getDataAsString(final byte[] bytes) { String dataElement = null; if (encoding.equals("UTF-8") || encoding.equals("UTF8")) { try { dataElement = new String(bytes, "UTF-8"); } catch (final UnsupportedEncodingException e) { throw new MarcException("unsupported encoding", e); } } else if (encoding.equals("UTF8-Maybe")) { try { dataElement = new String(bytes, "UTF-8"); } catch (final UnsupportedEncodingException e) { throw new MarcException("unsupported encoding", e); } } else if (encoding.equals("MARC-8") || encoding.equals("MARC8")) { dataElement = getMarc8Conversion(bytes); } else if (encoding.equalsIgnoreCase("Unimarc") || encoding.equals("IS05426")) { dataElement = getUnimarcConversion(bytes); } else if (encoding.equals("MARC8-Maybe")) { final String dataElement1 = getMarc8Conversion(bytes); final String dataElement2 = getUnimarcConversion(bytes); String dataElement3 = null; try { dataElement3 = new String(bytes, "ISO-8859-1"); } catch (final UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } if 
(dataElement1.equals(dataElement2) && dataElement1.equals(dataElement3)) {
                dataElement = dataElement1;
            } else {
                conversionCheck1 = conversionCheck1 + "|>" + Normalizer.normalize(dataElement1, Normalizer.NFC);
                conversionCheck2 = conversionCheck2 + "|>" + dataElement2;
                conversionCheck3 = conversionCheck3 + "|>" + dataElement3;
                dataElement = dataElement1 + "%%@%%" + dataElement2 + "%%@%%" + dataElement3;
            }
        } else if (encoding.equals("MARC8-Broken")) {
            try {
                dataElement = new String(bytes, "ISO-8859-1");
            } catch (final UnsupportedEncodingException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            String newdataElement = dataElement.replaceAll("&lt;", "<");
            newdataElement = newdataElement.replaceAll("&gt;", ">");
            newdataElement = newdataElement.replaceAll("&amp;", "&");
            newdataElement = newdataElement.replaceAll("&apos;", "'");
            newdataElement = newdataElement.replaceAll("&quot;", "\"");
            if (!newdataElement.equals(dataElement)) {
                dataElement = newdataElement;
                errors.addError(ErrorHandler.ERROR_TYPO,
                        "Subfield contains escaped html character entities, un-escaping them. ");
            }
            final String rep1 = "" + (char) 0x1b + "\\$1$1";
            final String rep2 = "" + (char) 0x1b + "\\(B";
            newdataElement = dataElement.replaceAll("\\$1(.)", rep1);
            newdataElement = newdataElement.replaceAll("\\(B", rep2);
            if (!newdataElement.equals(dataElement)) {
                dataElement = newdataElement;
                errors.addError(ErrorHandler.MAJOR_ERROR,
                        "Subfield seems to be missing MARC8 escape sequences, trying to restore them.");
            }
            try {
                dataElement = getMarc8Conversion(dataElement.getBytes("ISO-8859-1"));
            } catch (final UnsupportedEncodingException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        } else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1")) {
            try {
                dataElement = new String(bytes, "ISO-8859-1");
            } catch (final UnsupportedEncodingException e) {
                throw new MarcException("unsupported encoding", e);
            }
        } else {
            try {
                dataElement = new String(bytes, encoding);
            } catch (final UnsupportedEncodingException e) {
                throw new MarcException("Unknown or unsupported Marc character encoding:" + encoding);
            }
        }
        if (errors != null && dataElement.matches("[^&]*&[a-z]*;.*")) {
            String newdataElement = dataElement.replaceAll("&lt;", "<");
            newdataElement = newdataElement.replaceAll("&gt;", ">");
            newdataElement = newdataElement.replaceAll("&amp;", "&");
            newdataElement = newdataElement.replaceAll("&apos;", "'");
            newdataElement = newdataElement.replaceAll("&quot;", "\"");
            if (!newdataElement.equals(dataElement)) {
                dataElement = newdataElement;
                errors.addError(ErrorHandler.ERROR_TYPO,
                        "Subfield contains escaped html character entities, un-escaping them. ");
            }
        }
        return dataElement;
    }

    private static boolean byteArrayContains(final byte[] bytes, final byte[] seq) {
        for (int i = 0; i < bytes.length - seq.length; i++) {
            if (bytes[i] == seq[0]) {
                for (int j = 0; j < seq.length; j++) {
                    if (bytes[i + j] != seq[j]) {
                        break;
                    }
                    if (j == seq.length - 1) {
                        return (true);
                    }
                }
            }
        }
        return (false);
    }

    static byte badEsc[] = { (byte) ('b'), (byte) ('-'), 0x1b, (byte) ('s') };

    static byte overbar[] = { (byte) (char) (0xaf) };

    /**
     * Gets MARC-8 conversion for supplied bytes.
* * @param bytes Bytes to be converted to MARC-8 * @param conv An Ansel to Unicode converter * @param permissive Whether this is done in a permissive manner * @param errors An error handler * @param doNCR Do numeric character reference * @return A MARC-8 string */ public static String getMarc8Conversion(final byte[] bytes, final AnselToUnicode conv, final boolean permissive, final ErrorHandler errors, final boolean doNCR) { String dataElement = null; if (permissive && (byteArrayContains(bytes, badEsc) || byteArrayContains(bytes, overbar))) { String newDataElement = null; try { dataElement = new String(bytes, "ISO-8859-1"); newDataElement = dataElement.replaceAll("(\\e)b-\\es([psb$()])", "$1$2"); if (!newDataElement.equals(dataElement)) { dataElement = newDataElement; errors.addError(ErrorHandler.MINOR_ERROR, "Subfield contains odd pattern of subscript or superscript escapes. "); } newDataElement = dataElement.replace((char) 0xaf, (char) 0xe5); if (!newDataElement.equals(dataElement)) { dataElement = newDataElement; errors.addError(ErrorHandler.ERROR_TYPO, "Subfield contains 0xaf overbar character, changing it to proper MARC8 representation "); } dataElement = conv.convert(dataElement); } catch (final UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else { dataElement = conv.convert(bytes); } if (doNCR) { // This code handles malformed Numeric Character references that either contain // an extraneous %x or which are missing the final semicolon if (permissive && dataElement.matches("[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][^;].*")) { final Pattern pattern = Pattern.compile("&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])(%x)?;?"); final Matcher matcher = pattern.matcher(dataElement); final StringBuffer newElement = new StringBuffer(); int prevEnd = 0; while (matcher.find()) { newElement.append(dataElement.substring(prevEnd, matcher.start())); newElement.append(getChar(matcher.group(1))); if (matcher.group(1).contains("%x") || !matcher.group(1).endsWith(";")) { errors.addError(ErrorHandler.MINOR_ERROR, "Subfield contains malformed Unicode Numeric Character Reference : " + matcher.group( 0)); } prevEnd = matcher.end(); } newElement.append(dataElement.substring(prevEnd)); dataElement = newElement.toString(); } } return (dataElement); } private String getMarc8Conversion(final byte[] bytes) { String dataElement = null; if (converterAnsel == null) { converterAnsel = new AnselToUnicode(errors); } if (isTranslateLosslessUnicodeNumericCodeReferencesEnabled()) { final AnselToUnicode anselConverter = converterAnsel; anselConverter.setTranslateNCR(isTranslateLosslessUnicodeNumericCodeReferencesEnabled()); } dataElement = getMarc8Conversion(bytes, converterAnsel, permissive, errors, translateLosslessUnicodeNumericCodeReferencesEnabled); return (dataElement); } private String getUnimarcConversion(final byte[] bytes) { if (converterUnimarc == null) { converterUnimarc = new Iso5426ToUnicode(); } String dataElement = converterUnimarc.convert(bytes); dataElement = dataElement.replaceAll("\u0088", ""); dataElement = dataElement.replaceAll("\u0089", ""); if (dataElement.matches("[^<]*.*")) { final Pattern pattern = Pattern.compile(""); final Matcher matcher = pattern.matcher(dataElement); final StringBuffer newElement = new StringBuffer(); int prevEnd = 0; while (matcher.find()) { newElement.append(dataElement.substring(prevEnd, matcher.start())); newElement.append(getChar(matcher.group(1))); prevEnd = matcher.end(); } 
newElement.append(dataElement.substring(prevEnd)); dataElement = newElement.toString(); } return (dataElement); } private static String getChar(final String charCodePoint) { final int charNum = Integer.parseInt(charCodePoint, 16); final String result = "" + ((char) charNum); return (result); } }
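
/*
 * A minimal usage sketch: reading records of unknown encoding permissively with a defaultEncoding of "BESTGUESS"
 * and converting them to UTF-8 as they are read, as described in the class Javadoc. The input file name is
 * hypothetical, and the sketch assumes MarcStreamWriter's (OutputStream, String encoding) constructor.
 */
class MarcPermissiveStreamReaderUsageSketch {

    public static void main(final String[] args) throws java.io.IOException {
        final java.io.InputStream in = new java.io.FileInputStream("records-of-unknown-encoding.mrc");
        // Read permissively, let the reader guess the source encoding, and convert each record to UTF-8.
        final MarcReader reader = new MarcPermissiveStreamReader(in, true, true, "BESTGUESS");
        final MarcWriter writer = new MarcStreamWriter(System.out, "UTF-8");
        while (reader.hasNext()) {
            writer.write(reader.next());
        }
        writer.close();
        in.close();
    }
}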



