![JAR search and dependency download from the Maven repository](/logo.png)
org.marc4j.MarcPermissiveStreamReader Maven / Gradle / Ivy
/**
* Copyright (C) 2004 Bas Peters
*
* This file is part of MARC4J
*
* MARC4J is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* MARC4J is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with MARC4J; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.marc4j;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.marc4j.converter.CharConverter;
import org.marc4j.marc.ControlField;
import org.marc4j.marc.DataField;
import org.marc4j.marc.Leader;
import org.marc4j.marc.MarcFactory;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.marc.VariableField;
import org.marc4j.util.Normalizer;
import info.freelibrary.marc4j.converter.impl.AnselToUnicode;
import info.freelibrary.marc4j.converter.impl.Iso5426ToUnicode;
/**
* An iterator over a collection of MARC records in ISO 2709 format, that is designed to be able to handle MARC
* records that have errors in their structure or their encoding. If the permissive flag is set in the call to the
* constructor, or if an ErrorHandler object is passed in as a parameter to the constructor, this reader will do its
* best to detect and recover from a number of structural or encoding errors that can occur in a MARC record. Note
* that if this reader is not set to read permissively, it will operate pretty much identically to the
* MarcStreamReader class. Note that no attempt is made to validate the contents of the record at a semantic level.
* This reader does not know and does not care whether the record has a 245 field, or if the 008 field is the right
* length, but if the record claims to be UTF-8 or MARC8 encoded and you are seeing gibberish in the output, or if the
* reader is throwing an exception in trying to read a record, then this reader may be able to produce a usable record
* from the bad data you have. The ability to directly translate the record to UTF-8 as it is being read in is useful
* in cases where the UTF-8 version of the record will be used directly by the program that is reading the MARC data,
* for instance if the marc records are to be indexed into a SOLR search engine. Previously the MARC record could only
* be translated to UTF-8 as it was being written out via a MarcStreamWriter or a MarcXmlWriter.
*
* Example usage:
* InputStream input = new FileInputStream("file.mrc");
* MarcReader reader = new MarcPermissiveStreamReader(input, true, true);
* while (reader.hasNext()) {
* Record record = reader.next();
* // Process record
* }
*
*
* Check the {@link org.marc4j.marc} package for examples about the use of the {@link org.marc4j.marc.Record}
* object model. Check the file org.marc4j.samples.PermissiveReaderExample.java for an example about using the
* MarcPermissiveStreamReader in conjunction with the ErrorHandler class to report errors encountered while processing
* records.
*
*
* When no encoding is given as a constructor argument the parser tries to resolve the encoding by looking at the
* character coding scheme (leader position 9) in MARC21 records. For UNIMARC records this position is not defined. If
* the reader is operating in permissive mode and no encoding is given as a constructor argument the reader will look
* at the leader, and also at the data of the record to determine to the best of its ability what character encoding
* scheme has been used to encode the data in a particular MARC record.
*
*
* @author Robert Haschart
*/
public class MarcPermissiveStreamReader implements MarcReader {
// Stream the records are read from. Every constructor hands it a stream that supports mark()/reset()
// (the caller's stream if it already does, otherwise a BufferedInputStream wrapper).
private DataInputStream input = null;
// Record currently being built; replaced with a fresh instance at the start of every next() call.
private Record record;
// Factory used to create the record, leader and field objects.
private final MarcFactory factory;
// Encoding selected for the record currently being parsed; parseRecord() reassigns it for every record.
private String encoding = "ISO8859_1";
// This represents the expected encoding of the data when a
// MARC record does not have an 'a' in character 9 of the leader.
// The permissive constructors without an explicit encoding set it to "BESTGUESS".
private String defaultEncoding = "ISO8859_1";
// When true, next() stamps the leader of every returned record with coding scheme 'a' (UTF-8).
private boolean convertToUTF8 = false;
// When true, the reader attempts to recover from structural and encoding errors instead of failing.
private boolean permissive = false;
// Exposed through isTranslateLosslessUnicodeNumericCodeReferencesEnabled() and its setter.
private boolean translateLosslessUnicodeNumericCodeReferencesEnabled = true;
// Read limit passed to mark() in next(); rereadPermissively() doubles it whenever no record
// terminator is found within that many bytes.
private int marc_file_lookahead_buffer = 200000;
// NOTE(review): the two converters are not used in the part of the file reviewed here; presumably
// they are created on demand where field data is decoded - confirm.
private AnselToUnicode converterAnsel = null;
private CharConverter converterUnimarc = null;
// These are used to algorithmically determine what encoding scheme was
// used to encode the data in the Marc record. parseRecord() clears them for every record and
// guessAndSelectCorrectNonUTF8Encoding() compares them (1 = MARC8, 2 = Unimarc, 3 = ISO-8859-1
// according to the messages logged there).
private String conversionCheck1 = null;
private String conversionCheck2 = null;
private String conversionCheck3 = null;
// Collects the problems found while decoding a record; null when the reader is not permissive.
private ErrorHandler errors;
// NOTE(review): not referenced in the reviewed part of the file; presumably the subfield codes
// accepted when data fields are parsed - confirm.
static String validSubfieldCodes = "abcdefghijklmnopqrstuvwxyz0123456789";
// NOTE(review): not referenced in the reviewed part of the file; looks like the name of a system
// property controlling upper-case subfield codes - confirm.
static String upperCaseSubfieldsProperty = "org.marc4j.MarcPermissiveStreamReader.upperCaseSubfields";
/**
 * Creates a reader over the given stream, optionally enabling permissive reading and/or conversion to UTF-8.
 * With both flags set to false the reader behaves almost identically to the MarcStreamReader class.
 *
 * @param input the stream to read records from; it is always wrapped in a BufferedInputStream
 * @param permissive true to try to recover from errors; this also creates an internal ErrorHandler and
 *        switches the default encoding to "BESTGUESS"
 * @param convertToUTF8 true to have the returned records flagged as UTF-8
 */
public MarcPermissiveStreamReader(final InputStream input, final boolean permissive,
        final boolean convertToUTF8) {
    this.input = new DataInputStream(new BufferedInputStream(input));
    this.factory = MarcFactory.newInstance();
    this.permissive = permissive;
    this.convertToUTF8 = convertToUTF8;
    if (permissive) {
        // Permissive readers always track errors and guess the encoding.
        this.errors = new ErrorHandler();
        this.defaultEncoding = "BESTGUESS";
    } else {
        this.errors = null;
    }
}
/**
* Constructs an instance with the specified input stream with possible additional functionality being enabled by
* passing in an ErrorHandler object and/or setting convertToUTF8 to true. If errors and convertToUTF8 are both
* set to false, it functions almost identically to the MarcStreamReader class. If an ErrorHandler object is
* passed in, that object will be used to log and track any errors in the records as the records are decoded.
* After the next() function returns, you can query to determine whether any errors were detected in the decoding
* process. See the file org.marc4j.samples.PermissiveReaderExample.java to see how this can be done.
*/
public MarcPermissiveStreamReader(final InputStream input, final ErrorHandler errors,
final boolean convertToUTF8) {
if (errors != null) {
permissive = true;
defaultEncoding = "BESTGUESS";
}
this.input = new DataInputStream((input.markSupported()) ? input : new BufferedInputStream(input));
factory = MarcFactory.newInstance();
this.convertToUTF8 = convertToUTF8;
this.errors = errors;
}
/**
* Constructs an instance with the specified input stream with possible additional functionality being enabled by
* setting permissive and/or convertToUTF8 to true. If permissive and convertToUTF8 are both set to false, it
* functions almost identically to the MarcStreamReader class. The parameter defaultEncoding is used to specify
* the character encoding that is used in the records that will be read from the input stream. If permissive is
* set to true, you can specify "BESTGUESS" as the default encoding, and the reader will attempt to determine the
* character encoding used in the records being read from the input stream. This is especially useful if you are
* working with records downloaded from an external source and the encoding is either unknown or the encoding is
* different from what the records claim to be.
*/
public MarcPermissiveStreamReader(final InputStream input, final boolean permissive, final boolean convertToUTF8,
final String defaultEncoding) {
this.permissive = permissive;
this.input = new DataInputStream((input.markSupported()) ? input : new BufferedInputStream(input));
factory = MarcFactory.newInstance();
this.convertToUTF8 = convertToUTF8;
this.defaultEncoding = defaultEncoding;
errors = null;
if (permissive) {
errors = new ErrorHandler();
}
}
/**
 * Creates a permissive reader over the given stream with an explicit default encoding and error handler.
 * The default encoding names the character encoding expected in the records; "BESTGUESS" may be given to let
 * the reader work out the encoding of each record itself, which is useful when records come from an external
 * source and their encoding is unknown or differs from what they claim. The supplied ErrorHandler is used to
 * log and track the errors found while records are decoded; after next() returns it can be queried for them.
 * See org.marc4j.samples.PermissiveReaderExample.java for an example.
 *
 * This constructor always enables permissive mode. Because the permissive recovery paths log through the
 * error handler, a null handler is replaced by a private one instead of being stored as null (which
 * previously led to a NullPointerException as soon as the first problem had to be reported).
 *
 * @param input the stream to read records from; it is always wrapped in a BufferedInputStream
 * @param errors the handler that collects decoding errors; if null an internal handler is used
 * @param convertToUTF8 true to have the returned records flagged as UTF-8
 * @param defaultEncoding the encoding assumed for records that do not declare UTF-8, or "BESTGUESS"
 */
public MarcPermissiveStreamReader(final InputStream input, final ErrorHandler errors, final boolean convertToUTF8,
        final String defaultEncoding) {
    this.permissive = true;
    this.input = new DataInputStream(new BufferedInputStream(input));
    factory = MarcFactory.newInstance();
    this.convertToUTF8 = convertToUTF8;
    this.defaultEncoding = defaultEncoding;
    // Permissive code paths call errors.addError(...) unconditionally, so never leave the handler null.
    this.errors = (errors != null) ? errors : new ErrorHandler();
}
/**
 * Tells whether numeric character entities (e.g. {@code &#xXXXX;}) are converted to their corresponding code
 * point when converting to unicode. Conversion is enabled by default.
 *
 * @return true if numeric character entities are converted
 */
public boolean isTranslateLosslessUnicodeNumericCodeReferencesEnabled() {
    return this.translateLosslessUnicodeNumericCodeReferencesEnabled;
}
/**
* Enables or disables conversion of numeric code references into their corresponding code points when
* converting to unicode.
*
* @param translateLosslessUnicodeNumericCodeReferencesEnabled true to convert numeric code references, false to
* leave them as they are
*/
public void setTranslateLosslessUnicodeNumericCodeReferencesEnabled(
final boolean translateLosslessUnicodeNumericCodeReferencesEnabled) {
this.translateLosslessUnicodeNumericCodeReferencesEnabled =
translateLosslessUnicodeNumericCodeReferencesEnabled;
}
/**
 * Returns true if the iteration has more records, false otherwise.
 *
 * A record is expected to start with an ASCII digit (the first character of the record length). Any bytes
 * found before the next digit are consumed and discarded, so that a following call to next() starts on the
 * digit. If the end of the stream is reached before a digit is seen there are no more records.
 *
 * NOTE(review): the stream is marked with a read limit of only 10 bytes, while the number of skipped bytes
 * is unbounded; whether reset() still succeeds after a longer run of garbage depends on the underlying
 * stream.
 *
 * @return true if another record appears to follow
 * @throws MarcException if reading from the stream fails
 */
@Override
public boolean hasNext() {
    try {
        input.mark(10);
        int current = input.read();
        int skipped = 0;
        // Walk forward to the first digit, counting every byte stepped over on the way.
        while (current != -1 && (current < '0' || current > '9')) {
            current = input.read();
            skipped++;
        }
        if (current == -1) {
            return false;
        }
        // Rewind to the mark, then throw away exactly the non-digit bytes that preceded the digit.
        input.reset();
        for (; skipped > 0; skipped--) {
            input.read();
        }
    } catch (final IOException e) {
        throw new MarcException(e.getMessage(), e);
    }
    return true;
}
/**
 * Returns the next record in the iteration.
 *
 * Reads the 24 byte leader, derives the record length from it, reads the rest of the record and hands both
 * parts to parseRecord. In permissive mode a body that does not end with the record terminator is re-read
 * with rereadPermissively before it is parsed. When conversion to UTF-8 is requested the leader of the
 * returned record is set to coding scheme 'a'.
 *
 * @return Record - the record object
 * @throws MarcException if the stream ends prematurely, cannot be read, or the record cannot be parsed
 */
@Override
public Record next() {
    record = factory.newRecord();
    if (errors != null) {
        errors.reset();
    }
    try {
        final byte[] leaderBytes = new byte[24];
        input.readFully(leaderBytes);
        int recordLength = parseRecordLength(leaderBytes);
        byte[] body = new byte[recordLength - 24];
        if (permissive) {
            // Remember where the body starts so it can be re-read if the stated length turns out wrong.
            input.mark(marc_file_lookahead_buffer);
        }
        input.readFully(body);
        if (permissive && body[body.length - 1] != Constants.RT) {
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                    "Record terminator character not found at end of record length");
            body = rereadPermissively(input, body, recordLength);
            recordLength = body.length + 24;
        }
        parseRecord(record, leaderBytes, body, recordLength);
        if (convertToUTF8) {
            final Leader leader = record.getLeader();
            leader.setCharCodingScheme('a');
            record.setLeader(leader);
        }
        return record;
    } catch (final EOFException e) {
        throw new MarcException("Premature end of file encountered", e);
    } catch (final IOException e) {
        throw new MarcException("an error occured reading input", e);
    }
}
/**
 * Re-reads the body of a record whose stated length did not put the record terminator in the last byte.
 *
 * The stream must have been marked at the start of the record body (next() does this). Three situations
 * are handled:
 * <ul>
 * <li>the terminator occurs inside the bytes already read (stated length too long): the body is re-read up
 * to and including that terminator;</li>
 * <li>the terminator occurs further on in the stream (stated length too short): the body is re-read up to
 * and including that terminator;</li>
 * <li>the stream ends without a terminator: all remaining bytes are used and a terminator is appended.</li>
 * </ul>
 * If no terminator shows up within the look-ahead limit, the limit is doubled, the stream is re-marked at the
 * start of the body and the search starts over.
 *
 * In every case the returned buffer ends with the record terminator, which is what parseRecord expects.
 *
 * @param aInput the stream being read, marked at the start of the record body
 * @param aRecordBuf the body as read using the length stated in the leader
 * @param aRecordLength the record length stated in the leader (including the 24 leader bytes)
 * @return the corrected record body, ending with the record terminator
 * @throws IOException if the stream cannot be read or reset
 */
private byte[] rereadPermissively(final DataInputStream aInput, final byte[] aRecordBuf, final int aRecordLength)
        throws IOException {
    final int terminatorIndex = arrayContainsAt(aRecordBuf, Constants.RT);
    if (terminatorIndex != -1) {
        // stated record length is too long
        errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                "Record terminator appears before stated record length, using shorter record");
        aInput.reset();
        // terminatorIndex is zero-based, so the body including its terminator is terminatorIndex + 1 bytes.
        // Reading one byte less would drop the terminator and leave it in the stream.
        final byte[] shorter = new byte[terminatorIndex + 1];
        aInput.readFully(shorter);
        return shorter;
    }
    // stated record length is too short, read ahead; loc counts the bytes consumed since the mark
    int loc = aRecordLength - 24;
    while (true) {
        int c;
        do {
            c = aInput.read();
            loc++;
        } while (loc < (marc_file_lookahead_buffer - 24) && c != Constants.RT && c != -1);
        if (c == Constants.RT) {
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                    "Record terminator appears after stated record length, reading extra bytes");
            aInput.reset();
            // loc bytes were consumed, the last of them being the terminator
            final byte[] longer = new byte[loc];
            aInput.readFully(longer);
            return longer;
        }
        if (c == -1) {
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                    "No Record terminator found, end of file reached, Terminator appended");
            aInput.reset();
            // loc also counted the read that hit end of file, so only loc - 1 data bytes exist.
            // Read exactly those and use the final slot for the appended terminator; asking
            // readFully for more than is available would end in an EOFException.
            final byte[] patched = new byte[loc];
            aInput.readFully(patched, 0, loc - 1);
            patched[loc - 1] = Constants.RT;
            return patched;
        }
        errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                "No Record terminator found within " + marc_file_lookahead_buffer +
                        " bytes of start of record, getting desperate.");
        // Start over from the beginning of the body with twice the look-ahead.
        aInput.reset();
        marc_file_lookahead_buffer *= 2;
        aInput.mark(marc_file_lookahead_buffer);
        loc = 0;
    }
}
/**
 * Parses the leader, directory and fields of one record and adds the result to the given record.
 *
 * The method works in these stages:
 * <ol>
 * <li>parse the leader; in permissive mode try to recover from a leader that is one byte short or long;</li>
 * <li>normalise the entry map at the end of the leader (permissive mode only);</li>
 * <li>choose the character encoding to decode with, based on the leader's coding scheme, the reader settings
 * and, in permissive mode, the bytes of the record itself;</li>
 * <li>validate the directory length and, in permissive mode, work around a directory that is one byte off;</li>
 * <li>read the directory entries (tag and field length) and then the fields themselves.</li>
 * </ol>
 *
 * Error reporting goes through the ErrorHandler, which only exists for permissive readers; the handler is
 * therefore checked for null wherever a report can be reached by a non-permissive reader, so that those
 * readers get the intended MarcException rather than a NullPointerException.
 *
 * @param record the record to populate
 * @param aByteArray the 24 leader bytes
 * @param aRecordBuf the record body that follows the leader, ending with the record terminator
 * @param recordLength the total record length, including the leader
 * @throws MarcException if the leader, the directory or a field cannot be parsed
 */
private void parseRecord(final Record record, final byte[] aByteArray, final byte[] aRecordBuf,
        final int recordLength) {
    int directoryLength = 0;
    byte[] byteArray = aByteArray;
    byte[] recordBuf = aRecordBuf;
    Leader ldr;
    ldr = factory.newLeader();
    ldr.setRecordLength(recordLength);
    // These variables are used when the permissive reader is trying to make its best guess
    // as to what character encoding is actually used in the record being processed.
    conversionCheck1 = "";
    conversionCheck2 = "";
    conversionCheck3 = "";
    try {
        parseLeader(ldr, byteArray);
        // the directory sits between the leader and the field terminator that precedes the data
        directoryLength = ldr.getBaseAddressOfData() - (24 + 1);
    } catch (final IOException e) {
        throw new MarcException("error parsing leader with data: " + new String(byteArray), e);
    } catch (final MarcException e) {
        if (permissive) {
            // NOTE(review): when the body does not end with a field terminator followed by a record
            // terminator the leader failure is ignored and parsing carries on.
            if (recordBuf[recordBuf.length - 1] == Constants.RT && recordBuf[recordBuf.length -
                    2] == Constants.FT) {
                errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                        "Error parsing leader, trying to re-read leader either shorter or longer");
                // make an attempt to recover record.
                // the first field terminator in the body marks the end of the directory
                int offset = 0;
                while (offset < recordBuf.length) {
                    if (recordBuf[offset] == Constants.FT) {
                        break;
                    }
                    offset++;
                }
                if (offset % 12 == 1) {
                    // move one byte from body to leader, make new leader, and try again
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                            "Leader appears to be too short, moving one byte from record body to leader, " +
                                    "and trying again");
                    final byte oldBody[] = recordBuf;
                    recordBuf = new byte[oldBody.length - 1];
                    System.arraycopy(oldBody, 1, recordBuf, 0, oldBody.length - 1);
                    directoryLength = offset - 1;
                    ldr.setIndicatorCount(2);
                    ldr.setSubfieldCodeLength(2);
                    ldr.setImplDefined1(("" + (char) byteArray[7] + " ").toCharArray());
                    ldr.setImplDefined2(("" + (char) byteArray[18] + (char) byteArray[19] + (char) byteArray[20])
                            .toCharArray());
                    ldr.setEntryMap("4500".toCharArray());
                    // if its ' ' or 'a'
                    if (byteArray[10] == (byte) ' ' || byteArray[10] == (byte) 'a') {
                        ldr.setCharCodingScheme((char) byteArray[10]);
                    }
                } else if (offset % 12 == 11) {
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                            "Leader appears to be too long, moving one byte from leader to record body, " +
                                    "and trying again");
                    // shift the body right by one and put a '0' in front of it
                    final byte oldBody[] = recordBuf;
                    recordBuf = new byte[oldBody.length + 1];
                    System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
                    recordBuf[0] = (byte) '0';
                    directoryLength = offset + 1;
                    ldr.setIndicatorCount(2);
                    ldr.setSubfieldCodeLength(2);
                    ldr.setImplDefined1(("" + (char) byteArray[7] + " ").toCharArray());
                    ldr.setImplDefined2(("" + (char) byteArray[16] + (char) byteArray[17] + (char) byteArray[18])
                            .toCharArray());
                    ldr.setEntryMap("4500".toCharArray());
                    // if its ' ' or 'a'
                    // NOTE(review): position 8 is tested but position 10 is stored; this looks like a
                    // copy/paste slip, left unchanged because the intended position cannot be confirmed here.
                    if (byteArray[8] == (byte) ' ' || byteArray[8] == (byte) 'a') {
                        ldr.setCharCodingScheme((char) byteArray[10]);
                    }
                    // if its ' ' or 'a'
                    if (byteArray[10] == (byte) ' ' || byteArray[10] == (byte) 'a') {
                        ldr.setCharCodingScheme((char) byteArray[10]);
                    }
                } else {
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                            "error parsing leader with data: " + new String(byteArray));
                    throw new MarcException("error parsing leader with data: " + new String(byteArray), e);
                }
            }
        } else {
            throw new MarcException("error parsing leader with data: " + new String(byteArray), e);
        }
    }
    final char tmp[] = ldr.getEntryMap();
    if (permissive && !("" + tmp[0] + tmp[1] + tmp[2] + tmp[3]).equals("4500")) {
        if (tmp[0] >= '0' && tmp[0] <= '9' && tmp[1] >= '0' && tmp[1] <= '9' && tmp[2] >= '0' && tmp[2] <= '9' &&
                tmp[3] >= '0' && tmp[3] <= '9') {
            // all digits, just not the usual ones: report but keep
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO,
                    "Unusual character found at end of leader [ " + tmp[0] + tmp[1] + tmp[2] + tmp[3] + " ]");
        } else {
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.ERROR_TYPO,
                    "Erroneous character found at end of leader [ " + tmp[0] + tmp[1] + tmp[2] + tmp[3] +
                            " ]; changing them to the standard \"4500\"");
            ldr.setEntryMap("4500".toCharArray());
        }
    }
    // if MARC 21 then check encoding
    switch (ldr.getCharCodingScheme()) {
        case 'a':
            encoding = "UTF8";
            break;
        case ' ':
            if (convertToUTF8) {
                encoding = defaultEncoding;
            } else {
                encoding = "ISO8859_1";
            }
            break;
        default:
            if (convertToUTF8) {
                if (permissive) {
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                            "Record character encoding should be 'a' or ' ' in this record it is '" + ldr
                                    .getCharCodingScheme() + "'. Attempting to guess the correct encoding.");
                    encoding = "BESTGUESS";
                } else {
                    encoding = defaultEncoding;
                }
            } else {
                encoding = "ISO8859_1";
            }
            break;
    }
    String utfCheck;
    if (encoding.equalsIgnoreCase("BESTGUESS")) {
        try {
            final String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1");
            // If record has MARC8 character set selection strings, it must be MARC8 encoded
            if (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1) {
                encoding = "MARC8";
            } else {
                boolean hasHighBitChars = false;
                for (int i = 0; i < recordBuf.length; i++) {
                    if (recordBuf[i] < 0) {
                        // the high bit is set
                        hasHighBitChars = true;
                        break;
                    }
                }
                if (!hasHighBitChars) {
                    encoding = "ISO8859_1"; // You can choose any encoding you want here, the results will be the
                    // same.
                } else {
                    // The bytes are treated as UTF-8 only if decoding and re-encoding them as UTF-8
                    // reproduces them exactly.
                    utfCheck = new String(recordBuf, "UTF-8");
                    final byte byteCheck[] = utfCheck.getBytes("UTF-8");
                    encoding = "UTF8";
                    if (recordBuf.length == byteCheck.length) {
                        for (int i = 0; i < recordBuf.length; i++) {
                            if (byteCheck[i] != recordBuf[i]) {
                                encoding = "MARC8-Maybe";
                                break;
                            }
                        }
                    } else {
                        encoding = "MARC8-Maybe";
                    }
                }
            }
        } catch (final UnsupportedEncodingException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    } else if (permissive && encoding.equals("UTF8")) {
        try {
            // The record claims UTF-8: check the claim with the same decode/re-encode round trip.
            utfCheck = new String(recordBuf, "UTF-8");
            final byte byteCheck[] = utfCheck.getBytes("UTF-8");
            if (recordBuf.length != byteCheck.length) {
                boolean foundESC = false;
                for (int i = 0; i < recordBuf.length; i++) {
                    if (recordBuf[i] == 0x1B) {
                        errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                                "Record claims to be UTF-8, but its not. Its probably MARC8.");
                        encoding = "MARC8-Maybe";
                        foundESC = true;
                        break;
                    }
                    if (byteCheck[i] != recordBuf[i]) {
                        encoding = "MARC8-Maybe";
                    }
                }
                if (!foundESC) {
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                            "Record claims to be UTF-8, but its not. It may be MARC8, or maybe UNIMARC, " +
                                    "or maybe raw ISO-8859-1 ");
                }
            }
            if (utfCheck.contains("a$1!")) {
                encoding = "MARC8-Broken";
                errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                        "Record claims to be UTF-8, but its not. It seems to be MARC8-encoded but with " +
                                "missing escape codes.");
            }
        } catch (final UnsupportedEncodingException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    } else if (permissive && !encoding.equals("UTF8") && convertToUTF8) {
        try {
            final String marc8EscSeqCheck = new String(recordBuf, "ISO-8859-1");
            final boolean hasMarc8EscSeq = (marc8EscSeqCheck.split("\\e[-(,)$bsp]", 2).length > 1);
            utfCheck = new String(recordBuf, "UTF-8");
            final byte byteCheck[] = utfCheck.getBytes("UTF-8");
            if (recordBuf.length == byteCheck.length) {
                for (int i = 0; i < recordBuf.length; i++) {
                    // need to check for byte < 0 to see if the high bit is set,
                    // because Java doesn't have unsigned types.
                    if (recordBuf[i] < 0x00 || byteCheck[i] != recordBuf[i]) {
                        // If record has MARC8 character set selection strings, it must be MARC8 encoded
                        if (hasMarc8EscSeq) {
                            errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                                    "Record has MARC8 escape sequences, " +
                                            "but also seem to have UTF8-encoded characters.");
                            encoding = "MARC8-Maybe";
                        } else {
                            errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                                    "Record claims not to be UTF-8, but it seems to be.");
                            encoding = "UTF8-Maybe";
                        }
                        break;
                    }
                }
            }
        } catch (final UnsupportedEncodingException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    record.setLeader(ldr);
    boolean discardOneAtStartOfDirectory = false;
    boolean discardOneSomewhereInDirectory = false;
    if ((directoryLength % 12) != 0) {
        // which equals 99999 - (24 + 1) its a
        // BIG record (its directory is over
        // 100000 bytes)
        if (permissive && directoryLength == 99974 && recordLength > 200000) {
            directoryLength = 0;
            int tmpLength = 0;
            // Look for the field terminator that ends the directory, one 12 byte entry at a time.
            // The scan is bounded by the buffer itself (not by the stated record length, which is
            // larger than the buffer) so that a missing terminator ends in the exception below
            // rather than in an array index overrun.
            for (tmpLength = 0; tmpLength < recordBuf.length; tmpLength += 12) {
                if (recordBuf[tmpLength] == Constants.FT) {
                    directoryLength = tmpLength;
                    break;
                }
            }
            if (directoryLength == 0) {
                throw new MarcException(
                        "Directory is too big (> 99999 bytes) and it doesn't end with a field terminator " +
                                "character, I give up. Unable to continue.");
            }
        } else if (permissive && directoryLength % 12 == 11 && recordBuf[1] != (byte) '0') {
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                    "Directory length is not a multiple of 12 bytes long. Prepending a zero and trying to " +
                            "continue.");
            final byte oldBody[] = recordBuf;
            recordBuf = new byte[oldBody.length + 1];
            System.arraycopy(oldBody, 0, recordBuf, 1, oldBody.length);
            recordBuf[0] = (byte) '0';
            directoryLength = directoryLength + 1;
        } else {
            if (permissive && directoryLength % 12 == 1 && recordBuf[1] == (byte) '0' &&
                    recordBuf[2] == (byte) '0') {
                discardOneAtStartOfDirectory = true;
                errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                        "Directory length is not a multiple of 12 bytes long. Discarding byte from start of " +
                                "directory and trying to continue.");
            } else if (permissive && directoryLength % 12 == 1 && recordLength > 10000 &&
                    recordBuf[0] == (byte) '0' && recordBuf[1] == (byte) '0' && recordBuf[2] > (byte) '0' &&
                    recordBuf[2] <= (byte) '9') {
                discardOneSomewhereInDirectory = true;
                errors.addError("unknown", "n/a", "n/a", ErrorHandler.MAJOR_ERROR,
                        "Directory length is not a multiple of 12 bytes long. Will look for oversized field " +
                                "and try to work around it.");
            } else {
                if (errors != null) {
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                            "Directory length is not a multiple of 12 bytes long. Unable to continue.");
                }
                throw new MarcException(
                        "Directory length is not a multiple of 12 bytes long. Unable to continue.");
            }
        }
    }
    final DataInputStream inputrec = new DataInputStream(new ByteArrayInputStream(recordBuf));
    // each directory entry is 12 bytes: 3 for the tag, 4 for the field length, 5 for the start position
    final int size = directoryLength / 12;
    final String[] tags = new String[size];
    final int[] lengths = new int[size];
    final byte[] tag = new byte[3];
    final byte[] length = new byte[4];
    final byte[] start = new byte[5];
    String tmpStr;
    try {
        if (discardOneAtStartOfDirectory) {
            inputrec.read();
        }
        // running sum of the field lengths read so far, i.e. the expected start position of the next field
        int totalOffset = 0;
        for (int i = 0; i < size; i++) {
            inputrec.readFully(tag);
            tmpStr = new String(tag);
            tags[i] = tmpStr;
            boolean proceedNormally = true;
            if (discardOneSomewhereInDirectory) {
                // Peek at the next 10 bytes to find out whether this entry uses a 4 or a 5 digit length:
                // the start position that follows the length must equal the running offset.
                final byte lenCheck[] = new byte[10];
                inputrec.mark(20);
                inputrec.readFully(lenCheck);
                // proceed normally
                if (byteCompare(lenCheck, 4, 5, totalOffset)) {
                    proceedNormally = true;
                } else if (byteCompare(lenCheck, 5, 5, totalOffset)) {
                    // field length is 5 bytes! Bad Marc record,
                    // proceed normally
                    discardOneSomewhereInDirectory = false;
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                            "Field is longer than 9999 bytes. Writing this record out will result in a bad " +
                                    "record.");
                    proceedNormally = false;
                } else {
                    errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                            "Unable to reconcile problems in directory. Unable to continue.");
                    throw new MarcException(
                            "Directory length is not a multiple of 12 bytes long. Unable to continue.");
                }
                inputrec.reset();
            }
            if (proceedNormally) {
                inputrec.readFully(length);
                tmpStr = new String(length);
                lengths[i] = Integer.parseInt(tmpStr);
                inputrec.readFully(start);
            } else {
                // length is 5 bytes long
                inputrec.readFully(start);
                tmpStr = new String(start);
                lengths[i] = Integer.parseInt(tmpStr);
                inputrec.readFully(start);
            }
            totalOffset += lengths[i];
        }
        // If we still haven't found the extra byte, throw out the last byte and try to continue;
        if (discardOneSomewhereInDirectory) {
            inputrec.read();
        }
        if (inputrec.read() != Constants.FT) {
            // errors is null for non-permissive readers
            if (errors != null) {
                errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                        "Expected field terminator at end of directory. Unable to continue.");
            }
            throw new MarcException("expected field terminator at end of directory");
        }
        int numBadLengths = 0;
        int totalLength = 0;
        for (int i = 0; i < size; i++) {
            final int fieldLength = getFieldLength(inputrec);
            if (fieldLength + 1 != lengths[i] && permissive) {
                if (numBadLengths < 5 && (totalLength + fieldLength < recordLength + 26)) {
                    // Peek at the bytes the directory claims for this field without consuming them.
                    inputrec.mark(9999);
                    byteArray = new byte[lengths[i]];
                    inputrec.readFully(byteArray);
                    inputrec.reset();
                    if (fieldLength + 1 < lengths[i] && byteArray[lengths[i] - 1] == Constants.FT) {
                        // the directory length is right; the field merely contains a terminator
                        errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                                "Field Terminator character found in the middle of a field.");
                    } else {
                        numBadLengths++;
                        lengths[i] = fieldLength + 1;
                        errors.addError("unknown", "n/a", "n/a", ErrorHandler.MINOR_ERROR,
                                "Field length found in record different from length stated in the directory.");
                        if (fieldLength + 1 > 9999) {
                            errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                                    "Field length is greater than 9999, record cannot be represented as a " +
                                            "binary Marc record.");
                        }
                    }
                }
            }
            totalLength += lengths[i];
            if (Constants.CF_TAG_PATTERN.matcher(tags[i]).find()) {
                // control field: the data is everything but the trailing field terminator
                byteArray = new byte[lengths[i] - 1];
                inputrec.readFully(byteArray);
                if (inputrec.read() != Constants.FT) {
                    // errors is null for non-permissive readers
                    if (errors != null) {
                        errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                                "Expected field terminator at end of field. Unable to continue.");
                    }
                    throw new MarcException("expected field terminator at end of field");
                }
                final ControlField field = factory.newControlField();
                field.setTag(tags[i]);
                field.setData(getDataAsString(byteArray));
                record.addVariableField(field);
            } else {
                // data field: handed over including its terminator
                byteArray = new byte[lengths[i]];
                inputrec.readFully(byteArray);
                try {
                    record.addVariableField(parseDataField(tags[i], byteArray));
                } catch (final IOException e) {
                    throw new MarcException("error parsing data field for tag: " + tags[i] + " with data: " +
                            new String(byteArray), e);
                }
            }
        }
        // We've determined that although the record says it is UTF-8, it is not.
        // Here we make an attempt to determine the actual encoding of the data in the record.
        if (permissive && conversionCheck1.length() > 1 && conversionCheck2.length() > 1 && conversionCheck3
                .length() > 1) {
            guessAndSelectCorrectNonUTF8Encoding();
        }
        if (inputrec.read() != Constants.RT) {
            // errors is null for non-permissive readers
            if (errors != null) {
                errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                        "Expected record terminator at end of record. Unable to continue.");
            }
            throw new MarcException("expected record terminator");
        }
    } catch (final IOException e) {
        // errors is null for non-permissive readers
        if (errors != null) {
            errors.addError("unknown", "n/a", "n/a", ErrorHandler.FATAL,
                    "Error reading from data file. Unable to continue.");
        }
        throw new MarcException("an error occured reading input", e);
    }
}
/**
 * Checks whether a run of ASCII digits in a byte array spells out the given number.
 *
 * The digits are compared from the least significant one (the last position of the run) towards the most
 * significant one, so the number is expected right-aligned and zero-padded within the run.
 *
 * @param lenCheck the bytes to inspect
 * @param offset the index of the first (most significant) digit of the run
 * @param length the number of digits in the run
 * @param totalOffset the number the digits are expected to represent
 * @return true if every digit of the run matches
 */
private boolean byteCompare(final byte[] lenCheck, final int offset, final int length, final int totalOffset) {
    int divisor = 1;
    int pos = offset + length - 1;
    while (pos >= offset) {
        final int expectedDigit = ((totalOffset / divisor) % 10) + '0';
        if (lenCheck[pos] != expectedDigit) {
            return false;
        }
        pos--;
        divisor *= 10;
    }
    return true;
}
/**
 * Decides which of the three candidate translations collected while parsing the record to keep, and rewrites
 * the subfields accordingly.
 *
 * The three candidates are held in conversionCheck1, conversionCheck2 and conversionCheck3; going by the
 * messages logged below they are the MARC8, Unimarc and ISO-8859-1 translations respectively. Subfields
 * whose data holds all candidates carry them separated by the marker "%%@%%"; once a candidate has been
 * chosen each such subfield is reduced to the chosen part.
 *
 * The default choice is MARC8, or Unimarc when the record has no 245 field. The heuristics are applied in
 * order and the first one that matches decides.
 */
private void guessAndSelectCorrectNonUTF8Encoding() {
    int defaultPart = 0;
    if (record.getVariableField("245") == null) {
        defaultPart = 1;
    }
    final int partToUse;
    final int l1 = conversionCheck1.length();
    final int l2 = conversionCheck2.length();
    final int l3 = conversionCheck3.length();
    int tst;
    if (l1 < l3 && l2 == l3 && defaultPart == 0) {
        errors.addError(ErrorHandler.INFO, "MARC8 translation shorter than ISO-8859-1, choosing MARC8.");
        partToUse = 0;
    } else if (l2 < l1 - 2 && l2 < l3 - 2) {
        errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it.");
        partToUse = 1;
    } else if ((tst = onlyOneStartsWithUpperCase(conversionCheck1, conversionCheck2, conversionCheck3)) != -1) {
        partToUse = tst;
    } else if (l2 < l1 && l2 < l3) {
        errors.addError(ErrorHandler.INFO, "Unimarc translation shortest, choosing it.");
        partToUse = 1;
    } else if (conversionCheck2.equals(conversionCheck3) && !conversionCheck1.trim().contains(" ")) {
        errors.addError(ErrorHandler.INFO, "Unimarc and ISO-8859-1 translations identical, choosing ISO-8859-1.");
        partToUse = 2;
    } else if (!specialCharIsBetweenLetters(conversionCheck1)) {
        errors.addError(ErrorHandler.INFO, "To few letters in translations, choosing " + (defaultPart == 0
                ? "MARC8" : "Unimarc"));
        partToUse = defaultPart;
    } else if (l2 == l3 && defaultPart == 1) {
        errors.addError(ErrorHandler.INFO,
                "Unimarc and ISO-8859-1 translations equal length, choosing ISO-8859-1.");
        partToUse = 2;
    } else {
        errors.addError(ErrorHandler.INFO, "No Determination made, defaulting to " + (defaultPart == 0 ? "MARC8"
                : "Unimarc"));
        partToUse = defaultPart;
    }
    // Typed iteration: the raw List/Iterator form cannot assign next() to VariableField or Subfield.
    for (final VariableField field : record.getVariableFields()) {
        if (!(field instanceof DataField)) {
            continue;
        }
        for (final Subfield sf : ((DataField) field).getSubfields()) {
            final String data = sf.getData();
            if (data.contains("%%@%%")) {
                // keep only the translation that was selected above
                final String parts[] = data.split("%%@%%", 3);
                sf.setData(parts[partToUse]);
            }
        }
    }
}
/**
 * Looks for the first position at which exactly one of the three candidate translations starts with an upper
 * case character.
 * <p>
 * Each argument is a concatenation of segments, every segment being introduced by the marker <code>|&gt;</code>.
 * The segments are compared pairwise by position; the text before the first marker is ignored. An empty segment
 * is treated as not starting with an upper case character.
 *
 * @param conversionCheck12 The accumulated segments of the first translation (index 0)
 * @param conversionCheck22 The accumulated segments of the second translation (index 1)
 * @param conversionCheck32 The accumulated segments of the third translation (index 2)
 * @return 0 if only the first translation starts a segment in upper case, 2 if only the third one does, and -1
 *         in every other case (including when any argument is empty)
 */
private int onlyOneStartsWithUpperCase(final String conversionCheck12, final String conversionCheck22,
        final String conversionCheck32) {
    // Work on the arguments rather than on instance state, so the result depends only on what was passed in.
    if (conversionCheck12.length() == 0 || conversionCheck22.length() == 0 || conversionCheck32.length() == 0) {
        return -1;
    }

    final String[] firstParts = conversionCheck12.trim().split("[|]>");
    final String[] secondParts = conversionCheck22.trim().split("[|]>");
    final String[] thirdParts = conversionCheck32.trim().split("[|]>");
    final int limit = Math.min(firstParts.length, Math.min(secondParts.length, thirdParts.length));

    // Index 0 holds whatever precedes the first marker, so the comparison starts at 1.
    for (int i = 1; i < limit; i++) {
        final boolean firstUpper = firstCharIsUpperCase(firstParts[i]);
        final boolean secondUpper = firstCharIsUpperCase(secondParts[i]);
        final boolean thirdUpper = firstCharIsUpperCase(thirdParts[i]);

        if (firstUpper && !secondUpper && !thirdUpper) {
            return 0;
        }
        if (!firstUpper && secondUpper && !thirdUpper) {
            // NOTE(review): the symmetric answer here would be 1; -1 is kept from the original code and makes
            // the caller fall through to its later heuristics -- confirm this is intentional.
            return -1;
        }
        if (!firstUpper && !secondUpper && thirdUpper) {
            return 2;
        }
    }
    return -1;
}

/** Tells whether the supplied text is non-empty and begins with an upper case character. */
private static boolean firstCharIsUpperCase(final String text) {
    return text.length() > 0 && Character.isUpperCase(text.charAt(0));
}
/**
 * Checks whether the non-ASCII characters of a translation appear next to letters.
 *
 * @param conversionCheck The translated text to inspect
 * @return <code>true</code> if the text has no character above 0x7f, or if at least one such character has a
 *         letter directly before or directly after it; <code>false</code> otherwise
 */
private boolean specialCharIsBetweenLetters(final String conversionCheck) {
    final int length = conversionCheck.length();
    boolean sawSpecial = false;

    for (int pos = 0; pos < length; pos++) {
        if (conversionCheck.charAt(pos) <= 0x7f) {
            continue;
        }
        sawSpecial = true;

        final boolean letterBefore = pos > 0 && Character.isLetter((int) conversionCheck.charAt(pos - 1));
        final boolean letterAfter = pos < length - 1 && Character.isLetter((int) conversionCheck.charAt(pos + 1));

        if (letterBefore || letterAfter) {
            return true;
        }
    }
    // Only special characters without a neighbouring letter (or none at all) were seen.
    return !sawSpecial;
}
/**
 * Finds the first occurrence of a byte value in an array.
 *
 * @param byteArray The array to search
 * @param ft The value to look for; it is narrowed to a byte before comparing
 * @return The index of the first matching element, or -1 if there is none
 */
private int arrayContainsAt(final byte[] byteArray, final int ft) {
    final byte wanted = (byte) ft;
    int index = 0;

    for (final byte current : byteArray) {
        if (current == wanted) {
            return index;
        }
        index++;
    }
    return -1;
}
/**
 * Builds a data field from the raw bytes of one variable field.
 * <p>
 * The first two bytes are taken as the indicators; the remainder is read as a sequence of subfields, each one
 * introduced by a <code>Constants.US</code> byte followed by a one byte subfield code. In permissive mode the
 * bytes are first passed through <code>cleanupBadFieldSeperators</code> and several kinds of invalid subfield
 * code are repaired and reported to the error handler instead of being stored as they are.
 *
 * @param tag The tag of the field being parsed
 * @param field The raw bytes of the field; in permissive mode this array may be modified in place
 * @return The populated data field
 * @throws IOException If the field ends right after a subfield separator, or if a zero length subfield is found
 *         while not in permissive mode
 */
private DataField parseDataField(final String tag, final byte[] field) throws IOException {
if (permissive) {
// Tell the error handler which record and field the following messages belong to.
errors.setRecordID(record.getControlNumber());
if (tag.equals("880")) {
// For an 880 field, report the text that follows the "separator + 6" prefix, reduced to its leading run of
// digits and hyphens, alongside the tag.
// NOTE(review): new String(byte[]) decodes with the platform default charset -- confirm that is acceptable.
String fieldTag = new String(field);
fieldTag = fieldTag.replaceFirst("^.*\\x1F6", "").replaceFirst("([-0-9]*).*", "$1");
errors.setCurrentField(tag + "(" + fieldTag + ")");
} else {
errors.setCurrentField(tag);
}
errors.setCurrentSubfield("n/a");
cleanupBadFieldSeperators(field, errors);
}
final ByteArrayInputStream bais = new ByteArrayInputStream(field);
// The two leading bytes are the indicators.
final char ind1 = (char) bais.read();
final char ind2 = (char) bais.read();
final DataField dataField = factory.newDataField();
dataField.setTag(tag);
dataField.setIndicator1(ind1);
dataField.setIndicator2(ind2);
int code;
int size;
int readByte;
byte[] data;
Subfield subfield;
// Read until the stream is exhausted; bytes other than US and FT fall through the switch and are skipped.
while (true) {
readByte = bais.read();
if (readByte < 0) {
break;
}
switch (readByte) {
case Constants.US:
code = bais.read();
if (code < 0) {
throw new IOException("unexpected end of data field");
}
if (code == Constants.FT) {
// Leaves the switch only; the enclosing loop keeps reading.
break;
}
size = getSubfieldLength(bais);
if (size == 0) {
if (permissive) {
errors.addError(ErrorHandler.MINOR_ERROR,
"Subfield of zero length encountered, ignoring it.");
// Resumes the enclosing while loop with the next byte.
continue;
}
throw new IOException("Subfield of zero length encountered");
}
data = new byte[size];
bais.read(data);
subfield = factory.newSubfield();
if (permissive) {
errors.setCurrentSubfield("" + (char) code);
}
String dataAsString = getDataAsString(data);
if (permissive && code == Constants.US) {
// Two separators in a row: take the first data byte as the code and drop the first decoded character.
code = data[0];
dataAsString = dataAsString.substring(1);
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield tag is a subfield separator, using first character of field as " +
"subfield tag.");
} else if (permissive && validSubfieldCodes.indexOf(code) == -1) {
if (code >= 'A' && code <= 'Z') {
// Upper case codes are lower-cased unless the system property named by upperCaseSubfieldsProperty is true.
final String ucSubfields = System.getProperty(upperCaseSubfieldsProperty, "false");
if (Boolean.parseBoolean(ucSubfields) == false) {
code = Character.toLowerCase(code);
errors.addError(ErrorHandler.MINOR_ERROR,
"Subfield tag is an invalid uppercase character, changing it to lower case.");
} else {
// the System Property org.marc4j.MarcPermissiveStreamReader.upperCaseSubfields is
// defined to allow upperCaseSubfields
// therefore do nothing and be happy
}
} else if (code > 0x7f) {
code = data[0];
dataAsString = dataAsString.substring(1);
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield tag is an invalid character greater than 0x7f, using first character " +
"of field as subfield tag.");
} else if (code == '[' && tag.equals("245")) {
// In a 245 field an opening bracket is treated as data of a generated subfield 'h'.
code = 'h';
dataAsString = '[' + dataAsString;
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield tag is an open bracket, generating a code 'h' and pushing the bracket " +
"to the data.");
} else if (code == ' ') {
// Reported only; the space is kept as the subfield code.
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield tag is a space which is an invalid character");
} else {
// Reported only; the invalid character is kept as the subfield code.
errors.addError(ErrorHandler.MAJOR_ERROR, "Subfield tag is an invalid character, [ " +
((char) code) + " ]");
}
}
subfield.setCode((char) code);
subfield.setData(dataAsString);
dataField.addSubfield(subfield);
break;
case Constants.FT:
break;
}
}
return dataField;
}
// Converter shared by all calls of cleanupBadFieldSeperators; created on first use.
// NOTE(review): the lazy initialisation below is not synchronised -- confirm single-threaded use.
static AnselToUnicode conv = null;
/**
 * Scans the raw bytes of a field and rewrites, in place, subfield separator bytes that appear to be data rather
 * than real separators, reporting every change to the error handler.
 * <p>
 * The scan tracks whether an escape byte (0x1B) has been seen and, from the bytes that follow it, whether the
 * data is currently in a multibyte or a Cyrillic character set. A separator found in such a context, or one
 * followed by an invalid subfield code, is replaced by a vertical bar (0x7C); in some cases the byte after it is
 * replaced as well.
 *
 * @param field The raw field bytes, modified in place
 * @param errors The handler that receives a message for each repair
 */
private static void cleanupBadFieldSeperators(final byte[] field, final ErrorHandler errors) {
if (conv == null) {
conv = new AnselToUnicode(true);
}
// hasEsc: an escape byte has been seen somewhere earlier in the field.
boolean hasEsc = false;
// inMultiByte: the last escape sequence selected the multibyte set.
boolean inMultiByte = false;
// justCleaned: the previous iteration replaced a separator.
boolean justCleaned = false;
// mbOffset: countdown used to locate multibyte character boundaries; presumably 0 marks the first byte of a
// three byte character -- TODO confirm.
int mbOffset = 0;
// inCyrillic: the last escape sequence was one of "(,)-'" followed by 'N'.
boolean inCyrillic = false;
// flen: number of bytes seen since the last separator that was left in place.
int flen = 0;
// The loop stops one byte short of the end, so field[i + 1] is always a valid index.
for (int i = 0; i < field.length - 1; i++) {
if (field[i] == 0x1B) {
hasEsc = true;
if ("(,)-'".indexOf((char) field[i + 1]) != -1) {
inMultiByte = false;
if (i + 2 < field.length && (char) field[i + 2] == 'N') {
inCyrillic = true;
} else {
inCyrillic = false;
}
} else if (i + 2 < field.length && field[i + 1] == '$' && field[i + 2] == '1') {
inMultiByte = true;
mbOffset = 3;
} else if (i + 3 < field.length && (field[i + 1] == '$' || field[i + 2] == '$') && (field[i +
2] == '1' || field[i + 3] == '1')) {
inMultiByte = true;
mbOffset = 4;
}
} else if (inMultiByte && (field[i] != 0x20 && field[i] >= 0)) {
// Spaces and bytes with the high bit set do not advance the countdown.
mbOffset = (mbOffset == 0) ? 2 : mbOffset - 1;
}
if (inMultiByte && mbOffset == 0 && i + 2 < field.length && field[i] > 0) {
// Assemble the next three bytes (stepping over a space) and ask the converter whether they form a known
// multibyte character; separator bytes are looked up as vertical bars.
char c;
final byte f1 = field[i];
final byte f2 = field[i + 1] == 0x20 ? field[i + 2] : field[i + 1];
// NOTE(review): field[i + 3] is read while only i + 2 < field.length is guaranteed -- confirm this cannot
// run past the end of the array.
final byte f3 = (field[i + 1] == 0x20 || field[i + 2] == 0x20) ? field[i + 3] : field[i + 2];
c = conv.getMBChar(conv.makeMultibyte((char) ((f1 == Constants.US) ? 0x7C : f1),
(char) ((f2 == Constants.US) ? 0x7C : f2), (char) ((f3 == Constants.US) ? 0x7C : f3)));
if (c == 0 && !justCleaned) {
errors.addError(ErrorHandler.MAJOR_ERROR,
"Bad Multibyte character found, reinterpreting data as non-multibyte data");
inMultiByte = false;
} else if (c == 0 && justCleaned) {
// Retry with '!' in place of the first byte before giving up on multibyte mode.
c = conv.getMBChar(conv.makeMultibyte('!', (char) ((f2 == Constants.US) ? 0x7C : f2),
(char) ((f3 == Constants.US) ? 0x7C : f3)));
if (c == 0) {
errors.addError(ErrorHandler.MAJOR_ERROR,
"Bad Multibyte character found, reinterpreting data as non-multibyte data");
inMultiByte = false;
} else {
errors.addError(ErrorHandler.MAJOR_ERROR,
"Character after restored vertical bar character makes bad multibyte character, " +
"changing it to \"!\"");
field[i] = '!';
}
}
}
justCleaned = false;
if (field[i] == Constants.US) {
if (inMultiByte && mbOffset != 0) {
field[i] = 0x7C;
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield separator found in middle of a multibyte character, changing it to a " +
"vertical bar, and continuing");
if (field[i + 1] == '0') {
// NOTE(review): field[i + 2] and field[i + 3] are read without a bounds check here -- confirm.
if (field[i + 2] == '(' && field[i + 3] == 'B') {
field[i + 1] = 0x1B;
errors.addError(ErrorHandler.MAJOR_ERROR,
"Character after restored vertical bar character makes bad multibyte character, " +
"changing it to ESC");
} else {
field[i + 1] = 0x21;
errors.addError(ErrorHandler.MAJOR_ERROR,
"Character after restored vertical bar character makes bad multibyte character, " +
"changing it to \"!\"");
}
}
justCleaned = true;
} else if (hasEsc && inCyrillic) {
// prev holds the bytes since the last separator except the first one of them.
// NOTE(review): decoded with the platform default charset, and the length argument is negative when flen
// is 0 (a separator directly after another separator) -- confirm neither can cause trouble.
final String prev = new String(field, i - (flen - 1), flen - 1);
if (!(field[i + 1] >= 'a' && field[i + 1] <= 'z') || prev.equals("\u001b(N")) {
errors.addError(ErrorHandler.MINOR_ERROR,
"Subfield separator found in Cyrillic string, changing separator to a vertical bar, " +
"and continuing");
field[i] = 0x7C;
justCleaned = true;
}
} else if (hasEsc && !checkSubfieldByte(field[i + 1])) {
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield separator followed by invalid subfield tag, changing separator to a vertical " +
"bar, and continuing");
field[i] = 0x7C;
justCleaned = true;
} else if (hasEsc && i < field.length - 3 && (field[i + 1] == '0' && field[i + 2] == '(' && field[i +
3] == 'B')) {
// The '0' before "(B" is replaced by an escape byte.
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield separator followed by invalid subfield tag, changing separator to a vertical " +
"bar, and continuing");
field[i] = 0x7C;
field[i + 1] = 0x1B;
justCleaned = true;
} else if (hasEsc && (field[i + 1] == '0')) {
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield separator followed by invalid subfield tag, changing separator to a vertical " +
"bar, and continuing");
field[i] = 0x7C;
field[i + 1] = 0x21;
justCleaned = true;
// NOTE(review): field[i + 2] below is read without a bounds check -- confirm.
} else if (field[i + 1] == Constants.US && field[i + 2] == Constants.US) {
errors.addError(ErrorHandler.MAJOR_ERROR,
"Three consecutive subfield separators, changing first two to vertical bars.");
field[i] = 0x7C;
field[i + 1] = 0x7C;
justCleaned = true;
}
}
// A separator that survived the checks above starts a new run; anything else extends the current one.
if (field[i] == Constants.US) {
flen = 0;
} else {
flen++;
}
}
}
/**
 * Tells whether a byte is acceptable as a subfield code, i.e. an ASCII digit or an ASCII lower case letter.
 *
 * @param aByte The byte to check
 * @return <code>true</code> for '0'-'9' and 'a'-'z', <code>false</code> for everything else
 */
private static boolean checkSubfieldByte(final byte aByte) {
    if (aByte >= '0' && aByte <= '9') {
        return true;
    }
    return aByte >= 'a' && aByte <= 'z';
}
/**
 * Counts the bytes between the current stream position and the next field terminator, then rewinds the stream
 * to where it started.
 *
 * @param bais The stream positioned at the start of a field
 * @return The number of bytes before the field terminator, or before the end of the stream when running
 *         permissively
 * @throws IOException If the stream ends before a field terminator is found and the reader is not permissive
 */
private int getFieldLength(final DataInputStream bais) throws IOException {
    bais.mark(9999);
    int count = 0;

    for (int next = bais.read(); next != Constants.FT; next = bais.read()) {
        if (next == -1) {
            bais.reset();
            if (!permissive) {
                throw new IOException("Field not terminated");
            }
            errors.addError(ErrorHandler.MINOR_ERROR, "Field not terminated trying to continue");
            return count;
        }
        // Every other byte, subfield separators included, belongs to the field.
        count++;
    }
    bais.reset();
    return count;
}
/**
 * Counts the bytes between the current stream position and the next subfield separator or field terminator,
 * then rewinds the stream to where it started.
 *
 * @param bais The stream positioned at the start of a subfield's data
 * @return The number of bytes before the next separator or terminator, or before the end of the stream when
 *         running permissively
 * @throws IOException If the stream ends first and the reader is not permissive
 */
private int getSubfieldLength(final ByteArrayInputStream bais) throws IOException {
    bais.mark(9999);
    int count = 0;
    int next = bais.read();

    while (next != Constants.FT && next != Constants.US) {
        if (next == -1) {
            bais.reset();
            if (!permissive) {
                throw new IOException("subfield not terminated");
            }
            errors.addError(ErrorHandler.MINOR_ERROR, "Subfield not terminated trying to continue");
            return count;
        }
        count++;
        next = bais.read();
    }
    bais.reset();
    return count;
}
/**
 * Parses the record length from the first five characters of the leader.
 *
 * @param leaderData The raw leader bytes
 * @return The record length
 * @throws IOException If the leader bytes cannot be read
 * @throws MarcException If the first five characters are not a valid integer
 */
private int parseRecordLength(final byte[] leaderData) throws IOException {
    // Decode with an explicit single-byte charset, as parseLeader does, instead of the platform default.
    final InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(leaderData), "ISO-8859-1");
    final char[] tmp = new char[5];
    try {
        // If fewer than five characters are available the rest stays '\0' and the parse below fails.
        isr.read(tmp);
    } finally {
        isr.close();
    }
    try {
        return Integer.parseInt(new String(tmp));
    } catch (final NumberFormatException e) {
        // The error handler is treated as optional elsewhere in this class, so do not let a missing handler
        // replace the real failure with a NullPointerException.
        if (errors != null) {
            errors.addError(ErrorHandler.FATAL, "Unable to parse record length, Unable to Continue");
        }
        throw new MarcException("unable to parse record length", e);
    }
}
/**
 * Fills a leader object from the raw leader bytes, reading its elements in their fixed order.
 * <p>
 * In permissive mode an unparsable indicator count or subfield code length is reported and replaced by 2; an
 * unparsable base address is always an error.
 *
 * @param ldr The leader to populate
 * @param leaderData The raw leader bytes
 * @throws IOException If the leader bytes cannot be read
 * @throws MarcException If a numeric element cannot be parsed and no fallback applies
 */
private void parseLeader(final Leader ldr, final byte[] leaderData) throws IOException {
    final InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(leaderData), "ISO-8859-1");

    // The first five characters are the record length, which has already been computed by the time we get here.
    reader.read(new char[5]);

    ldr.setRecordStatus((char) reader.read());
    ldr.setTypeOfRecord((char) reader.read());

    final char[] implDefined1 = new char[2];
    reader.read(implDefined1);
    ldr.setImplDefined1(implDefined1);

    ldr.setCharCodingScheme((char) reader.read());

    final char indicatorCount = (char) reader.read();
    final char subfieldCodeLength = (char) reader.read();

    final char[] baseAddress = new char[5];
    reader.read(baseAddress);

    final char[] implDefined2 = new char[3];
    reader.read(implDefined2);
    ldr.setImplDefined2(implDefined2);

    final char[] entryMap = new char[4];
    reader.read(entryMap);
    ldr.setEntryMap(entryMap);

    reader.close();

    try {
        ldr.setIndicatorCount(Integer.parseInt(String.valueOf(indicatorCount)));
    } catch (final NumberFormatException e) {
        if (!permissive) {
            throw new MarcException("unable to parse indicator count", e);
        }
        // All Marc21 records should have indicatorCount '2'
        errors.addError(ErrorHandler.ERROR_TYPO, "bogus indicator count - byte value = " + Integer
                .toHexString(indicatorCount & 0xff));
        ldr.setIndicatorCount(2);
    }

    try {
        ldr.setSubfieldCodeLength(Integer.parseInt(String.valueOf(subfieldCodeLength)));
    } catch (final NumberFormatException e) {
        if (!permissive) {
            throw new MarcException("unable to parse subfield code length", e);
        }
        // All Marc21 records should have subfieldCodeLength '2'
        errors.addError(ErrorHandler.ERROR_TYPO, "bogus subfield count - byte value = " + Integer
                .toHexString(subfieldCodeLength & 0xff));
        ldr.setSubfieldCodeLength(2);
    }

    try {
        ldr.setBaseAddressOfData(Integer.parseInt(new String(baseAddress)));
    } catch (final NumberFormatException e) {
        throw new MarcException("unable to parse base address of data", e);
    }
}
private String getDataAsString(final byte[] bytes) {
String dataElement = null;
if (encoding.equals("UTF-8") || encoding.equals("UTF8")) {
try {
dataElement = new String(bytes, "UTF-8");
} catch (final UnsupportedEncodingException e) {
throw new MarcException("unsupported encoding", e);
}
} else if (encoding.equals("UTF8-Maybe")) {
try {
dataElement = new String(bytes, "UTF-8");
} catch (final UnsupportedEncodingException e) {
throw new MarcException("unsupported encoding", e);
}
} else if (encoding.equals("MARC-8") || encoding.equals("MARC8")) {
dataElement = getMarc8Conversion(bytes);
} else if (encoding.equalsIgnoreCase("Unimarc") || encoding.equals("IS05426")) {
dataElement = getUnimarcConversion(bytes);
} else if (encoding.equals("MARC8-Maybe")) {
final String dataElement1 = getMarc8Conversion(bytes);
final String dataElement2 = getUnimarcConversion(bytes);
String dataElement3 = null;
try {
dataElement3 = new String(bytes, "ISO-8859-1");
} catch (final UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if (dataElement1.equals(dataElement2) && dataElement1.equals(dataElement3)) {
dataElement = dataElement1;
} else {
conversionCheck1 = conversionCheck1 + "|>" + Normalizer.normalize(dataElement1, Normalizer.NFC);
conversionCheck2 = conversionCheck2 + "|>" + dataElement2;
conversionCheck3 = conversionCheck3 + "|>" + dataElement3;
dataElement = dataElement1 + "%%@%%" + dataElement2 + "%%@%%" + dataElement3;
}
} else if (encoding.equals("MARC8-Broken")) {
try {
dataElement = new String(bytes, "ISO-8859-1");
} catch (final UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String newdataElement = dataElement.replaceAll("<", "<");
newdataElement = newdataElement.replaceAll(">", ">");
newdataElement = newdataElement.replaceAll("&", "&");
newdataElement = newdataElement.replaceAll("'", "'");
newdataElement = newdataElement.replaceAll(""", "\"");
if (!newdataElement.equals(dataElement)) {
dataElement = newdataElement;
errors.addError(ErrorHandler.ERROR_TYPO,
"Subfield contains escaped html character entities, un-escaping them. ");
}
final String rep1 = "" + (char) 0x1b + "\\$1$1";
final String rep2 = "" + (char) 0x1b + "\\(B";
newdataElement = dataElement.replaceAll("\\$1(.)", rep1);
newdataElement = newdataElement.replaceAll("\\(B", rep2);
if (!newdataElement.equals(dataElement)) {
dataElement = newdataElement;
errors.addError(ErrorHandler.MAJOR_ERROR,
"Subfield seems to be missing MARC8 escape sequences, trying to restore them.");
}
try {
dataElement = getMarc8Conversion(dataElement.getBytes("ISO-8859-1"));
} catch (final UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} else if (encoding.equals("ISO-8859-1") || encoding.equals("ISO8859_1")) {
try {
dataElement = new String(bytes, "ISO-8859-1");
} catch (final UnsupportedEncodingException e) {
throw new MarcException("unsupported encoding", e);
}
} else {
try {
dataElement = new String(bytes, encoding);
} catch (final UnsupportedEncodingException e) {
throw new MarcException("Unknown or unsupported Marc character encoding:" + encoding);
}
}
if (errors != null && dataElement.matches("[^&]*&[a-z]*;.*")) {
String newdataElement = dataElement.replaceAll("<", "<");
newdataElement = newdataElement.replaceAll(">", ">");
newdataElement = newdataElement.replaceAll("&", "&");
newdataElement = newdataElement.replaceAll("'", "'");
newdataElement = newdataElement.replaceAll(""", "\"");
if (!newdataElement.equals(dataElement)) {
dataElement = newdataElement;
errors.addError(ErrorHandler.ERROR_TYPO,
"Subfield contains escaped html character entities, un-escaping them. ");
}
}
return dataElement;
}
/**
 * Tells whether a byte sequence occurs anywhere in a byte array, including at its very end.
 *
 * @param bytes The array to search
 * @param seq The sequence to look for
 * @return <code>true</code> if <code>seq</code> occurs in <code>bytes</code> (an empty sequence always does),
 *         <code>false</code> otherwise
 */
private static boolean byteArrayContains(final byte[] bytes, final byte[] seq) {
    if (seq.length == 0) {
        return true;
    }
    // The last possible start position is inclusive; stopping one short of it would miss a match that ends
    // exactly at the end of the array.
    final int lastStart = bytes.length - seq.length;

    for (int start = 0; start <= lastStart; start++) {
        int matched = 0;
        while (matched < seq.length && bytes[start + matched] == seq[matched]) {
            matched++;
        }
        if (matched == seq.length) {
            return true;
        }
    }
    return false;
}
// Byte sequence "b-", escape, "s" that marks an odd pattern of subscript or superscript escapes.
static byte badEsc[] = { (byte) ('b'), (byte) ('-'), 0x1b, (byte) ('s') };
// The single byte 0xaf, an overbar that is replaced by 0xe5 before conversion.
static byte overbar[] = { (byte) (char) (0xaf) };
// A numeric character reference of four hex digits, optionally followed by a stray "%x" and/or a semicolon.
private static final Pattern MALFORMED_NCR_PATTERN = Pattern.compile(
        "&#x([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])(%x)?;?");
/**
 * Converts MARC-8 encoded bytes to a string using the supplied converter.
 * <p>
 * In permissive mode two known defects of the input are repaired before converting (an odd pattern of
 * subscript or superscript escapes, and a 0xaf overbar byte), and, if <code>doNCR</code> is set, malformed
 * numeric character references in the converted text are replaced by the characters they denote.
 *
 * @param bytes The MARC-8 encoded bytes to convert
 * @param conv An Ansel to Unicode converter
 * @param permissive Whether this is done in a permissive manner
 * @param errors An error handler; it is used only in permissive mode
 * @param doNCR Whether to translate malformed numeric character references
 * @return The converted string
 */
public static String getMarc8Conversion(final byte[] bytes, final AnselToUnicode conv, final boolean permissive,
        final ErrorHandler errors, final boolean doNCR) {
    String dataElement;

    if (permissive && (byteArrayContains(bytes, badEsc) || byteArrayContains(bytes, overbar))) {
        // Go through ISO-8859-1 so that every byte maps to exactly one character while patching.
        try {
            dataElement = new String(bytes, "ISO-8859-1");
        } catch (final UnsupportedEncodingException e) {
            throw new MarcException("unsupported encoding", e);
        }

        String newDataElement = dataElement.replaceAll("(\\e)b-\\es([psb$()])", "$1$2");
        if (!newDataElement.equals(dataElement)) {
            dataElement = newDataElement;
            errors.addError(ErrorHandler.MINOR_ERROR,
                    "Subfield contains odd pattern of subscript or superscript escapes. ");
        }

        newDataElement = dataElement.replace((char) 0xaf, (char) 0xe5);
        if (!newDataElement.equals(dataElement)) {
            dataElement = newDataElement;
            errors.addError(ErrorHandler.ERROR_TYPO,
                    "Subfield contains 0xaf overbar character, changing it to proper MARC8 representation ");
        }
        dataElement = conv.convert(dataElement);
    } else {
        dataElement = conv.convert(bytes);
    }

    // This code handles malformed Numeric Character references that either contain
    // an extraneous %x or which are missing the final semicolon
    if (doNCR && permissive && dataElement.matches(
            "[^&]*&#x[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][^;].*")) {
        final Matcher matcher = MALFORMED_NCR_PATTERN.matcher(dataElement);
        final StringBuilder newElement = new StringBuilder();
        int prevEnd = 0;

        while (matcher.find()) {
            newElement.append(dataElement.substring(prevEnd, matcher.start()));
            newElement.append(getChar(matcher.group(1)));

            // Inspect the whole match: the hex digits alone can neither contain "%x" nor end with ';'.
            final String reference = matcher.group(0);
            if (reference.contains("%x") || !reference.endsWith(";")) {
                errors.addError(ErrorHandler.MINOR_ERROR,
                        "Subfield contains malformed Unicode Numeric Character Reference : " + reference);
            }
            prevEnd = matcher.end();
        }
        newElement.append(dataElement.substring(prevEnd));
        dataElement = newElement.toString();
    }
    return dataElement;
}
/**
 * Converts MARC-8 encoded bytes to a string using this reader's own converter and settings.
 * <p>
 * The converter is created on first use with this reader's error handler.
 *
 * @param bytes The MARC-8 encoded bytes to convert
 * @return The converted string
 */
private String getMarc8Conversion(final byte[] bytes) {
    if (converterAnsel == null) {
        converterAnsel = new AnselToUnicode(errors);
    }

    // The converter's own NCR translation is only ever switched on here, never off.
    if (isTranslateLosslessUnicodeNumericCodeReferencesEnabled()) {
        converterAnsel.setTranslateNCR(isTranslateLosslessUnicodeNumericCodeReferencesEnabled());
    }

    return getMarc8Conversion(bytes, converterAnsel, permissive, errors,
            translateLosslessUnicodeNumericCodeReferencesEnabled);
}
/**
 * Converts Unimarc (ISO 5426) encoded bytes to a string.
 * <p>
 * After conversion the control characters U+0088 and U+0089 are removed, and placeholders of the form
 * <code>&lt;U+XXXX&gt;</code> (four hex digits) are replaced by the character with that code.
 *
 * @param bytes The bytes to convert
 * @return The converted string
 */
private String getUnimarcConversion(final byte[] bytes) {
    if (converterUnimarc == null) {
        converterUnimarc = new Iso5426ToUnicode();
    }

    String dataElement = converterUnimarc.convert(bytes);
    dataElement = dataElement.replaceAll("\u0088", "");
    dataElement = dataElement.replaceAll("\u0089", "");

    // NOTE(review): the two patterns below had been reduced to "[^<]*.*" and "" (which makes group(1) fail for
    // every input); they are restored to the <U+XXXX> placeholder form -- confirm against the converter's
    // actual output.
    if (dataElement.matches("[^<]*<U[+][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]>.*")) {
        final Pattern pattern = Pattern.compile("<U[+]([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])>");
        final Matcher matcher = pattern.matcher(dataElement);
        final StringBuilder newElement = new StringBuilder();
        int prevEnd = 0;

        while (matcher.find()) {
            newElement.append(dataElement.substring(prevEnd, matcher.start()));
            newElement.append(getChar(matcher.group(1)));
            prevEnd = matcher.end();
        }
        newElement.append(dataElement.substring(prevEnd));
        dataElement = newElement.toString();
    }
    return dataElement;
}
/**
 * Turns a hexadecimal character code into a one character string.
 *
 * @param charCodePoint The character code in hexadecimal notation
 * @return A string holding the single char obtained by narrowing the parsed value to a <code>char</code>
 * @throws NumberFormatException If the argument is not a valid hexadecimal integer
 */
private static String getChar(final String charCodePoint) {
    final char decoded = (char) Integer.parseInt(charCodePoint, 16);
    return String.valueOf(decoded);
}
}