All downloads are free. Search and download functionality uses the official Maven repository.

org.marc4j.MarcXmlWriter Maven / Gradle / Ivy

/**
 * Copyright (C) 2004 Bas Peters
 *
 * This file is part of MARC4J
 *
 * MARC4J is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * MARC4J is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with MARC4J; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.marc4j;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.marc4j.converter.CharConverter;
import org.marc4j.marc.ControlField;
import org.marc4j.marc.DataField;
import org.marc4j.marc.Leader;
import org.marc4j.marc.MarcFactory;
import org.marc4j.marc.Record;
import org.marc4j.marc.Subfield;
import org.marc4j.util.Normalizer;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import info.freelibrary.marc4j.converter.impl.AnselToUnicode;

/**
 * Class for writing MARC record objects in MARCXML format. This class outputs a SAX event stream to the given
 * {@link java.io.OutputStream}  or {@link javax.xml.transform.Result} object. It can be used in a SAX
 * pipeline to post-process the result. By default this class uses a null transform. It is strongly recommended to use
 * a dedicated XML serializer.
 * 

*

* This class requires a JAXP compliant XML parser and XSLT processor. The underlying SAX2 parser should be namespace * aware. *

*

* The following example reads a file with MARC records and writes MARCXML records in UTF-8 encoding to the console: *

*

*

 *
 *      InputStream input = new FileInputStream("input.mrc")
 *      MarcReader reader = new MarcStreamReader(input);
 *
 *      MarcWriter writer = new MarcXmlWriter(System.out, true);
 *      while (reader.hasNext()) {
 *          Record record = reader.next();
 *          writer.write(record);
 *      }
 *      writer.close();
 *
 * 
*

*

* To perform a character conversion like MARC-8 to UCS/Unicode register a CharConverter: *

*

*

 * writer.setConverter(new AnselToUnicode());
 * 
*

*

* In addition you can perform Unicode normalization. This is for example not done by the MARC-8 to UCS/Unicode * converter. With Unicode normalization text is transformed into the canonical composed form. For example * "a�bc" is normalized to "�bc". To perform normalization set Unicode normalization to true: *

*

*

 * writer.setUnicodeNormalization(true);
 * 
*

*

* Please note that it's not garanteed to work if you try to convert normalized Unicode back to MARC-8 encoding using * {@link info.freelibrary.marc4j.converter.impl.UnicodeToAnsel}. *

*

* This class provides very basic formatting options. For more advanced options create an instance of this class with * a {@link javax.xml.transform.sax.SAXResult} containing a {@link org.xml.sax.ContentHandler} derived from * a dedicated XML serializer. *

*

*

* The following example uses org.apache.xml.serialize.XMLSerializer to write MARC records to XML using * MARC-8 to UCS/Unicode conversion and Unicode normalization: *

*

*

 *
 *      InputStream input = new FileInputStream("input.mrc")
 *      MarcReader reader = new MarcStreamReader(input);
 *
 *      OutputFormat format = new OutputFormat("xml","UTF-8", true);
 *      OutputStream out = new FileOutputStream("output.xml");
 *      XMLSerializer serializer = new XMLSerializer(out, format);
 *      Result result = new SAXResult(serializer.asContentHandler());
 *
 *      MarcXmlWriter writer = new MarcXmlWriter(result);
 *      writer.setConverter(new AnselToUnicode());
 *      while (reader.hasNext()) {
 *          Record record = reader.next();
 *          writer.write(record);
 *      }
 *      writer.close();
 *
 * 
*

*

* You can post-process the result using a Source object pointing to a stylesheet resource and a * Result object to hold the transformation result tree. The example below converts MARC to MARCXML and * transforms the result tree to MODS using the stylesheet provided by The Library of Congress: *

*

*

 *
 *      String stylesheetUrl = "http://www.loc.gov/standards/mods/v3/MARC21slim2MODS3.xsl";
 *      Source stylesheet = new StreamSource(stylesheetUrl);
 *
 *      Result result = new StreamResult(System.out);
 *
 *      InputStream input = new FileInputStream("input.mrc")
 *      MarcReader reader = new MarcStreamReader(input);
 *      MarcXmlWriter writer = new MarcXmlWriter(result, stylesheet);
 *      writer.setConverter(new AnselToUnicode());
 *      while (reader.hasNext()) {
 *          Record record = (Record) reader.next();
 *          writer.write(record);
 *      }
 *      writer.close();
 *
 * 
*

*

* It is also possible to write the result into a DOM Node: *

*

*

 *
 *      InputStream input = new FileInputStream("input.mrc")
 *      MarcReader reader = new MarcStreamReader(input);
 *      DOMResult result = new DOMResult();
 *      MarcXmlWriter writer = new MarcXmlWriter(result);
 *      writer.setConverter(new AnselToUnicode());
 *      while (reader.hasNext()) {
 *          Record record = (Record) reader.next();
 *          writer.write(record);
 *      }
 *      writer.close();
 *
 *      Document doc = (Document) result.getNode();
 *
 * 
* * @author Bas Peters */ public class MarcXmlWriter implements MarcWriter { public static final String CONTROL_FIELD = "controlfield"; public static final String DATA_FIELD = "datafield"; public static final String SUBFIELD = "subfield"; public static final String COLLECTION = "collection"; public static final String RECORD = "record"; public static final String LEADER = "leader"; private boolean indent = false; private TransformerHandler handler = null; private Writer writer = null; /** * Character encoding. Default is UTF-8. */ private String encoding = "UTF8"; private CharConverter converter = null; private boolean normalize = false; private MarcXmlWriter() { } /** * Constructs an instance with the specified output stream. *

* The default character encoding for UTF-8 is used. * * @throws MarcException */ public MarcXmlWriter(final OutputStream out) { this(out, false); } /** * Constructs an instance with the specified output stream and indentation. *

* The default character encoding for UTF-8 is used. * * @throws MarcException */ public MarcXmlWriter(final OutputStream out, final boolean indent) { this(out, "UTF8", indent); } /** * Constructs an instance with the specified output stream and character encoding. * * @throws MarcException */ public MarcXmlWriter(final OutputStream out, final String encoding) { this(out, encoding, false); } /** * Constructs an instance with the specified output stream, character encoding and indentation. * * @throws MarcException */ public MarcXmlWriter(final OutputStream out, final String encoding, final boolean indent) { this.encoding = encoding; if (out == null) { throw new NullPointerException("null OutputStream"); } if (this.encoding == null) { throw new NullPointerException("null encoding"); } try { setIndent(indent); writer = new OutputStreamWriter(out, encoding); writer = new BufferedWriter(writer); setHandler(new StreamResult(writer), null); } catch (final UnsupportedEncodingException details) { throw new MarcException(details.getMessage(), details); } writeStartDocument(); } /** * Constructs an instance with the specified result. * * @param result * @throws SAXException */ public MarcXmlWriter(final Result result) { if (result == null) { throw new NullPointerException("null Result"); } setHandler(result, null); writeStartDocument(); } /** * Constructs an instance with the specified stylesheet location and result. * * @param result * @throws SAXException */ public MarcXmlWriter(final Result result, final String stylesheetUrl) { this(result, new StreamSource(stylesheetUrl)); } /** * Constructs an instance with the specified stylesheet source and result. 
* * @param result * @throws SAXException */ public MarcXmlWriter(final Result result, final Source stylesheet) { if (stylesheet == null) { throw new NullPointerException("null Source"); } if (result == null) { throw new NullPointerException("null Result"); } setHandler(result, stylesheet); writeStartDocument(); } /** * Closes the writer. */ @Override public void close() { writeEndDocument(); try { if (writer != null) { writer.write("\n"); writer.close(); } } catch (final IOException details) { throw new MarcException(details.getMessage(), details); } } /** * Returns the character converter. * * @return CharConverter the character converter */ @Override public CharConverter getConverter() { return converter; } /** * Sets the character converter. * * @param converter the character converter */ @Override public void setConverter(final CharConverter converter) { this.converter = converter; } /** * If set to true this writer will perform Unicode normalization on data elements using normalization form C * (NFC). The default is false. *

* The implementation used is ICU4J 2.6. This version is based on Unicode 4.0. * * @param normalize true if this writer performs Unicode normalization, false otherwise */ public void setUnicodeNormalization(final boolean normalize) { this.normalize = normalize; } /** * Returns true if this writer will perform Unicode normalization, false otherwise. * * @return boolean - true if this writer performs Unicode normalization, false otherwise. */ public boolean getUnicodeNormalization() { return normalize; } protected void setHandler(final Result result, final Source stylesheet) throws MarcException { try { final TransformerFactory factory = TransformerFactory.newInstance(); if (!factory.getFeature(SAXTransformerFactory.FEATURE)) { throw new UnsupportedOperationException("SAXTransformerFactory is not supported"); } final SAXTransformerFactory saxFactory = (SAXTransformerFactory) factory; if (stylesheet == null) { handler = saxFactory.newTransformerHandler(); } else { handler = saxFactory.newTransformerHandler(stylesheet); } handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.setResult(result); } catch (final Exception details) { throw new MarcException(details.getMessage(), details); } } /** * Writes the root start tag to the result. * * @throws SAXException */ protected void writeStartDocument() { try { final AttributesImpl atts = new AttributesImpl(); handler.startDocument(); handler.startPrefixMapping(Constants.MARCXML_NS_PREFIX, Constants.MARCXML_NS_URI); handler.startElement(Constants.MARCXML_NS_URI, COLLECTION, Constants.MARCXML_NS_PREFIX + ":" + COLLECTION, atts); } catch (final SAXException details) { throw new MarcException("SAX error occured while writing start document", details); } } /** * Writes the root end tag to the result. 
* * @throws SAXException */ protected void writeEndDocument() { try { if (indent) { handler.ignorableWhitespace("\n".toCharArray(), 0, 1); } handler.endElement(Constants.MARCXML_NS_URI, COLLECTION, Constants.MARCXML_NS_PREFIX + ":" + COLLECTION); handler.endPrefixMapping(Constants.MARCXML_NS_URI); handler.endDocument(); } catch (final SAXException e) { throw new MarcException("SAX error occured while writing end document", e); } } /** * Writes a Record object to the result. * * @param record - the Record object * @throws SAXException */ @Override public void write(final Record record) { try { toXml(record); } catch (final SAXException e) { throw new MarcException("SAX error occured while writing record", e); } } /** * A convenience method that writes a single Record object to the result. The assumption is the record needs to be * converted from Ansel to Unicode and that the record doesn't need to be indented. * * @param record The Record to write * @param stream The XML output stream * @throws SAXException */ public static void writeSingleRecord(final Record record, final OutputStream stream) throws IOException { writeSingleRecord(record, stream, true, false); } /** * A convenience method that writes a single Record object to the result. The assumption is the record needs to be * converted from Ansel to Unicode. * * @param record The Record to write * @param stream The XML output stream * @param indent If the XML output should be indented * @throws SAXException */ public static void writeSingleRecord(final Record record, final OutputStream stream, final boolean indent) throws IOException { writeSingleRecord(record, stream, true, indent); } /** * A convenience method that writes a single Record object to the result. 
* * @param record The Record to write * @param stream The XML output stream * @param encode If the text should be converted from Ansel to Unicode * @param indent Whether the output XML should be indented * @throws SAXException */ public static void writeSingleRecord(final Record record, final OutputStream stream, final boolean encode, final boolean indent) throws IOException { try { final BufferedWriter out = new BufferedWriter(new OutputStreamWriter(stream, "UTF-8")); final MarcXmlWriter writer = new MarcXmlWriter(); if (encode) { writer.setConverter(new AnselToUnicode()); } writer.setIndent(indent); writer.setUnicodeNormalization(true); writer.setHandler(new StreamResult(out), null); writer.handler.startDocument(); writer.handler.startPrefixMapping(Constants.MARCXML_NS_PREFIX, Constants.MARCXML_NS_URI); writer.toXml(record); if (indent) { writer.handler.ignorableWhitespace("\n".toCharArray(), 0, 1); } writer.handler.endPrefixMapping(Constants.MARCXML_NS_URI); writer.handler.endDocument(); out.write("\n"); out.close(); } catch (final SAXException details) { throw new MarcException("SAX error occured while writing record", details); } catch (final UnsupportedEncodingException details) { throw new MarcException(details.getMessage(), details); } } /** * Returns true if indentation is active, false otherwise. * * @return boolean */ public boolean hasIndent() { return indent; } /** * Activates or deactivates indentation. Default value is false. 
* * @param indent */ public void setIndent(final boolean indent) { this.indent = indent; } protected void toXml(final Record record) throws SAXException { if (!MarcFactory.newInstance().validateRecord(record)) { throw new MarcException("Marc record didn't validate"); } char temp[]; AttributesImpl atts = new AttributesImpl(); if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 3); } handler.startElement(Constants.MARCXML_NS_URI, RECORD, Constants.MARCXML_NS_PREFIX + ":" + RECORD, atts); if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 5); } handler.startElement(Constants.MARCXML_NS_URI, LEADER, Constants.MARCXML_NS_PREFIX + ":" + LEADER, atts); final Leader leader = record.getLeader(); temp = getDataElement(leader.toString()); handler.characters(temp, 0, temp.length); handler.endElement(Constants.MARCXML_NS_URI, LEADER, Constants.MARCXML_NS_PREFIX + ":" + LEADER); for (final ControlField field : record.getControlFields()) { atts = new AttributesImpl(); atts.addAttribute("", "tag", "tag", "CDATA", getDataElementString(field.getTag())); if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 5); } handler.startElement(Constants.MARCXML_NS_URI, CONTROL_FIELD, Constants.MARCXML_NS_PREFIX + ":" + CONTROL_FIELD, atts); temp = getDataElement(field.getData()); handler.characters(temp, 0, temp.length); handler.endElement(Constants.MARCXML_NS_URI, CONTROL_FIELD, Constants.MARCXML_NS_PREFIX + ":" + CONTROL_FIELD); } for (final DataField field : record.getDataFields()) { atts = new AttributesImpl(); atts.addAttribute("", "tag", "tag", "CDATA", getDataElementString(field.getTag())); atts.addAttribute("", "ind1", "ind1", "CDATA", getDataElementString(String.valueOf(field .getIndicator1()))); atts.addAttribute("", "ind2", "ind2", "CDATA", getDataElementString(String.valueOf(field .getIndicator2()))); if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 5); } handler.startElement(Constants.MARCXML_NS_URI, DATA_FIELD, 
Constants.MARCXML_NS_PREFIX + ":" + DATA_FIELD, atts); for (final Subfield subfield : field.getSubfields()) { atts = new AttributesImpl(); atts.addAttribute("", "code", "code", "CDATA", getDataElementString(String.valueOf(subfield .getCode()))); if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 7); } handler.startElement(Constants.MARCXML_NS_URI, SUBFIELD, Constants.MARCXML_NS_PREFIX + ":" + SUBFIELD, atts); temp = getDataElement(subfield.getData()); handler.characters(temp, 0, temp.length); handler.endElement(Constants.MARCXML_NS_URI, SUBFIELD, Constants.MARCXML_NS_PREFIX + ":" + SUBFIELD); } if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 5); } handler.endElement(Constants.MARCXML_NS_URI, DATA_FIELD, Constants.MARCXML_NS_PREFIX + ":" + DATA_FIELD); } if (indent) { handler.ignorableWhitespace("\n ".toCharArray(), 0, 3); } handler.endElement(Constants.MARCXML_NS_URI, RECORD, Constants.MARCXML_NS_PREFIX + ":" + RECORD); } protected String getDataElementString(final String data) { String dataElement = null; if (converter == null) { dataElement = data; } else { dataElement = converter.convert(data); } if (normalize) { dataElement = Normalizer.normalize(dataElement, Normalizer.NFC); } return dataElement; } protected char[] getDataElement(final String data) { return getDataElementString(data).toCharArray(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy