// com.marklogic.mapreduce.DOMDocument Maven / Gradle / Ivy
/*
* Copyright (c) 2020 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.mapreduce;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import com.marklogic.dom.NodeImpl;
import com.marklogic.dom.TextImpl;
import com.marklogic.tree.ExpandedTree;
import com.marklogic.tree.NodeKind;
import com.marklogic.xcc.Content;
import com.marklogic.xcc.ContentCreateOptions;
import com.marklogic.xcc.ContentFactory;
/**
 * A {@link ForestDocument} containing a DOM document node
 * representation of a document as stored in the expanded tree
 * cache of a forest on disk.
 *
 * <p>
 * You cannot use this class to modify a document. However, you
 * can create a modifiable copy of the underlying document
 * using {@link com.marklogic.dom.DocumentImpl} on the
 * document returned by {@link #getDocument}.
 * </p>
 *
 * <p>
 * String and byte-array views of the content are always produced in
 * UTF-8, matching the encoding requested from the XML serializer.
 * </p>
 *
 * @author jchen
 */
public class DOMDocument extends ForestDocument {
    public static final Log LOG = LogFactory.getLog(DOMDocument.class);

    /** Lazily created factory shared by all instances; see {@link #getTransformerFactory()}. */
    private static TransformerFactory transformerFactory = null;

    /** Document node taken from index 0 of the expanded tree. */
    private Document doc;

    /** Node kind of the document's root, as reported by the expanded tree. */
    private byte rootNodeKind;

    /**
     * Returns the shared {@link TransformerFactory}, creating it on first use.
     * The method is synchronized so the factory is created only once.
     *
     * @return the shared transformer factory
     */
    private static synchronized TransformerFactory getTransformerFactory() {
        if (transformerFactory == null) {
            transformerFactory = TransformerFactory.newInstance();
        }
        return transformerFactory;
    }

    /**
     * Creates an empty instance whose state is expected to be populated
     * later through {@link #readFields(DataInput)}.
     */
    public DOMDocument() {
    }

    /**
     * Creates an instance backed by the given expanded tree.
     *
     * @param tree expanded tree whose node 0 is the document node
     */
    public DOMDocument(ExpandedTree tree) {
        doc = (Document) tree.node(0);
        rootNodeKind = tree.rootNodeKind();
    }

    /**
     * Returns the underlying DOM document node.
     *
     * @return the document node, or {@code null} if none has been set
     */
    public Document getDocument() {
        return doc;
    }

    /**
     * Reads the superclass fields followed by an {@link ExpandedTree}, and
     * rebinds this instance to that tree's document node and root node kind.
     *
     * @param in input to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        super.readFields(in);
        ExpandedTree tree = new ExpandedTree();
        tree.readFields(in);
        doc = (Document) tree.node(0);
        rootNodeKind = tree.rootNodeKind();
    }

    /**
     * Writes the superclass fields followed by the expanded tree that backs
     * the document node.
     *
     * @param out output to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        super.write(out);
        ((NodeImpl) doc).getExpandedTree().write(out);
    }

    /**
     * Serializes a DOM node to UTF-8 encoded XML without an XML declaration.
     *
     * @param node node to serialize
     * @return stream holding the UTF-8 encoded bytes
     * @throws TransformerException if the transformation fails
     */
    static ByteArrayOutputStream serialize(Node node)
            throws TransformerException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        Result rslt = new StreamResult(bos);
        Source src = new DOMSource(node);
        // NOTE(review): only factory creation is synchronized; newTransformer()
        // is called on the shared factory without a lock - confirm this is
        // acceptable for the TransformerFactory implementation in use.
        Transformer transformer = getTransformerFactory().newTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION,
                "yes");
        transformer.transform(src, rslt);
        return bos;
    }

    /**
     * Returns the first child of the document as a text node when the root
     * node kind is {@link NodeKind#TEXT}.
     *
     * @return the text node, or {@code null} if the root is not a text node
     *         or the document has no first child
     */
    private TextImpl rootTextNode() {
        if (rootNodeKind != NodeKind.TEXT) {
            return null;
        }
        return (TextImpl) doc.getFirstChild();
    }

    /**
     * Returns the document content as a string: the text content for a text
     * document, otherwise the serialized XML.
     *
     * @return the content, or {@code null} if serialization fails (the error
     *         is logged)
     */
    @Override
    public String toString() {
        TextImpl textNode = rootTextNode();
        if (textNode != null) {
            return textNode.getTextContent();
        }
        try {
            ByteArrayOutputStream bos = serialize(doc);
            // serialize() emits UTF-8, so decode with UTF-8 rather than the
            // platform default charset to avoid corrupting non-ASCII content.
            return new String(bos.toByteArray(), StandardCharsets.UTF_8);
        } catch (TransformerException ex) {
            LOG.error("Error serializing document", ex);
        }
        return null;
    }

    /**
     * Returns the document content as UTF-8 encoded bytes: the text content
     * for a text document, otherwise the serialized XML.
     *
     * @return the encoded content, or {@code null} if serialization fails
     *         (the error is logged)
     */
    @Override
    public byte[] getContentAsByteArray() {
        TextImpl textNode = rootTextNode();
        if (textNode != null) {
            // Encode explicitly as UTF-8 so text documents use the same
            // encoding as serialized XML, independent of the platform default.
            return textNode.getTextContent().getBytes(StandardCharsets.UTF_8);
        }
        try {
            ByteArrayOutputStream bos = serialize(doc);
            return bos.toByteArray();
        } catch (TransformerException ex) {
            LOG.error("Error serializing document", ex);
        }
        return null;
    }

    /**
     * Wraps the document node in a {@link MarkLogicNode}.
     *
     * @return a new node wrapper around the document
     */
    @Override
    public MarkLogicNode getContentAsMarkLogicNode() {
        return new MarkLogicNode(doc);
    }

    /**
     * Returns the string form of the content, as produced by
     * {@link #toString()}, wrapped in a Hadoop {@link Text}.
     *
     * @return the content as text
     */
    @Override
    public Text getContentAsText() {
        return new Text(toString());
    }

    /**
     * Maps the root node kind to a content type: element, comment and
     * processing-instruction roots are XML; a text root is TEXT.
     *
     * @return the content type of this document
     * @throws UnsupportedOperationException for any other root node kind
     */
    @Override
    public ContentType getContentType() {
        if (rootNodeKind == NodeKind.ELEM ||
                rootNodeKind == NodeKind.COMMENT ||
                rootNodeKind == NodeKind.PI) {
            return ContentType.XML;
        } else if (rootNodeKind == NodeKind.TEXT) {
            return ContentType.TEXT;
        } else {
            throw new UnsupportedOperationException("Unknown node kind: " +
                    rootNodeKind);
        }
    }

    /**
     * Returns the content as a string; equivalent to {@link #toString()}.
     *
     * @return the content, or {@code null} if serialization fails
     * @throws UnsupportedEncodingException declared by the overridden
     *         signature; not thrown by this implementation
     */
    @Override
    public String getContentAsString() throws UnsupportedEncodingException {
        return toString();
    }

    /**
     * Builds a {@link Content} for the given URI from the document node.
     * When any of the copy flags is set, the options are first updated via
     * {@code setContentOptions}.
     *
     * @param uri target document URI
     * @param options content creation options, possibly updated in place
     * @param copyCollections whether to copy collections into the options
     * @param copyMetadata whether to copy metadata into the options
     * @param copyQuality whether to copy quality into the options
     * @return the new content
     * @throws IOException declared by the overridden signature
     */
    @Override
    public Content createContent(String uri, ContentCreateOptions options,
            boolean copyCollections, boolean copyMetadata, boolean copyQuality)
            throws IOException {
        if (copyCollections || copyMetadata || copyQuality) {
            setContentOptions(options, copyCollections, copyMetadata,
                    copyQuality);
        }
        return ContentFactory.newContent(uri,
                getContentAsMarkLogicNode().get(), options);
    }
}