com.marklogic.contentpump.AggregateXMLReader Maven / Gradle / Ivy
/*
* Copyright (c) 2022 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.contentpump;
import java.io.IOException;
import java.util.HashMap;
import java.util.Set;
import java.util.Stack;
import javax.xml.stream.Location;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.marklogic.contentpump.utilities.FileIterator;
import com.marklogic.contentpump.utilities.IdGenerator;
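/**
* Reader for AggregateXMLInputFormat. Streams an aggregate XML file with a
* StAX {@link XMLStreamReader} and hands back each record element as a
* separate serialized XML value.
*
* @author ali
*/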
public class AggregateXMLReader extends ImportRecordReader {
public static final Log LOG = LogFactory.getLog(AggregateXMLReader.class);
/**
 * Map key / prefix value that stands for the default (unprefixed)
 * namespace. It is {@code null}, so identity comparisons against it in
 * this class are effectively null checks.
 */
public static String DEFAULT_NS = null;
/** Element depth below the document root (the root itself is not counted). */
private int currDepth = 0;
/** StAX reader over the file currently being read; null when unavailable. */
protected XMLStreamReader xmlSR;
/** Local name of the record element; adopted from the first element seen when not configured. */
protected String recordName;
/** Namespace URI of the record element, or null for no namespace. */
protected String recordNamespace;
/** Depth of the most recently started record element; MAX_VALUE until one is seen. */
private int recordDepth = Integer.MAX_VALUE;
/** Accumulates the serialized XML of the record being read. */
private StringBuilder buffer;
/** Configured URI id: an element local name, or "@" + attribute local name. */
protected String idName;
/** Id text collected for the current record; null until one is found. */
protected String currentId = null;
/** Result of the last end-element handling; false means a record was completed. */
private boolean keepGoing = true;
/**
 * Namespace declarations in scope outside the current record, keyed by
 * prefix ({@link #DEFAULT_NS} for the default namespace). Each value is a
 * stack so that nested re-declarations of a prefix can be undone.
 */
protected HashMap<String, Stack<String>> nameSpaces =
    new HashMap<>();
/** True until the first START_ELEMENT (the document root) has been consumed. */
protected boolean startOfRecord = true;
/** False once the input has been exhausted. */
protected boolean hasNext = true;
/** True while the reader is inside a record element. */
private boolean newDoc = false;
/** True until the id element of the current record has been used to set the key. */
private boolean newUriId = false;
/** True when no URI id is configured and ids are generated instead. */
protected boolean useAutomaticId = false;
protected String mode;
/** Generator for automatic ids; created per file when useAutomaticId is set. */
protected IdGenerator idGen;
/** Factory used to create the StAX reader. */
protected XMLInputFactory f;
/** Underlying input stream of the file currently being read. */
protected FSDataInputStream fInputStream;
/** Progress bookkeeping: start offset, last known character offset, and length. */
protected long start;
protected long pos;
/** Set once the parser's int character offset has overflowed; pos is frozen afterwards. */
protected boolean overflow;
protected long end;
/** When true, END_DOCUMENT ends this reader instead of advancing to the next file. */
protected boolean compressed = false;
public AggregateXMLReader() {
}
/**
 * Releases the StAX reader and then the underlying input stream.
 * A failure while closing the StAX reader is logged rather than rethrown,
 * so the input stream is still closed afterwards.
 *
 * @throws IOException if closing the input stream fails
 */
@Override
public void close() throws IOException {
    if (null != xmlSR) {
        try {
            xmlSR.close();
        } catch (XMLStreamException closeFailure) {
            LOG.error("Error closing stream reader", closeFailure);
        }
    }
    if (null == fInputStream) {
        return;
    }
    fInputStream.close();
}
/**
 * Reports how far through the current input the reader is.
 *
 * @return 1 once the input is exhausted or the position has passed the
 *         end; 0 for an empty input that has not finished yet; otherwise
 *         the fraction of the input consumed so far
 */
@Override
public float getProgress() throws IOException, InterruptedException {
    if (!hasNext) {
        return 1;
    }
    if (pos > end) {
        return 1;
    }
    long length = end - start;
    if (length <= 0) {
        // Empty input: dividing would yield NaN (0/0), so report no progress.
        return 0;
    }
    return ((float) (pos - start)) / length;
}
/**
 * Prepares the reader for a split: loads configuration, creates the StAX
 * factory, resolves the split's path and opens the first file to read.
 * When the path is a directory, a FileIterator is created over it and the
 * first split it yields is opened instead.
 *
 * @param inSplit the split to read; cast to FileSplit
 * @param context task context supplying the configuration
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    initConfig(context);
    initAggConf(context);
    // NOTE(review): the factory is used with its default settings here.
    f = XMLInputFactory.newInstance();
    setFile(((FileSplit) inSplit).getPath());
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus pathStatus = fs.getFileStatus(file);
    InputSplit firstSplit = inSplit;
    if (pathStatus.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        firstSplit = iterator.next();
    }
    initStreamReader(firstSplit);
}
/**
 * Opens the file behind the given split and creates a StAX reader on it.
 * Progress bookkeeping is reset first. If the file cannot be opened the
 * method returns without touching the reader; if the StAX reader cannot
 * be created the error is logged and the method carries on.
 *
 * @param inSplit the split to open; cast to FileSplit when automatic ids
 *        are in use
 */
protected void initStreamReader(InputSplit inSplit) throws IOException,
        InterruptedException {
    start = 0;
    end = inSplit.getLength();
    overflow = false;
    fInputStream = openFile(inSplit, true);
    if (fInputStream != null) {
        try {
            xmlSR = f.createXMLStreamReader(fInputStream, encoding);
        } catch (XMLStreamException parseFailure) {
            LOG.error("ParseError");
            LOG.error(parseFailure.getMessage(), parseFailure);
        }
        if (useAutomaticId) {
            // Automatic ids are prefixed with "<file path>-<split start>".
            String idPrefix = file.toUri().getPath() + "-"
                + ((FileSplit) inSplit).getStart();
            idGen = new IdGenerator(idPrefix);
        }
    }
}
/**
 * Reads the aggregate-specific settings from the job configuration: the
 * URI id name and the record element name and namespace. Automatic id
 * generation is switched on when no URI id is configured.
 *
 * @param context task context supplying the configuration
 */
protected void initAggConf(TaskAttemptContext context) {
    final Configuration jobConf = context.getConfiguration();
    idName = jobConf.get(ConfigConstants.CONF_INPUT_URI_ID);
    if (null == idName) {
        useAutomaticId = true;
    }
    recordName =
        jobConf.get(ConfigConstants.CONF_AGGREGATE_RECORD_ELEMENT);
    recordNamespace =
        jobConf.get(ConfigConstants.CONF_AGGREGATE_RECORD_NAMESPACE);
}
/**
 * Appends text to the record buffer, but only while inside a record
 * (a record has started and the current depth is at or below it).
 * The buffer is created on first use regardless of whether text is kept.
 *
 * @param str the already-serialized text to append
 */
private void write(String str) {
    if (null == buffer) {
        buffer = new StringBuilder();
    }
    final boolean insideRecord = newDoc && currDepth >= recordDepth;
    if (!insideRecord) {
        return;
    }
    buffer.append(str);
}
/**
 * Records the namespace declarations of the current start element so they
 * can be re-declared on record roots. Declarations made deeper than the
 * record element are ignored. Each prefix keeps a stack of URIs so that a
 * nested re-declaration can later be undone.
 */
protected void copyNameSpaceDecl() {
    if (recordDepth < currDepth) {
        return;
    }
    int stop = xmlSR.getNamespaceCount();
    if (stop <= 0) {
        return;
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("checking namespace declarations");
    }
    for (int i = 0; i < stop; i++) {
        String nsDeclPrefix = xmlSR.getNamespacePrefix(i);
        String nsDeclUri = xmlSR.getNamespaceURI(i);
        if (LOG.isTraceEnabled()) {
            LOG.trace(nsDeclPrefix + ":" + nsDeclUri);
        }
        // One lookup per prefix; create the stack the first time it is seen.
        Stack<String> uris = nameSpaces.get(nsDeclPrefix);
        if (uris == null) {
            uris = new Stack<>();
            nameSpaces.put(nsDeclPrefix, uris);
        }
        uris.push(nsDeclUri);
    }
}
/**
 * Undoes the namespace declarations of the element that is ending by
 * popping one URI per declared prefix. Elements deeper than the record
 * element are ignored. A prefix that was never recorded is reported with
 * a warning; a prefix whose stack is already empty is left alone.
 */
protected void removeNameSpaceDecl() {
    if (currDepth > recordDepth) {
        return;
    }
    final int declCount = xmlSR.getNamespaceCount();
    if (declCount <= 0) {
        return;
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("checking namespace declarations");
    }
    for (int idx = 0; idx < declCount; idx++) {
        final String declared = xmlSR.getNamespacePrefix(idx);
        if (!nameSpaces.containsKey(declared)) {
            LOG.warn("Namespace " + declared + " not in scope");
            continue;
        }
        if (nameSpaces.get(declared).isEmpty()) {
            continue;
        }
        nameSpaces.get(declared).pop();
    }
}
/**
 * Handles a START_ELEMENT event for an element below the document root.
 * <p>
 * Decides whether the element opens a new record, records its namespace
 * declarations, and appends the serialized start tag (with namespace
 * declarations and attributes) to the record buffer. A record root gets
 * every namespace currently in scope re-declared on it so the record is
 * self-contained. When the element is the configured URI id element, its
 * text and end tag are consumed here as well so the id can be captured.
 *
 * @throws XMLStreamException if the id element does not start with text,
 *         is not followed directly by its end tag, or the parser fails
 */
private void processStartElement() throws XMLStreamException {
    String name = xmlSR.getLocalName();
    String namespace = xmlSR.getNamespaceURI();
    if (LOG.isTraceEnabled()) {
        LOG.trace("Start-tag: " + xmlSR.getName() + " at depth " + currDepth);
    }
    if ("".equals(namespace)) {
        namespace = null;
    }
    String prefix = xmlSR.getPrefix();
    if (namespace == null) {
        String lookupPrefix = "".equals(prefix) ? DEFAULT_NS : prefix;
        Stack<String> inScope = nameSpaces.get(lookupPrefix);
        // A prefix whose declarations were all popped stays mapped to an
        // empty stack; peek() on it would throw EmptyStackException.
        if (inScope != null && !inScope.isEmpty()) {
            namespace = inScope.peek();
        }
    }
    int attrCount = xmlSR.getAttributeCount();
    currDepth++;
    Location loc = xmlSR.getLocation();

    boolean isNewRootStart;
    if (recordName == null) {
        // No record element configured: adopt this element as the record.
        recordName = name;
        if (recordNamespace == null) {
            recordNamespace = namespace;
        }
        isNewRootStart = true;
    } else {
        // record element name may not nest
        boolean sameNamespace = (recordNamespace == null)
            ? namespace == null
            : recordNamespace.equals(namespace);
        isNewRootStart = name.equals(recordName) && sameNamespace;
    }
    if (isNewRootStart) {
        recordDepth = currDepth;
        newDoc = true;
        newUriId = true;
        if (useAutomaticId) {
            setKey(idGen.incrementAndGet(), loc.getLineNumber(),
                loc.getColumnNumber(), true);
        }
    }

    copyNameSpaceDecl();
    if (!newDoc) {
        // Outside any record: nothing to serialize.
        return;
    }

    StringBuilder sb = new StringBuilder();
    sb.append("<");
    if (prefix != null && !prefix.equals("")) {
        sb.append(prefix + ":" + name);
    } else {
        sb.append(name);
    }
    if (isNewRootStart) {
        // add namespaces declared into the new root element
        for (String k : nameSpaces.keySet()) {
            Stack<String> uris = nameSpaces.get(k);
            if (uris == null || uris.isEmpty()) {
                // Declaration already went out of scope: skip it.
                continue;
            }
            String v = uris.peek();
            // DEFAULT_NS is null, so the identity test is a null check.
            if (DEFAULT_NS == k || "".equals(k)) {
                sb.append(" xmlns=\"" + v + "\"");
            } else {
                sb.append(" xmlns:" + k + "=\"" + v + "\"");
            }
        }
    } else {
        // add new namespace declaration into current element
        int stop = xmlSR.getNamespaceCount();
        if (stop > 0 && LOG.isTraceEnabled()) {
            LOG.trace("checking namespace declarations");
        }
        for (int i = 0; i < stop; i++) {
            String nsDeclPrefix = xmlSR.getNamespacePrefix(i);
            String nsDeclUri = xmlSR.getNamespaceURI(i);
            if (LOG.isTraceEnabled()) {
                LOG.trace(nsDeclPrefix + ":" + nsDeclUri);
            }
            if (DEFAULT_NS == nsDeclPrefix || "".equals(nsDeclPrefix)) {
                sb.append(" xmlns=\"" + nsDeclUri + "\"");
            } else {
                sb.append(" xmlns:" + nsDeclPrefix + "=\"" + nsDeclUri
                    + "\"");
            }
        }
    }
    for (int i = 0; i < attrCount; i++) {
        // aPrefix is null if i is invalid (out of boundary)
        // aPrefix is empty if the attribute has no prefix
        String aPrefix = xmlSR.getAttributePrefix(i);
        String aName = xmlSR.getAttributeLocalName(i);
        String aValue = StringEscapeUtils.escapeXml10(xmlSR
            .getAttributeValue(i));
        sb.append(" " + (null == aPrefix || "".equals(aPrefix) ?
            "" : (aPrefix + ":")) + aName
            + "=\"" + aValue + "\"");
        // An idName of the form "@attr" selects an attribute as the id;
        // the first matching attribute in the record wins.
        if (!useAutomaticId
            && newDoc
            && ("@" + aName).equals(idName)
            && currentId == null) {
            currentId = aValue;
            setKey(aValue, loc.getLineNumber(), loc.getColumnNumber(),
                true);
        }
    }
    sb.append(">");

    // allow for repeated idName elements: first one wins
    // NOTE: idName is namespace-insensitive
    if (!useAutomaticId && newDoc && name.equals(idName)) {
        int nextToken = xmlSR.next();
        if (nextToken != XMLStreamConstants.CHARACTERS) {
            throw new XMLStreamException("badly formed xml or " + idName
                + " is not a simple node: at" + xmlSR.getLocation());
        }
        // The parser may deliver the text in several CHARACTERS events.
        do {
            String idStr = StringEscapeUtils.escapeXml10(xmlSR.getText());
            if (currentId == null) {
                currentId = "";
            }
            currentId += idStr;
            sb.append(idStr);
        } while ((nextToken = xmlSR.next()) == XMLStreamConstants.CHARACTERS);
        if (newUriId) {
            setKey(currentId, loc.getLineNumber(), loc.getColumnNumber(),
                true);
            newUriId = false;
        } else if (LOG.isDebugEnabled()) {
            LOG.debug("Duplicate URI_ID match found: key = " + key);
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("URI_ID: " + currentId);
        }
        // advance to the END_ELEMENT
        if (nextToken != XMLStreamConstants.END_ELEMENT) {
            throw new XMLStreamException(
                "badly formed xml: no END_TAG after id text"
                    + xmlSR.getLocation());
        }
        // The end tag was consumed above, so serialize it here.
        sb.append("</");
        if (prefix != null && !prefix.equals("")) {
            sb.append(prefix + ":" + name);
        } else {
            sb.append(name);
        }
        sb.append(">");
        currDepth--;
    }
    write(sb.toString());
}
/**
 * Handles an END_ELEMENT event: appends the serialized end tag to the
 * record buffer and, when the tag closes the current record, copies the
 * buffered record into the output value.
 * <p>
 * A record that ends without an id (when ids are not generated
 * automatically) is logged as an error and discarded.
 *
 * @return false when the record end-element is found; true when keep going
 * @throws XMLStreamException
 */
@SuppressWarnings("unchecked")
private boolean processEndElement() throws XMLStreamException {
    String name = xmlSR.getLocalName();
    String namespace = xmlSR.getNamespaceURI();
    if (LOG.isTraceEnabled()) {
        LOG.trace("End-tag: " + xmlSR.getName() + " at depth " + currDepth);
    }
    if ("".equals(namespace)) {
        namespace = null;
    }
    String prefix = xmlSR.getPrefix();
    if (namespace == null) {
        String lookupPrefix = "".equals(prefix) ? DEFAULT_NS : prefix;
        if (nameSpaces.get(lookupPrefix) != null &&
            !nameSpaces.get(lookupPrefix).isEmpty()) {
            namespace = nameSpaces.get(lookupPrefix).peek();
        }
    }
    StringBuilder sb = new StringBuilder();
    sb.append("</");
    // Compare by content: a reference comparison against "" is unreliable.
    if (prefix != null && !prefix.isEmpty()) {
        sb.append(prefix + ":" + name);
    } else {
        sb.append(name);
    }
    sb.append(">");
    //write to finish the end tag before checking errors
    write(sb.toString());

    boolean sameNamespace = (recordNamespace == null)
        ? namespace == null
        : recordNamespace.equals(namespace);
    boolean isRecordEnd = newDoc && name.equals(recordName) && sameNamespace;
    if (!isRecordEnd) {
        // not the end of the record: go look for more nodes
        if (currDepth == 1) {
            cleanupEndElement();
        } else {
            removeNameSpaceDecl();
            currDepth--;
        }
        return true;
    }
    if (!useAutomaticId && null == currentId && newDoc) {
        LOG.error("end of record element " + name
            + " with no id found: " + ConfigConstants.AGGREGATE_URI_ID
            + "=" + idName);
        cleanupEndElement();
        return true;
    }
    if (value instanceof Text) {
        ((Text) value).set(buffer.toString());
    } else {
        ((Text)((ContentWithFileNameWritable)
            value).getValue()).set(buffer.toString());
    }
    cleanupEndElement();
    // end of new record
    return false;
}
/**
 * Resets per-record state after an element has ended: forgets the current
 * id, marks the reader as outside a record, empties the record buffer,
 * undoes the element's namespace declarations and steps up one level.
 */
protected void cleanupEndElement() {
    newDoc = false;
    currentId = null;
    // reset buffer
    buffer.setLength(0);
    // Must run before the depth changes: it compares against currDepth.
    removeNameSpaceDecl();
    currDepth -= 1;
}
/**
 * Advances the parser until the next complete record has been read.
 * <p>
 * Every event inside a record is re-serialized into the record buffer.
 * At the end of a document the reader either stops (compressed input, or
 * no more files) or moves on to the next file supplied by the iterator.
 *
 * @return true when a record has been read; false when no reader is
 *         available or the input is exhausted
 * @throws IOException if the XML cannot be parsed; the reader is closed
 *         and discarded first
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (xmlSR == null) {
        hasNext = false;
        return false;
    }
    try {
        while (xmlSR.hasNext()) {
            int eventType;
            //getCharacterOffset() returns int;
            //int will overflows if file is larger than 2GB
            if (!overflow && xmlSR.getLocation().getCharacterOffset() < -1) {
                overflow = true;
                LOG.info("In progress...");
            }
            //do not update pos if offset overflows
            if (!overflow) {
                pos = xmlSR.getLocation().getCharacterOffset();
            }
            eventType = xmlSR.next();
            switch (eventType) {
                case XMLStreamConstants.START_ELEMENT:
                    if (startOfRecord) {
                        // this is the start of the root, only copy
                        // namespaces
                        copyNameSpaceDecl();
                        startOfRecord = false;
                        continue;
                    }
                    processStartElement();
                    break;
                case XMLStreamConstants.CHARACTERS:
                    write(StringEscapeUtils.escapeXml10(xmlSR.getText()));
                    break;
                case XMLStreamConstants.CDATA:
                    // Keep the section verbatim; its text must not be escaped.
                    write("<![CDATA[");
                    write(xmlSR.getText());
                    write("]]>");
                    break;
                case XMLStreamConstants.SPACE:
                    write(xmlSR.getText());
                    break;
                case XMLStreamConstants.ENTITY_REFERENCE:
                    write("&");
                    write(xmlSR.getLocalName());
                    write(";");
                    break;
                case XMLStreamConstants.DTD:
                    // Nothing is copied into the record for a DTD event.
                    break;
                case XMLStreamConstants.PROCESSING_INSTRUCTION:
                    write("<?");
                    write(xmlSR.getPITarget());
                    write(" ");
                    write(xmlSR.getPIData());
                    write("?>");
                    break;
                case XMLStreamConstants.COMMENT:
                    write("<!--");
                    write(xmlSR.getText());
                    write("-->");
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    keepGoing = processEndElement();
                    if (!keepGoing) {
                        keepGoing = true;
                        return true;
                    }
                    break;
                case XMLStreamConstants.START_DOCUMENT:
                    throw new XMLStreamException(
                        "unexpected start of document within record!\n"
                            + "recordName = " + recordName
                            + ", recordNamespace = " + recordNamespace
                            + " at " + xmlSR.getLocation());
                case XMLStreamConstants.END_DOCUMENT:
                    if (currentId != null) {
                        throw new XMLStreamException(
                            "end of document before end of current record!\n"
                                + "recordName = " + recordName
                                + ", recordNamespace = " + recordNamespace
                                + " at " + xmlSR.getLocation());
                    }
                    // For compressed input this doc is done; the zip
                    // supplies the next doc, so the iterator is not used.
                    if (!compressed && iterator != null
                        && iterator.hasNext()) {
                        //get next file from FileIterator
                        do {
                            close();
                            // Drop the closed reader so a failed re-init
                            // cannot leave it in use.
                            xmlSR = null;
                            initStreamReader(iterator.next());
                        } while (xmlSR == null && iterator.hasNext());
                        if (xmlSR != null) {
                            continue;
                        }
                    }
                    hasNext = false;
                    return false;
                default:
                    throw new XMLStreamException("UNIMPLEMENTED: " + eventType);
            }
        }
    } catch (XMLStreamException e) {
        LOG.error("Parsing error", e);
        close();
        xmlSR = null;
        throw new IOException("Parsing error", e);
    }
    return false;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy