// org.apache.beam.sdk.io.xml.XmlSource (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.xml;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.NoSuchElementException;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.Source;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.codehaus.stax2.XMLInputFactory2;
/** Implementation of {@link XmlIO#read}. */
@SuppressWarnings({
"nullness" // TODO(https://github.com/apache/beam/issues/20497)
})
public class XmlSource extends FileBasedSource {
// XML version passed to String.format when startReading builds the dummy start-of-document
// bytes that are fed to the parser ahead of the file contents.
private static final String XML_VERSION = "1.1";
// Mapping settings for this source. Within this file it is read for the record class, the
// validation event handler, the root element name and the charset name.
private final XmlIO.MappingConfiguration configuration;
/**
* Creates a source from a file spec and a mapping configuration.
*
* NOTE(review): generic type arguments appear to have been stripped from this copy of the file
* (for example on {@code ValueProvider} and {@code MappingConfiguration}) - confirm against the
* upstream source.
*
* @param spec forwarded unchanged to the {@code FileBasedSource} constructor
* @param configuration mapping settings stored on this source
* @param minBundleSizeBytes forwarded unchanged to the {@code FileBasedSource} constructor
*/
XmlSource(
ValueProvider spec,
XmlIO.MappingConfiguration configuration,
long minBundleSizeBytes) {
super(spec, minBundleSizeBytes);
this.configuration = configuration;
}
/**
* Creates a source for an offset range of a single file. Within this file it is only called
* from {@code createForSubrangeOfFile}.
*
* @param configuration mapping settings stored on this source
* @param minBundleSizeBytes forwarded to the {@code FileBasedSource} constructor
* @param metadata forwarded to the {@code FileBasedSource} constructor
* @param startOffset forwarded to the {@code FileBasedSource} constructor
* @param endOffset forwarded to the {@code FileBasedSource} constructor
*/
private XmlSource(
XmlIO.MappingConfiguration configuration,
long minBundleSizeBytes,
Metadata metadata,
long startOffset,
long endOffset) {
// Note the argument order differs from this constructor's own parameter order.
super(metadata, minBundleSizeBytes, startOffset, endOffset);
this.configuration = configuration;
}
/**
 * Creates the source for the given offsets of the file described by {@code metadata}.
 *
 * <p>The new source reuses this source's mapping configuration and minimum bundle size.
 *
 * @param metadata metadata of the file the new source reads
 * @param start start offset handed to the new source
 * @param end end offset handed to the new source
 * @return a new {@code XmlSource} for the requested offsets
 */
@Override
protected FileBasedSource createForSubrangeOfFile(Metadata metadata, long start, long end) {
  final long inheritedMinBundleSize = getMinBundleSize();
  return new XmlSource<>(configuration, inheritedMinBundleSize, metadata, start, end);
}
/**
* Returns a new {@code XMLReader} bound to this source.
*
* @param options not used by this implementation
* @return a reader over this source
*/
@Override
protected FileBasedReader createSingleFileReader(PipelineOptions options) {
return new XMLReader<>(this);
}
/**
* Returns the coder for records produced by this source.
*
* @return a {@code JAXBCoder} created for the record class held by the mapping configuration
*/
@Override
public Coder getOutputCoder() {
return JAXBCoder.of(configuration.getRecordClass());
}
/**
* A {@link Source.Reader} for reading JAXB annotated Java objects from an XML file. The XML file
* should be of the form defined at {@link XmlSource}.
*
* <p>Timestamped values are currently unsupported - all values implicitly have the timestamp of
* {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
*
* @param <T> Type of objects that will be read by the reader.
*/
private static class XMLReader extends FileBasedReader {
// The number of bytes read from the channel into memory at a time when determining the starting
// offset of the first record in a bundle. After the starting offset of the first record has been
// matched, the remaining bytes read into this buffer and the bytes still not read from the
// channel are used to create the XML parser.
private static final int BUF_SIZE = 1024;
// This should be the maximum number of bytes a character will encode to, for any encoding
// supported by XmlSource. Currently this is set to 4 since UTF-8 characters may be
// four bytes.
private static final int MAX_CHAR_BYTES = 4;
// In order to support reading starting in the middle of an XML file, we construct an imaginary
// well-formed document (a header and root tag followed by the contents of the input starting at
// the record boundary) and feed it to the parser. Because of this, the offset reported by the
// XML parser is not the same as offset in the original file. They differ by a constant amount:
// offsetInOriginalFile = parser.getLocation().getCharacterOffset() + parserBaseOffset;
// Note that this is true only for files with single-byte characters.
// It appears that, as of writing, there does not exist a Java XML parser capable of correctly
// reporting byte offsets of elements in the presence of multi-byte characters.
private long parserBaseOffset = 0;
// Set to true at the end of startReading once a first record has been located. getCurrent throws
// NoSuchElementException while this is still false.
private boolean readingStarted = false;
// If true, the current bundle does not contain any records.
private boolean emptyBundle = false;
// Created in the constructor from the source's record class; used to unmarshal each record.
private Unmarshaller jaxbUnmarshaller = null;
// Stream reader handed to the unmarshaller for each record. It is not assigned in the
// constructor; presumably it is created by setUpXMLParser during startReading - confirm.
private XMLStreamReader parser = null;
// The most recently unmarshalled record; this is the value getCurrent returns.
private T currentRecord = null;
// Byte offset of the current record in the XML file provided when creating the source.
private long currentByteOffset = 0;
/**
* Creates a reader for {@code source} and prepares the JAXB unmarshaller used to decode records.
*
* @param source the source this reader reads from
* @throws RuntimeException wrapping the {@code JAXBException} if the JAXB context or
*     unmarshaller cannot be created, or the event handler cannot be installed
*/
XMLReader(XmlSource source) {
super(source);
// Set up a JAXB Unmarshaller that can be used to unmarshall record objects.
try {
JAXBContext jaxbContext =
JAXBContext.newInstance(getCurrentSource().configuration.getRecordClass());
jaxbUnmarshaller = jaxbContext.createUnmarshaller();
// A validation event handler is optional; only install it when one was configured.
if (getCurrentSource().configuration.getValidationEventHandler() != null) {
jaxbUnmarshaller.setEventHandler(
getCurrentSource().configuration.getValidationEventHandler());
}
} catch (JAXBException e) {
throw new RuntimeException(e);
}
}
/**
* Returns the current source narrowed to {@code XmlSource}, so that callers in this class can
* reach its {@code configuration} field.
*
* @return the result of {@code super.getCurrentSource()} cast to {@code XmlSource}
*/
@Override
public synchronized XmlSource getCurrentSource() {
return (XmlSource) super.getCurrentSource();
}
/**
 * Locates the first record in {@code channel} and sets up the XML parser to read from it.
 *
 * <p>The channel does not need to be a {@code SeekableByteChannel}. If no record is found, the
 * bundle is marked as empty and no parser is created.
 *
 * @param channel the channel to scan for the first record and then parse from
 * @throws IOException if the record search or the parser setup fails with an I/O error
 */
@Override
protected void startReading(ReadableByteChannel channel) throws IOException {
  // The first record is recognised by the characters "<recordElement" (the record element name
  // of the source) followed by a character that ends the element name, such as:
  // * the '>' character
  // * the '/' character (to support empty records).
  // The exact set of accepted characters is decided by getFirstOccurenceOfRecordElement.
  //
  // After this match the XML parser is created and fed a fake document consisting of an XML
  // header and the root tag, followed by the contents of the channel starting from the first
  // record. Note that the root tag may never be closed.

  // Stores any bytes that must be fed to the parser before the remaining bytes of the channel.
  ByteArrayOutputStream preambleByteBuffer = new ByteArrayOutputStream();

  // A dummy declaration and root for the document with proper XML version and encoding. Without
  // this XML parsing may fail or may produce incorrect results. The format string needs one
  // placeholder per argument: version, encoding and root element name.
  String charsetName = getCurrentSource().configuration.getCharset();
  byte[] dummyStartDocumentBytes =
      String.format(
              "<?xml version=\"%s\" encoding=\"%s\"?><%s>",
              XML_VERSION,
              charsetName,
              getCurrentSource().configuration.getRootElement())
          .getBytes(Charset.forName(charsetName));
  preambleByteBuffer.write(dummyStartDocumentBytes);

  // Byte offset (in the input file) of the first record in the channel. The call also appends to
  // preambleByteBuffer any bytes it consumed that the parser still needs to see.
  long offsetInFileOfRecordElement =
      getFirstOccurenceOfRecordElement(channel, preambleByteBuffer);
  if (offsetInFileOfRecordElement < 0) {
    // Bundle has no records, so mark it as empty and leave readingStarted false.
    emptyBundle = true;
    return;
  }

  currentByteOffset = offsetInFileOfRecordElement;
  setUpXMLParser(channel, preambleByteBuffer.toByteArray());
  // The parser sees the dummy header first, so its offsets are shifted by the header length.
  parserBaseOffset = offsetInFileOfRecordElement - dummyStartDocumentBytes.length;
  readingStarted = true;
}
// Gets the first occurrence of the next record within the given ReadableByteChannel. Puts
// any bytes read past the starting offset of the next record back to the preambleByteBuffer.
// If a record is found, returns the starting offset of the record, otherwise
// returns -1.
private long getFirstOccurenceOfRecordElement(
ReadableByteChannel channel, ByteArrayOutputStream preambleByteBuffer) throws IOException {
int byteIndexInRecordElementToMatch = 0;
// Index of the byte in the string " 0) {
buf.flip();
while (buf.hasRemaining()) {
offsetInFileOfCurrentByte++;
byte b = buf.get();
boolean reset = false;
if (recordStartBytesMatched) {
// We already matched "..."
// * "..."
// * "' || c == '/') {
fullyMatched = true;
// Add the recordStartBytes and charBytes to preambleByteBuffer since these were
// already read from the channel.
preambleByteBuffer.write(recordStartBytes);
preambleByteBuffer.write(charBytes);
// Also add the rest of the current buffer to preambleByteBuffer.
while (buf.hasRemaining()) {
preambleByteBuffer.write(buf.get());
}
break outer;
} else {
// Matching was unsuccessful. Reset the buffer to include bytes read for the char.
int bytesToWrite = buf.remaining() + charBytes.length;
ByteBuffer newbuf;
if (bytesToWrite > BUF_SIZE) {
// Avoiding buffer overflow. The number of bytes to push to the buffer might be
// larger than BUF_SIZE due to additional 'charBytes'.
newbuf = ByteBuffer.allocate(bytesToWrite);
bufSizeChanged = true;
} else {
newbuf = ByteBuffer.allocate(BUF_SIZE);
}
newbuf.put(charBytes);
offsetInFileOfCurrentByte -= charBytes.length;
while (buf.hasRemaining()) {
newbuf.put(buf.get());
}
newbuf.flip();
buf = newbuf;
// Ignore everything and try again starting from the current buffer.
reset = true;
}
} else if (b == recordStartBytes[byteIndexInRecordElementToMatch]) {
// Next byte matched.
if (!matchStarted) {
// Match was for the first byte, record the starting offset.
matchStarted = true;
startingOffsetInFileOfCurrentMatch = offsetInFileOfCurrentByte;
}
byteIndexInRecordElementToMatch++;
} else {
// Not a match. Ignore everything and try again starting at current point.
reset = true;
}
if (reset) {
// Clear variables and try to match starting from the next byte.
byteIndexInRecordElementToMatch = 0;
startingOffsetInFileOfCurrentMatch = -1;
matchStarted = false;
recordStartBytesMatched = false;
charBytes = new byte[MAX_CHAR_BYTES];
charBytesFound = 0;
}
if (byteIndexInRecordElementToMatch == recordStartBytes.length) {
// " jb =
jaxbUnmarshaller.unmarshal(parser, getCurrentSource().configuration.getRecordClass());
currentRecord = jb.getValue();
return true;
} catch (JAXBException | XMLStreamException e) {
throw new IOException(e);
}
}
/**
 * Returns the record most recently read by this reader.
 *
 * @return the current record
 * @throws NoSuchElementException if reading has not started
 */
@Override
public T getCurrent() throws NoSuchElementException {
  if (readingStarted) {
    return currentRecord;
  }
  throw new NoSuchElementException();
}
/**
* Reports whether the current record is at a split point.
*
* @return always {@code true}
*/
@Override
protected boolean isAtSplitPoint() {
// Every record is at a split point.
return true;
}
/**
* Returns the byte offset tracked for the current record.
*
* @return the value of {@code currentByteOffset}; startReading sets it to the offset of the
*     first record found in the channel
*/
@Override
protected long getCurrentOffset() {
return currentByteOffset;
}
}
}