All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.beam.sdk.io.xml.XmlSource Maven / Gradle / Ivy

There is a newer version: 2.61.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.NoSuchElementException;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.Source;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.codehaus.stax2.XMLInputFactory2;

/** Implementation of {@link XmlIO#read}. */
@SuppressWarnings({
  "nullness" // TODO(https://github.com/apache/beam/issues/20497)
})
public class XmlSource extends FileBasedSource {

  private static final String XML_VERSION = "1.1";

  private final XmlIO.MappingConfiguration configuration;

  /**
   * Creates a source from a file specification. Package-private: no access modifier is declared.
   *
   * @param spec value provider forwarded unchanged to the {@link FileBasedSource} constructor
   *     (presumably the file pattern to read - confirm against {@code FileBasedSource})
   * @param configuration mapping configuration stored on this source; it is later queried for the
   *     record class when the output coder is built
   * @param minBundleSizeBytes minimum bundle size in bytes, forwarded unchanged to the superclass
   */
  XmlSource(
      ValueProvider spec,
      XmlIO.MappingConfiguration configuration,
      long minBundleSizeBytes) {
    super(spec, minBundleSizeBytes);
    this.configuration = configuration;
  }

  /**
   * Creates a source for one file described by {@code metadata}, bounded by the given offsets.
   * Private: within this class it is only invoked when a sub-range source is created.
   *
   * @param configuration mapping configuration stored on this source
   * @param minBundleSizeBytes minimum bundle size in bytes, forwarded to the superclass
   * @param metadata metadata of the file, forwarded to the superclass
   * @param startOffset start offset forwarded to the superclass (presumably a byte offset into the
   *     file - confirm against {@code FileBasedSource})
   * @param endOffset end offset forwarded to the superclass (same caveat as {@code startOffset})
   */
  private XmlSource(
      XmlIO.MappingConfiguration configuration,
      long minBundleSizeBytes,
      Metadata metadata,
      long startOffset,
      long endOffset) {
    // Note the argument order expected by the superclass: metadata first, then the bundle size.
    super(metadata, minBundleSizeBytes, startOffset, endOffset);
    this.configuration = configuration;
  }

  /**
   * Builds an {@code XmlSource} for the given file metadata and offsets, reusing this source's
   * mapping configuration and minimum bundle size.
   *
   * @param metadata metadata of the file the new source reads
   * @param start start offset handed to the new source
   * @param end end offset handed to the new source
   * @return a new {@code XmlSource} restricted to the requested range
   */
  @Override
  protected FileBasedSource createForSubrangeOfFile(Metadata metadata, long start, long end) {
    // The sub-range source inherits the bundle-size floor of the source it is derived from.
    final long inheritedMinBundleSize = getMinBundleSize();
    // NOTE(review): the diamond below implies XmlSource is generic, while the class header visible
    // in this file declares no type parameter - confirm against the upstream source.
    return new XmlSource<>(configuration, inheritedMinBundleSize, metadata, start, end);
  }

  /**
   * Creates the reader used to read records from this source.
   *
   * @param options pipeline options; not consulted by this implementation
   * @return a new {@code XMLReader} bound to this source
   */
  @Override
  protected FileBasedReader createSingleFileReader(PipelineOptions options) {
    return new XMLReader<>(this);
  }

  /**
   * Returns the coder for the records produced by this source.
   *
   * @return a {@code JAXBCoder} created for the record class held by the mapping configuration
   */
  @Override
  public Coder getOutputCoder() {
    return JAXBCoder.of(configuration.getRecordClass());
  }

  /**
   * A {@link Source.Reader} for reading JAXB annotated Java objects from an XML file. The XML file
   * should be of the form defined at {@link XmlSource}.
   *
   * <p>
Timestamped values are currently unsupported - all values implicitly have the timestamp of * {@code BoundedWindow.TIMESTAMP_MIN_VALUE}. * * @param Type of objects that will be read by the reader. */ private static class XMLReader extends FileBasedReader { // The amount of bytes read from the channel to memory when determining the starting offset of // the first record in a bundle. After matching to starting offset of the first record the // remaining bytes read to this buffer and the bytes still not read from the channel are used to // create the XML parser. private static final int BUF_SIZE = 1024; // This should be the maximum number of bytes a character will encode to, for any encoding // supported by XmlSource. Currently this is set to 4 since UTF-8 characters may be // four bytes. private static final int MAX_CHAR_BYTES = 4; // In order to support reading starting in the middle of an XML file, we construct an imaginary // well-formed document (a header and root tag followed by the contents of the input starting at // the record boundary) and feed it to the parser. Because of this, the offset reported by the // XML parser is not the same as offset in the original file. They differ by a constant amount: // offsetInOriginalFile = parser.getLocation().getCharacterOffset() + parserBaseOffset; // Note that this is true only for files with single-byte characters. // It appears that, as of writing, there does not exist a Java XML parser capable of correctly // reporting byte offsets of elements in the presence of multi-byte characters. private long parserBaseOffset = 0; private boolean readingStarted = false; // If true, the current bundle does not contain any records. private boolean emptyBundle = false; private Unmarshaller jaxbUnmarshaller = null; private XMLStreamReader parser = null; private T currentRecord = null; // Byte offset of the current record in the XML file provided when creating the source. 
private long currentByteOffset = 0; XMLReader(XmlSource source) { super(source); // Set up a JAXB Unmarshaller that can be used to unmarshall record objects. try { JAXBContext jaxbContext = JAXBContext.newInstance(getCurrentSource().configuration.getRecordClass()); jaxbUnmarshaller = jaxbContext.createUnmarshaller(); if (getCurrentSource().configuration.getValidationEventHandler() != null) { jaxbUnmarshaller.setEventHandler( getCurrentSource().configuration.getValidationEventHandler()); } } catch (JAXBException e) { throw new RuntimeException(e); } } @Override public synchronized XmlSource getCurrentSource() { return (XmlSource) super.getCurrentSource(); } @Override protected void startReading(ReadableByteChannel channel) throws IOException { // This method determines the correct starting offset of the first record by reading bytes // from the ReadableByteChannel. This implementation does not need the channel to be a // SeekableByteChannel. // The method tries to determine the first record element in the byte channel. The first // record must start with the characters "' character // * '/' character (to support empty records). // // After this match this method creates the XML parser for parsing the XML document, // feeding it a fake document consisting of an XML header and the tag followed // by the contents of channel starting from tag may be never // closed. // This stores any bytes that should be used prior to the remaining bytes of the channel when // creating an XML parser object. ByteArrayOutputStream preambleByteBuffer = new ByteArrayOutputStream(); // A dummy declaration and root for the document with proper XML version and encoding. Without // this XML parsing may fail or may produce incorrect results. 
byte[] dummyStartDocumentBytes = String.format( "<%s>", XML_VERSION, getCurrentSource().configuration.getRootElement()) .getBytes(Charset.forName(getCurrentSource().configuration.getCharset())); preambleByteBuffer.write(dummyStartDocumentBytes); // Gets the byte offset (in the input file) of the first record in ReadableByteChannel. This // method returns the offset and stores any bytes that should be used when creating the XML // parser in preambleByteBuffer. long offsetInFileOfRecordElement = getFirstOccurenceOfRecordElement(channel, preambleByteBuffer); if (offsetInFileOfRecordElement < 0) { // Bundle has no records. So marking this bundle as an empty bundle. emptyBundle = true; return; } else { byte[] preambleBytes = preambleByteBuffer.toByteArray(); currentByteOffset = offsetInFileOfRecordElement; setUpXMLParser(channel, preambleBytes); parserBaseOffset = offsetInFileOfRecordElement - dummyStartDocumentBytes.length; } readingStarted = true; } // Gets the first occurrence of the next record within the given ReadableByteChannel. Puts // any bytes read past the starting offset of the next record back to the preambleByteBuffer. // If a record is found, returns the starting offset of the record, otherwise // returns -1. private long getFirstOccurenceOfRecordElement( ReadableByteChannel channel, ByteArrayOutputStream preambleByteBuffer) throws IOException { int byteIndexInRecordElementToMatch = 0; // Index of the byte in the string " 0) { buf.flip(); while (buf.hasRemaining()) { offsetInFileOfCurrentByte++; byte b = buf.get(); boolean reset = false; if (recordStartBytesMatched) { // We already matched "..." // * "..." // * "' || c == '/') { fullyMatched = true; // Add the recordStartBytes and charBytes to preambleByteBuffer since these were // already read from the channel. preambleByteBuffer.write(recordStartBytes); preambleByteBuffer.write(charBytes); // Also add the rest of the current buffer to preambleByteBuffer. 
while (buf.hasRemaining()) { preambleByteBuffer.write(buf.get()); } break outer; } else { // Matching was unsuccessful. Reset the buffer to include bytes read for the char. int bytesToWrite = buf.remaining() + charBytes.length; ByteBuffer newbuf; if (bytesToWrite > BUF_SIZE) { // Avoiding buffer overflow. The number of bytes to push to the buffer might be // larger than BUF_SIZE due to additional 'charBytes'. newbuf = ByteBuffer.allocate(bytesToWrite); bufSizeChanged = true; } else { newbuf = ByteBuffer.allocate(BUF_SIZE); } newbuf.put(charBytes); offsetInFileOfCurrentByte -= charBytes.length; while (buf.hasRemaining()) { newbuf.put(buf.get()); } newbuf.flip(); buf = newbuf; // Ignore everything and try again starting from the current buffer. reset = true; } } else if (b == recordStartBytes[byteIndexInRecordElementToMatch]) { // Next byte matched. if (!matchStarted) { // Match was for the first byte, record the starting offset. matchStarted = true; startingOffsetInFileOfCurrentMatch = offsetInFileOfCurrentByte; } byteIndexInRecordElementToMatch++; } else { // Not a match. Ignore everything and try again starting at current point. reset = true; } if (reset) { // Clear variables and try to match starting from the next byte. byteIndexInRecordElementToMatch = 0; startingOffsetInFileOfCurrentMatch = -1; matchStarted = false; recordStartBytesMatched = false; charBytes = new byte[MAX_CHAR_BYTES]; charBytesFound = 0; } if (byteIndexInRecordElementToMatch == recordStartBytes.length) { // " jb = jaxbUnmarshaller.unmarshal(parser, getCurrentSource().configuration.getRecordClass()); currentRecord = jb.getValue(); return true; } catch (JAXBException | XMLStreamException e) { throw new IOException(e); } } @Override public T getCurrent() throws NoSuchElementException { if (!readingStarted) { throw new NoSuchElementException(); } return currentRecord; } @Override protected boolean isAtSplitPoint() { // Every record is at a split point. 
return true; } @Override protected long getCurrentOffset() { return currentByteOffset; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy