org.apache.beam.sdk.io.xml.XmlSource Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of beam-sdks-java-io-xml Show documentation
There is a newer version: 2.59.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.NoSuchElementException;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.FileBasedSource;
import org.apache.beam.sdk.io.Source;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.codehaus.stax2.XMLInputFactory2;

/** Implementation of {@link XmlIO#read}. */
@SuppressWarnings({
  "nullness" // TODO(https://github.com/apache/beam/issues/20497)
})
public class XmlSource extends FileBasedSource {

  private static final String XML_VERSION = "1.1";

  private final XmlIO.MappingConfiguration configuration;

  /**
   * Creates a source from a {@link ValueProvider} specification.
   *
   * <p>{@code spec} and {@code minBundleSizeBytes} are handed straight to the {@link
   * FileBasedSource} constructor; only {@code configuration} is retained by this class.
   *
   * @param spec value provider forwarded to the superclass constructor
   * @param configuration mapping configuration kept for use by this source and its readers
   * @param minBundleSizeBytes minimum bundle size forwarded to the superclass constructor
   */
  XmlSource(
      ValueProvider spec,
      XmlIO.MappingConfiguration configuration,
      long minBundleSizeBytes) {
    super(spec, minBundleSizeBytes);
    this.configuration = configuration;
  }

  /**
   * Creates a source restricted to an offset range of a single file.
   *
   * <p>Called by {@code createForSubrangeOfFile}. {@code metadata}, {@code minBundleSizeBytes},
   * {@code startOffset} and {@code endOffset} are forwarded to the {@link FileBasedSource}
   * constructor; only {@code configuration} is retained by this class.
   *
   * @param configuration mapping configuration kept for use by this source and its readers
   * @param minBundleSizeBytes minimum bundle size forwarded to the superclass constructor
   * @param metadata file metadata forwarded to the superclass constructor
   * @param startOffset start offset forwarded to the superclass constructor
   * @param endOffset end offset forwarded to the superclass constructor
   */
  private XmlSource(
      XmlIO.MappingConfiguration configuration,
      long minBundleSizeBytes,
      Metadata metadata,
      long startOffset,
      long endOffset) {
    super(metadata, minBundleSizeBytes, startOffset, endOffset);
    this.configuration = configuration;
  }

  /**
   * Creates the {@code XmlSource} for a sub-range of a single file.
   *
   * <p>The new source reuses this source's mapping configuration and minimum bundle size.
   *
   * @param metadata file metadata passed through to the new source
   * @param start start offset passed through to the new source
   * @param end end offset passed through to the new source
   * @return a new {@code XmlSource} built from the arguments above
   */
  @Override
  protected FileBasedSource createForSubrangeOfFile(Metadata metadata, long start, long end) {
    return new XmlSource<>(configuration, getMinBundleSize(), metadata, start, end);
  }

  /**
   * Creates the reader for this source.
   *
   * @param options pipeline options; not used by this implementation
   * @return a new {@code XMLReader} bound to this source
   */
  @Override
  protected FileBasedReader createSingleFileReader(PipelineOptions options) {
    return new XMLReader<>(this);
  }

  /**
   * Returns the coder for this source's output.
   *
   * @return a {@link JAXBCoder} created for the record class held by the mapping configuration
   */
  @Override
  public Coder getOutputCoder() {
    return JAXBCoder.of(configuration.getRecordClass());
  }

  /**
   * A {@link Source.Reader} for reading JAXB annotated Java objects from an XML file. The XML file
   * should be of the form defined at {@link XmlSource}.
   *
   * Timestamped values are currently unsupported - all values implicitly have the timestamp of
   * {@code BoundedWindow.TIMESTAMP_MIN_VALUE}.
   *
   * @param <T> Type of objects that will be read by the reader.
   */
  private static class XMLReader extends FileBasedReader {
    // The amount of bytes read from the channel to memory when determining the starting offset of
    // the first record in a bundle. After matching to starting offset of the first record the
    // remaining bytes read to this buffer and the bytes still not read from the channel are used to
    // create the XML parser.
    private static final int BUF_SIZE = 1024;

    // This should be the maximum number of bytes a character will encode to, for any encoding
    // supported by XmlSource. Currently this is set to 4 since UTF-8 characters may be
    // four bytes.
    private static final int MAX_CHAR_BYTES = 4;

    // In order to support reading starting in the middle of an XML file, we construct an imaginary
    // well-formed document (a header and root tag followed by the contents of the input starting at
    // the record boundary) and feed it to the parser. Because of this, the offset reported by the
    // XML parser is not the same as offset in the original file. They differ by a constant amount:
    // offsetInOriginalFile = parser.getLocation().getCharacterOffset() + parserBaseOffset;
    // Note that this is true only for files with single-byte characters.
    // It appears that, as of writing, there does not exist a Java XML parser capable of correctly
    // reporting byte offsets of elements in the presence of multi-byte characters.
    private long parserBaseOffset = 0;
    private boolean readingStarted = false;

    // If true, the current bundle does not contain any records.
    private boolean emptyBundle = false;

    private Unmarshaller jaxbUnmarshaller = null;
    private XMLStreamReader parser = null;

    private T currentRecord = null;

    // Byte offset of the current record in the XML file provided when creating the source.
    private long currentByteOffset = 0;

    XMLReader(XmlSource source) {
      super(source);

      // Set up a JAXB Unmarshaller that can be used to unmarshall record objects.
      try {
        JAXBContext jaxbContext =
            JAXBContext.newInstance(getCurrentSource().configuration.getRecordClass());
        jaxbUnmarshaller = jaxbContext.createUnmarshaller();
        if (getCurrentSource().configuration.getValidationEventHandler() != null) {
          jaxbUnmarshaller.setEventHandler(
              getCurrentSource().configuration.getValidationEventHandler());
        }
      } catch (JAXBException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Returns the current source narrowed to {@code XmlSource}.
     *
     * <p>The reader's constructor only accepts an {@code XmlSource}, which is what the cast relies
     * on.
     *
     * @return the value of {@code super.getCurrentSource()} cast to {@code XmlSource}
     */
    @Override
    public synchronized XmlSource getCurrentSource() {
      return (XmlSource) super.getCurrentSource();
    }

    /**
     * Locates the first record of this bundle in {@code channel} and creates the XML parser
     * positioned at that record.
     *
     * <p>If no record is found, the bundle is marked as empty and no parser is created.
     *
     * @param channel channel to read the bundle's bytes from; it does not need to be a
     *     SeekableByteChannel
     * @throws IOException if an I/O error is reported while locating the first record or while
     *     setting up the parser
     */
    @Override
    protected void startReading(ReadableByteChannel channel) throws IOException {
      // The starting offset of the first record is determined purely by reading bytes from the
      // channel (see getFirstOccurenceOfRecordElement): the opening of a record element has to be
      // followed by a delimiter such as '>' or '/' (the latter to support empty records).
      //
      // After this match the XML parser is created and fed a fake document: an XML header and the
      // root element's start tag, followed by the contents of the channel starting at the record
      // boundary. The root tag of this fake document may never be closed.

      // Stores any bytes that should be used prior to the remaining bytes of the channel when
      // creating the XML parser object.
      ByteArrayOutputStream preambleByteBuffer = new ByteArrayOutputStream();

      // A dummy declaration and root for the document with proper XML version and encoding. Without
      // this XML parsing may fail or may produce incorrect results. The format string has one
      // placeholder per argument, producing: <?xml version="..." encoding="..."?><root>
      String charsetName = getCurrentSource().configuration.getCharset();
      byte[] dummyStartDocumentBytes =
          String.format(
                  "<?xml version=\"%s\" encoding=\"%s\"?><%s>",
                  XML_VERSION,
                  charsetName,
                  getCurrentSource().configuration.getRootElement())
              .getBytes(Charset.forName(charsetName));
      preambleByteBuffer.write(dummyStartDocumentBytes);

      // Gets the byte offset (in the input file) of the first record in the channel. The call also
      // appends to preambleByteBuffer any bytes it consumed that the parser still has to see.
      long offsetInFileOfRecordElement =
          getFirstOccurenceOfRecordElement(channel, preambleByteBuffer);
      if (offsetInFileOfRecordElement < 0) {
        // Bundle has no records. So marking this bundle as an empty bundle.
        emptyBundle = true;
        return;
      }

      byte[] preambleBytes = preambleByteBuffer.toByteArray();
      currentByteOffset = offsetInFileOfRecordElement;
      setUpXMLParser(channel, preambleBytes);
      // The parser sees the dummy prefix before the first record, so its offsets are shifted by the
      // prefix length relative to offsets in the original file.
      parserBaseOffset = offsetInFileOfRecordElement - dummyStartDocumentBytes.length;
      readingStarted = true;
    }

    // Gets the first occurrence of the next record within the given ReadableByteChannel. Puts
    // any bytes read past the starting offset of the next record back to the preambleByteBuffer.
    // If a record is found, returns the starting offset of the record, otherwise
    // returns -1.
    // NOTE(review): the text of this method is incomplete as it appears here and must be restored
    // from the upstream file before it can be compiled or modified. Several comment lines below
    // run straight into code fragments, and the following names are used without any visible
    // declaration: buf, offsetInFileOfCurrentByte, recordStartBytes, recordStartBytesMatched,
    // charBytes, charBytesFound, matchStarted, startingOffsetInFileOfCurrentMatch, fullyMatched,
    // bufSizeChanged, jb and the loop label "outer". The closing part (the unmarshal call and the
    // catch clause) looks like the tail of a different method whose beginning is missing; the
    // setUpXMLParser method called from startReading is not visible either.
    private long getFirstOccurenceOfRecordElement(
        ReadableByteChannel channel, ByteArrayOutputStream preambleByteBuffer) throws IOException {
      int byteIndexInRecordElementToMatch = 0;
      // Index of the byte in the string " 0) {
        buf.flip();
        while (buf.hasRemaining()) {
          offsetInFileOfCurrentByte++;
          byte b = buf.get();
          boolean reset = false;
          if (recordStartBytesMatched) {
            // We already matched "..."
            // * "..."
            // * "' || c == '/') {
              fullyMatched = true;
              // Add the recordStartBytes and charBytes to preambleByteBuffer since these were
              // already read from the channel.
              preambleByteBuffer.write(recordStartBytes);
              preambleByteBuffer.write(charBytes);
              // Also add the rest of the current buffer to preambleByteBuffer.
              while (buf.hasRemaining()) {
                preambleByteBuffer.write(buf.get());
              }
              break outer;
            } else {
              // Matching was unsuccessful. Reset the buffer to include bytes read for the char.
              int bytesToWrite = buf.remaining() + charBytes.length;
              ByteBuffer newbuf;
              if (bytesToWrite > BUF_SIZE) {
                // Avoiding buffer overflow. The number of bytes to push to the buffer might be
                // larger than BUF_SIZE due to additional 'charBytes'.
                newbuf = ByteBuffer.allocate(bytesToWrite);
                bufSizeChanged = true;
              } else {
                newbuf = ByteBuffer.allocate(BUF_SIZE);
              }
              newbuf.put(charBytes);
              offsetInFileOfCurrentByte -= charBytes.length;
              while (buf.hasRemaining()) {
                newbuf.put(buf.get());
              }
              newbuf.flip();
              buf = newbuf;

              // Ignore everything and try again starting from the current buffer.
              reset = true;
            }
          } else if (b == recordStartBytes[byteIndexInRecordElementToMatch]) {
            // Next byte matched.
            if (!matchStarted) {
              // Match was for the first byte, record the starting offset.
              matchStarted = true;
              startingOffsetInFileOfCurrentMatch = offsetInFileOfCurrentByte;
            }
            byteIndexInRecordElementToMatch++;
          } else {
            // Not a match. Ignore everything and try again starting at current point.
            reset = true;
          }
          if (reset) {
            // Clear variables and try to match starting from the next byte.
            byteIndexInRecordElementToMatch = 0;
            startingOffsetInFileOfCurrentMatch = -1;
            matchStarted = false;
            recordStartBytesMatched = false;
            charBytes = new byte[MAX_CHAR_BYTES];
            charBytesFound = 0;
          }
          if (byteIndexInRecordElementToMatch == recordStartBytes.length) {
            // " jb =
            jaxbUnmarshaller.unmarshal(parser, getCurrentSource().configuration.getRecordClass());
        currentRecord = jb.getValue();
        return true;
      } catch (JAXBException | XMLStreamException e) {
        throw new IOException(e);
      }
    }

    /**
     * Returns the record the reader is currently positioned at.
     *
     * @return the most recently read record
     * @throws NoSuchElementException if reading has not been started
     */
    @Override
    public T getCurrent() throws NoSuchElementException {
      if (readingStarted) {
        return currentRecord;
      }
      // Nothing has been read yet, so there is no current record to hand out.
      throw new NoSuchElementException();
    }

    /**
     * Reports whether the current record is at a split point.
     *
     * @return always {@code true}
     */
    @Override
    protected boolean isAtSplitPoint() {
      // Every record is at a split point.
      return true;
    }

    /**
     * Returns the offset tracked for the current record.
     *
     * @return the value of {@code currentByteOffset}, i.e. the byte offset of the current record in
     *     the XML file provided when creating the source
     */
    @Override
    protected long getCurrentOffset() {
      return currentByteOffset;
    }
  }
}