// org.apache.beam.sdk.io.xml.XmlIO Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.xml;
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
import com.google.auto.value.AutoValue;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
import javax.xml.bind.ValidationEventHandler;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CompressedSource;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.FileIO.ReadableFile;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.OffsetBasedSource;
import org.apache.beam.sdk.io.ReadAllViaFileBasedSource;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.HasDisplayData;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.checkerframework.checker.nullness.qual.Nullable;
/** Transforms for reading and writing XML files using JAXB mappers. */
@SuppressWarnings({
"nullness" // TODO(https://github.com/apache/beam/issues/20497)
})
public class XmlIO {
// CHECKSTYLE.OFF: JavadocStyle
/**
 * Reads XML files as a {@link PCollection} of a given type mapped via JAXB.
 *
 * <p>The XML files must be of the following form, where {@code root} and {@code record} are XML
 * element names that are defined by the user:
 *
 * <pre>{@code
 * <root>
 * <record> ... </record>
 * <record> ... </record>
 * <record> ... </record>
 * ...
 * <record> ... </record>
 * </root>
 * }</pre>
 *
 * <p>Basically, the XML document should contain a single root element with an inner list
 * consisting entirely of record elements. The records may contain arbitrary XML content; however,
 * that content <b>must not</b> contain the start {@code <record>} or end {@code </record>} tags.
 * This restriction enables reading from large XML files in parallel from different offsets in the
 * file.
 *
 * <p>Root and/or record elements may additionally contain an arbitrary number of XML attributes.
 * Additionally users must provide a class of a JAXB annotated Java type that can be used to
 * convert records into Java objects and vice versa using JAXB marshalling/unmarshalling
 * mechanisms. Reading the source will generate a {@code PCollection} of the given JAXB annotated
 * Java type. Optionally users may provide a minimum size of a bundle that should be created for
 * the source.
 *
 * <p>Example:
 *
 * <pre>{@code
 * PCollection<Record> output = p.apply(XmlIO.<Record>read()
 *     .from(file.toPath().toString())
 *     .withRootElement("root")
 *     .withRecordElement("record")
 *     .withRecordClass(Record.class));
 * }</pre>
 *
 * <p>By default, UTF-8 charset is used. To specify a different charset, use {@link
 * Read#withCharset}.
 *
 * <p>Currently, only XML files that use single-byte characters are supported. Using a file that
 * contains multi-byte characters may result in data loss or duplication.
 *
 * <p>The returned transform starts with a minimum bundle size of 1 and {@link Compression#AUTO}.
 *
 * @param <T> Type of the objects that represent the records of the XML file. The {@code
 *     PCollection} generated by this source will be of this type.
 */
// CHECKSTYLE.ON: JavadocStyle
public static <T> Read<T> read() {
  return new AutoValue_XmlIO_Read.Builder<T>()
      .setConfiguration(
          new AutoValue_XmlIO_MappingConfiguration.Builder<T>()
              .setCharset(StandardCharsets.UTF_8.name())
              .build())
      .setMinBundleSize(1L)
      .setCompression(Compression.AUTO)
      .build();
}
/**
 * Like {@link #read}, but reads each file in a {@link PCollection} of {@link ReadableFile}, which
 * allows more flexible usage via different configuration options of {@link FileIO#match} and
 * {@link FileIO#readMatches} that are not explicitly provided for {@link #read}.
 *
 * <p>For example:
 *
 * <pre>{@code
 * PCollection<ReadableFile> files = p
 *     .apply(FileIO.match().filepattern(options.getInputFilepatternProvider()).continuously(
 *       Duration.standardSeconds(30), afterTimeSinceNewOutput(Duration.standardMinutes(5))))
 *     .apply(FileIO.readMatches().withCompression(GZIP));
 *
 * PCollection<Record> output = files.apply(XmlIO.<Record>readFiles()
 *     .withRootElement("root")
 *     .withRecordElement("record")
 *     .withRecordClass(Record.class));
 * }</pre>
 *
 * <p>The returned transform is preconfigured with the UTF-8 charset.
 *
 * @param <T> type of the objects that represent the records of the XML files
 */
public static <T> ReadFiles<T> readFiles() {
  return new AutoValue_XmlIO_ReadFiles.Builder<T>()
      .setConfiguration(
          new AutoValue_XmlIO_MappingConfiguration.Builder<T>()
              .setCharset(StandardCharsets.UTF_8.name())
              .build())
      .build();
}
/**
 * Writes all elements in the input {@link PCollection} to a single XML file using {@link #sink}.
 *
 * <p>For more configurable usage, use {@link #sink} directly with {@link FileIO#write} or {@link
 * FileIO#writeDynamic}.
 *
 * <p>The returned transform is preconfigured with the UTF-8 charset.
 *
 * @param <T> type of the elements to write
 */
public static <T> Write<T> write() {
  return new AutoValue_XmlIO_Write.Builder<T>()
      .setCharset(StandardCharsets.UTF_8.name())
      .build();
}
/**
 * Immutable JAXB mapping settings shared by {@link Read} and {@link ReadFiles}: the root and
 * record element names, the record class, the charset (kept as its name) and an optional
 * {@link ValidationEventHandler}.
 *
 * @param <T> type of the records bound via JAXB
 */
@AutoValue
abstract static class MappingConfiguration<T> implements HasDisplayData, Serializable {
  abstract @Nullable String getRootElement();

  abstract @Nullable String getRecordElement();

  abstract @Nullable Class<T> getRecordClass();

  abstract @Nullable String getCharset();

  abstract @Nullable ValidationEventHandler getValidationEventHandler();

  abstract Builder<T> toBuilder();

  @AutoValue.Builder
  abstract static class Builder<T> {
    abstract Builder<T> setRootElement(String rootElement);

    abstract Builder<T> setRecordElement(String recordElement);

    abstract Builder<T> setRecordClass(Class<T> recordClass);

    abstract Builder<T> setCharset(String charset);

    abstract Builder<T> setValidationEventHandler(ValidationEventHandler validationEventHandler);

    abstract MappingConfiguration<T> build();
  }

  /** Returns a copy of this configuration with the given root element name. */
  private MappingConfiguration<T> withRootElement(String rootElement) {
    return toBuilder().setRootElement(rootElement).build();
  }

  /** Returns a copy of this configuration with the given record element name. */
  private MappingConfiguration<T> withRecordElement(String recordElement) {
    return toBuilder().setRecordElement(recordElement).build();
  }

  /** Returns a copy of this configuration with the given JAXB record class. */
  private MappingConfiguration<T> withRecordClass(Class<T> recordClass) {
    return toBuilder().setRecordClass(recordClass).build();
  }

  /** Returns a copy of this configuration that stores the name of the given charset. */
  private MappingConfiguration<T> withCharset(Charset charset) {
    return toBuilder().setCharset(charset.name()).build();
  }

  /** Returns a copy of this configuration with the given JAXB validation event handler. */
  private MappingConfiguration<T> withValidationEventHandler(
      ValidationEventHandler validationEventHandler) {
    return toBuilder().setValidationEventHandler(validationEventHandler).build();
  }

  /**
   * Checks that root element, record element, record class and charset have all been set.
   *
   * @throws IllegalArgumentException if any of them is {@code null}
   */
  private void validate() {
    checkArgument(getRootElement() != null, "withRootElement() is required");
    checkArgument(getRecordElement() != null, "withRecordElement() is required");
    checkArgument(getRecordClass() != null, "withRecordClass() is required");
    checkArgument(getCharset() != null, "withCharset() is required");
  }

  /** Registers every setting that is non-null; the validation event handler is not reported. */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    builder
        .addIfNotNull(
            DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
        .addIfNotNull(
            DisplayData.item("recordElement", getRecordElement()).withLabel("XML Record Element"))
        .addIfNotNull(
            DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
        .addIfNotNull(DisplayData.item("charset", getCharset()).withLabel("Charset"));
  }
}
/** Implementation of {@link #read}. */
@AutoValue
public abstract static class Read extends PTransform> {
abstract MappingConfiguration getConfiguration();
abstract @Nullable ValueProvider getFileOrPatternSpec();
abstract Compression getCompression();
abstract long getMinBundleSize();
abstract Builder toBuilder();
@AutoValue.Builder
abstract static class Builder {
abstract Builder setConfiguration(MappingConfiguration configuration);
abstract Builder setFileOrPatternSpec(ValueProvider fileOrPatternSpec);
abstract Builder setCompression(Compression compression);
abstract Builder setMinBundleSize(long minBundleSize);
abstract Read build();
}
/** @deprecated Use {@link Compression} instead. */
@Deprecated
public enum CompressionType {
/** @see Compression#AUTO */
AUTO(Compression.AUTO),
/** @see Compression#UNCOMPRESSED */
UNCOMPRESSED(Compression.UNCOMPRESSED),
/** @see Compression#GZIP */
GZIP(Compression.GZIP),
/** @see Compression#BZIP2 */
BZIP2(Compression.BZIP2),
/** @see Compression#ZIP */
ZIP(Compression.ZIP),
/** @see Compression#DEFLATE */
DEFLATE(Compression.DEFLATE);
private final Compression canonical;
CompressionType(Compression canonical) {
this.canonical = canonical;
}
/** @see Compression#matches */
public boolean matches(String filename) {
return canonical.matches(filename);
}
}
/**
* Reads a single XML file or a set of XML files defined by a Java "glob" file pattern. Each XML
* file should be of the form defined in {@link #read}.
*/
public Read from(String fileOrPatternSpec) {
return from(StaticValueProvider.of(fileOrPatternSpec));
}
/**
* Reads a single XML file or a set of XML files defined by a Java "glob" file pattern. Each XML
* file should be of the form defined in {@link #read}. Using ValueProviders.
*/
public Read from(ValueProvider fileOrPatternSpec) {
return toBuilder().setFileOrPatternSpec(fileOrPatternSpec).build();
}
private Read withConfiguration(MappingConfiguration configuration) {
return toBuilder().setConfiguration(configuration).build();
}
/**
* Sets name of the root element of the XML document. This will be used to create a valid
* starting root element when initiating a bundle of records created from an XML document. This
* is a required parameter.
*/
public Read withRootElement(String rootElement) {
return withConfiguration(getConfiguration().withRootElement(rootElement));
}
/**
* Sets name of the record element of the XML document. This will be used to determine offset of
* the first record of a bundle created from the XML document. This is a required parameter.
*/
public Read withRecordElement(String recordElement) {
return withConfiguration(getConfiguration().withRecordElement(recordElement));
}
/**
* Sets a JAXB annotated class that can be populated using a record of the provided XML file.
* This will be used when unmarshalling record objects from the XML file. This is a required
* parameter.
*/
public Read withRecordClass(Class recordClass) {
return withConfiguration(getConfiguration().withRecordClass(recordClass));
}
/**
* Sets a parameter {@code minBundleSize} for the minimum bundle size of the source. Please
* refer to {@link OffsetBasedSource} for the definition of minBundleSize. This is an optional
* parameter.
*/
public Read withMinBundleSize(long minBundleSize) {
return toBuilder().setMinBundleSize(minBundleSize).build();
}
/** @deprecated use {@link #withCompression}. */
@Deprecated
public Read withCompressionType(CompressionType compressionType) {
return withCompression(compressionType.canonical);
}
/** Decompresses all input files using the specified compression type. */
public Read withCompression(Compression compression) {
return toBuilder().setCompression(compression).build();
}
/** Sets the XML file charset. */
public Read withCharset(Charset charset) {
return withConfiguration(getConfiguration().withCharset(charset));
}
/**
* Sets the {@link ValidationEventHandler} to use with JAXB. Calling this with a {@code null}
* parameter will cause the JAXB unmarshaller event handler to be unspecified.
*/
public Read withValidationEventHandler(ValidationEventHandler validationEventHandler) {
return withConfiguration(
getConfiguration().withValidationEventHandler(validationEventHandler));
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
builder
.addIfNotDefault(
DisplayData.item("minBundleSize", getMinBundleSize())
.withLabel("Minimum Bundle Size"),
1L)
.add(DisplayData.item("filePattern", getFileOrPatternSpec()).withLabel("File Pattern"))
.include("configuration", getConfiguration());
}
@VisibleForTesting
BoundedSource createSource() {
return CompressedSource.from(new XmlSource<>(getFileOrPatternSpec(), getConfiguration(), 1L))
.withCompression(getCompression());
}
@Override
public PCollection expand(PBegin input) {
getConfiguration().validate();
return input.apply(org.apache.beam.sdk.io.Read.from(createSource()));
}
}
/** Implementation of {@link #readFiles}. */
@AutoValue
public abstract static class ReadFiles
extends PTransform, PCollection> {
abstract MappingConfiguration getConfiguration();
abstract Builder toBuilder();
@AutoValue.Builder
abstract static class Builder {
abstract Builder setConfiguration(MappingConfiguration configuration);
abstract ReadFiles build();
}
private ReadFiles withConfiguration(MappingConfiguration configuration) {
return toBuilder().setConfiguration(configuration).build();
}
/** Like {@link Read#withRootElement}. */
public ReadFiles withRootElement(String rootElement) {
return withConfiguration(getConfiguration().withRootElement(rootElement));
}
/** Like {@link Read#withRecordElement}. */
public ReadFiles withRecordElement(String recordElement) {
return withConfiguration(getConfiguration().withRecordElement(recordElement));
}
/** Like {@link Read#withRecordClass}. */
public ReadFiles withRecordClass(Class recordClass) {
return withConfiguration(getConfiguration().withRecordClass(recordClass));
}
/** Like {@link Read#withCharset}. */
public ReadFiles withCharset(Charset charset) {
return withConfiguration(getConfiguration().withCharset(charset));
}
/** Like {@link Read#withValidationEventHandler}. */
public ReadFiles withValidationEventHandler(ValidationEventHandler validationEventHandler) {
return withConfiguration(
getConfiguration().withValidationEventHandler(validationEventHandler));
}
@Override
public PCollection expand(PCollection input) {
return input.apply(
new ReadAllViaFileBasedSource<>(
64 * 1024L * 1024L,
new CreateSourceFn<>(getConfiguration()),
JAXBCoder.of(getConfiguration().getRecordClass())));
}
}
private static class CreateSourceFn implements SerializableFunction> {
private final MappingConfiguration configuration;
CreateSourceFn(MappingConfiguration configuration) {
this.configuration = configuration;
}
@Override
public XmlSource apply(String input) {
return new XmlSource<>(StaticValueProvider.of(input), configuration, 1L);
}
}
/** Implementation of {@link #write}. */
@AutoValue
public abstract static class Write extends PTransform, PDone> {
abstract @Nullable String getFilenamePrefix();
abstract @Nullable Class getRecordClass();
abstract @Nullable String getRootElement();
abstract @Nullable String getCharset();
abstract Builder toBuilder();
@AutoValue.Builder
abstract static class Builder {
abstract Builder setFilenamePrefix(String prefix);
abstract Builder setRecordClass(Class recordClass);
abstract Builder setRootElement(String rootElement);
abstract Builder setCharset(String charset);
abstract Write build();
}
/**
* Writes to files with the given path prefix.
*
* Output files will have the name {@literal {filenamePrefix}-0000i-of-0000n.xml} where n is
* the number of output bundles.
*/
public Write to(String filenamePrefix) {
return toBuilder().setFilenamePrefix(filenamePrefix).build();
}
/**
* Writes objects of the given class mapped to XML elements using JAXB.
*
* The specified class must be able to be used to create a JAXB context.
*/
public Write withRecordClass(Class recordClass) {
return toBuilder().setRecordClass(recordClass).build();
}
/** Sets the enclosing root element for the generated XML files. */
public Write withRootElement(String rootElement) {
return toBuilder().setRootElement(rootElement).build();
}
/** Sets the charset used to write the file. */
public Write withCharset(Charset charset) {
return toBuilder().setCharset(charset.name()).build();
}
@Override
public PDone expand(PCollection input) {
checkArgument(getRecordClass() != null, "withRecordClass() is required");
checkArgument(getRootElement() != null, "withRootElement() is required");
checkArgument(getFilenamePrefix() != null, "to() is required");
checkArgument(getCharset() != null, "withCharset() is required");
try {
JAXBContext.newInstance(getRecordClass());
} catch (JAXBException e) {
throw new RuntimeException("Error binding classes to a JAXB Context.", e);
}
ResourceId prefix =
FileSystems.matchNewResource(getFilenamePrefix(), false /* isDirectory */);
input.apply(
FileIO.write()
.via(
sink(getRecordClass())
.withCharset(Charset.forName(getCharset()))
.withRootElement(getRootElement()))
.to(prefix.getCurrentDirectory().toString())
.withPrefix(prefix.getFilename())
.withSuffix(".xml")
.withIgnoreWindowing());
return PDone.in(input.getPipeline());
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
builder
.addIfNotNull(
DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
.addIfNotNull(
DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
.addIfNotNull(DisplayData.item("charset", getCharset()).withLabel("Charset"));
}
}
// CHECKSTYLE.OFF: JavadocStyle
/**
 * Outputs records as XML-formatted elements using JAXB.
 *
 * <p>The produced file consists of a single root element containing 1 sub-element per element
 * written to the sink.
 *
 * <p>The given class will be used in the marshalling of records in an input PCollection to their
 * XML representation and must be able to be bound using JAXB annotations.
 *
 * <p>For example, consider the following class with JAXB annotations:
 *
 * <pre>
 *  {@literal @}XmlRootElement(name = "word_count_result")
 *  {@literal @}XmlType(propOrder = {"word", "frequency"})
 *  public class WordFrequency {
 *    public String word;
 *    public long frequency;
 *  }
 * </pre>
 *
 * <p>The following will produce XML output with a root element named "words" from a PCollection
 * of WordFrequency objects:
 *
 * <pre>{@code
 * p.apply(FileIO.<WordFrequency>write()
 *     .via(XmlIO.sink(WordFrequency.class).withRootElement("words"))
 *     .to(prefixAndShardTemplate("...", DEFAULT_UNWINDOWED_SHARD_TEMPLATE + ".xml"));
 * }</pre>
 *
 * <p>The output will look like:
 *
 * <pre>{@code
 * <words>
 *  <word_count_result>
 *    <word>decreased</word>
 *    <frequency>1</frequency>
 *  </word_count_result>
 *  <word_count_result>
 *    <word>War</word>
 *    <frequency>4</frequency>
 *  </word_count_result>
 *  <word_count_result>
 *    <word>empress'</word>
 *    <frequency>14</frequency>
 *  </word_count_result>
 *  <word_count_result>
 *    <word>stoops</word>
 *    <frequency>6</frequency>
 *  </word_count_result>
 *  ...
 * </words>
 * }</pre>
 *
 * <p>The returned sink is preconfigured with the UTF-8 charset; the root element must still be
 * set with {@link Sink#withRootElement}.
 *
 * @param <T> type of the records written
 * @param recordClass JAXB annotated class of the records
 */
// CHECKSTYLE.ON: JavadocStyle
public static <T> Sink<T> sink(Class<T> recordClass) {
  return new AutoValue_XmlIO_Sink.Builder<T>()
      .setRecordClass(recordClass)
      .setCharset(StandardCharsets.UTF_8.name())
      .build();
}
/**
 * Implementation of {@link #sink}.
 *
 * @param <T> type of the records written
 */
@AutoValue
public abstract static class Sink<T> implements FileIO.Sink<T> {
  abstract Class<T> getRecordClass();

  abstract @Nullable String getRootElement();

  abstract String getCharset();

  abstract Builder<T> toBuilder();

  @AutoValue.Builder
  abstract static class Builder<T> {
    abstract Builder<T> setRecordClass(Class<T> clazz);

    abstract Builder<T> setRootElement(String rootElement);

    abstract Builder<T> setCharset(String charset);

    abstract Sink<T> build();
  }

  /** Sets the name of the root element that encloses all written records. */
  public Sink<T> withRootElement(String rootElement) {
    return toBuilder().setRootElement(rootElement).build();
  }

  /** Sets the charset used for both the root tags and the marshalled records. */
  public Sink<T> withCharset(Charset charset) {
    return toBuilder().setCharset(charset.name()).build();
  }

  // Per-file state, assigned in open(); transient so it is never serialized with the sink.
  private transient OutputStream outputStream;
  private transient Marshaller marshaller;

  /**
   * Creates a fragment-mode, formatted JAXB marshaller for the record class and writes the
   * opening root tag followed by a newline to {@code channel}.
   *
   * @throws IllegalArgumentException if no root element was configured
   * @throws IOException if the marshaller cannot be created or the tag cannot be written
   */
  @Override
  public void open(WritableByteChannel channel) throws IOException {
    checkArgument(getRootElement() != null, ".withRootElement() is required");
    try {
      marshaller = JAXBContext.newInstance(getRecordClass()).createMarshaller();
      marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, Boolean.TRUE);
      // Fragment mode: records are marshalled without a per-record XML declaration.
      marshaller.setProperty(Marshaller.JAXB_FRAGMENT, Boolean.TRUE);
      marshaller.setProperty(Marshaller.JAXB_ENCODING, getCharset());
    } catch (JAXBException e) {
      throw new IOException(e);
    }
    this.outputStream = Channels.newOutputStream(channel);
    outputStream.write(("<" + getRootElement() + ">\n").getBytes(Charset.forName(getCharset())));
  }

  /**
   * Marshals one record to the output stream.
   *
   * @throws IOException wrapping any {@link JAXBException} raised while marshalling
   */
  @Override
  public void write(T element) throws IOException {
    try {
      this.marshaller.marshal(element, outputStream);
    } catch (JAXBException e) {
      throw new IOException(e);
    }
  }

  /**
   * Writes the closing root tag (preceded by a newline) and flushes the stream. The channel is
   * not closed here.
   */
  @Override
  public void flush() throws IOException {
    // The end tag must be "</root>", mirroring the "<root>" written in open().
    outputStream.write(("\n</" + getRootElement() + ">").getBytes(Charset.forName(getCharset())));
    outputStream.flush();
  }
}
}