
com.google.cloud.dataflow.sdk.io.CompressedSource Maven / Gradle / Ivy


The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.common.io.ByteStreams;
import com.google.common.primitives.Ints;
import java.io.IOException;
import java.io.PushbackInputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;
import javax.annotation.concurrent.GuardedBy;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

/**
 * A Source that reads from compressed files. A {@code CompressedSource} wraps a delegate
 * {@link FileBasedSource} that is able to read the decompressed file format.
 *
 * <p>For example, use the following to read from a gzip-compressed XML file:
 *
 * <pre>{@code
 * XmlSource<T> mySource = XmlSource.from(...);
 * PCollection<T> collection = p.apply(Read.from(CompressedSource
 *     .from(mySource)
 *     .withDecompression(CompressedSource.CompressionMode.GZIP)));
 * }</pre>
 *
 * <p>Supported compression algorithms are {@link CompressionMode#GZIP} and
 * {@link CompressionMode#BZIP2}. User-defined compression types are supported by implementing
 * {@link DecompressingChannelFactory}.
 *
 * <p>By default, the compression algorithm is selected from those supported in
 * {@link CompressionMode} based on the file name provided to the source, namely
 * {@code ".bz2"} indicates {@link CompressionMode#BZIP2} and {@code ".gz"} indicates
 * {@link CompressionMode#GZIP}. If the file name does not match any of the supported
 * algorithms, the data is assumed to be uncompressed.
 *
 * @param <T> The type to read from the compressed file.
 */
@Experimental(Experimental.Kind.SOURCE_SINK)
public class CompressedSource<T> extends FileBasedSource<T> {

  /**
   * Factory interface for creating channels that decompress the content of an underlying channel.
   */
  public interface DecompressingChannelFactory extends Serializable {
    /**
     * Given a channel, create a channel that decompresses the content read from the channel.
     */
    ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
        throws IOException;
  }

  /**
   * Factory interface for creating channels that decompress the content of an underlying channel,
   * based on both the channel and the file name.
   */
  private interface FileNameBasedDecompressingChannelFactory
      extends DecompressingChannelFactory {
    /**
     * Given a channel, create a channel that decompresses the content read from the channel.
     */
    ReadableByteChannel createDecompressingChannel(String fileName, ReadableByteChannel channel)
        throws IOException;

    /**
     * Given a file name, returns true if the file name matches any supported compression
     * scheme.
     */
    boolean isCompressed(String fileName);
  }

  /**
   * Default compression types supported by the {@code CompressedSource}.
   */
  public enum CompressionMode implements DecompressingChannelFactory {
    /**
     * Reads a byte channel assuming it is compressed with gzip.
     */
    GZIP {
      @Override
      public boolean matches(String fileName) {
        return fileName.toLowerCase().endsWith(".gz");
      }

      @Override
      public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
          throws IOException {
        // Determine if the input stream is gzipped. The input stream returned from the
        // GCS connector may already be decompressed; GCS does this based on the
        // content-encoding property.
        PushbackInputStream stream = new PushbackInputStream(Channels.newInputStream(channel), 2);
        byte[] headerBytes = new byte[2];
        int bytesRead = ByteStreams.read(
            stream /* source */, headerBytes /* dest */, 0 /* offset */, 2 /* len */);
        stream.unread(headerBytes, 0, bytesRead);
        if (bytesRead >= 2) {
          byte zero = 0x00;
          int header = Ints.fromBytes(zero, zero, headerBytes[1], headerBytes[0]);
          if (header == GZIPInputStream.GZIP_MAGIC) {
            return Channels.newChannel(new GzipCompressorInputStream(stream, true));
          }
        }
        return Channels.newChannel(stream);
      }
    },

    /**
     * Reads a byte channel assuming it is compressed with bzip2.
     */
    BZIP2 {
      @Override
      public boolean matches(String fileName) {
        return fileName.toLowerCase().endsWith(".bz2");
      }

      @Override
      public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
          throws IOException {
        return Channels.newChannel(
            new BZip2CompressorInputStream(Channels.newInputStream(channel)));
      }
    };

    /**
     * Returns {@code true} if the given file name implies that the contents are compressed
     * according to the compression embodied by this factory.
     */
    public abstract boolean matches(String fileName);

    @Override
    public abstract ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
        throws IOException;
  }

  /**
   * Reads a byte channel detecting compression according to the file name. If the file name
   * does not match any known {@link CompressionMode}, the data is presumed to be uncompressed.
   */
  private static class DecompressAccordingToFilename
      implements FileNameBasedDecompressingChannelFactory {

    @Override
    public ReadableByteChannel createDecompressingChannel(
        String fileName, ReadableByteChannel channel) throws IOException {
      for (CompressionMode type : CompressionMode.values()) {
        if (type.matches(fileName)) {
          return type.createDecompressingChannel(channel);
        }
      }
      // Uncompressed
      return channel;
    }

    @Override
    public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel) {
      throw new UnsupportedOperationException(
          String.format("%s does not support createDecompressingChannel(%s) but only"
                  + " createDecompressingChannel(%s,%s)",
              getClass().getSimpleName(),
              String.class.getSimpleName(),
              ReadableByteChannel.class.getSimpleName(),
              ReadableByteChannel.class.getSimpleName()));
    }

    @Override
    public boolean isCompressed(String fileName) {
      for (CompressionMode type : CompressionMode.values()) {
        if (type.matches(fileName)) {
          return true;
        }
      }
      return false;
    }
  }

  private final FileBasedSource<T> sourceDelegate;
  private final DecompressingChannelFactory channelFactory;

  /**
   * Creates a {@link Read} transform that reads from the underlying
   * {@link FileBasedSource} {@code sourceDelegate} after decompressing it with a {@link
   * DecompressingChannelFactory}.
   */
  public static <T> Read.Bounded<T> readFromSource(
      FileBasedSource<T> sourceDelegate, DecompressingChannelFactory channelFactory) {
    return Read.from(new CompressedSource<>(sourceDelegate, channelFactory));
  }

  /**
   * Creates a {@code CompressedSource} from an underlying {@code FileBasedSource}. The type
   * of compression used will be based on the file name extension unless explicitly
   * configured via {@link CompressedSource#withDecompression}.
   */
  public static <T> CompressedSource<T> from(FileBasedSource<T> sourceDelegate) {
    return new CompressedSource<>(sourceDelegate, new DecompressAccordingToFilename());
  }

  /**
   * Returns a {@code CompressedSource} that is like this one but will decompress its underlying
   * file with the given {@link DecompressingChannelFactory}.
   */
  public CompressedSource<T> withDecompression(DecompressingChannelFactory channelFactory) {
    return new CompressedSource<>(this.sourceDelegate, channelFactory);
  }

  /**
   * Creates a {@code CompressedSource} from a delegate file based source and a decompressing
   * channel factory.
   */
  private CompressedSource(
      FileBasedSource<T> sourceDelegate, DecompressingChannelFactory channelFactory) {
    super(sourceDelegate.getFileOrPatternSpecProvider(), Long.MAX_VALUE);
    this.sourceDelegate = sourceDelegate;
    this.channelFactory = channelFactory;
  }

  /**
   * Creates a {@code CompressedSource} for an individual file. Used by {@link
   * CompressedSource#createForSubrangeOfFile}.
   */
  private CompressedSource(FileBasedSource<T> sourceDelegate,
      DecompressingChannelFactory channelFactory, String filePatternOrSpec, long minBundleSize,
      long startOffset, long endOffset) {
    super(filePatternOrSpec, minBundleSize, startOffset, endOffset);
    this.sourceDelegate = sourceDelegate;
    this.channelFactory = channelFactory;
    try {
      checkArgument(
          isSplittable() || startOffset == 0,
          "CompressedSources must start reading at offset 0. Requested offset: " + startOffset);
    } catch (Exception e) {
      // Propagate the cause so failures in isSplittable() are not silently swallowed.
      throw new RuntimeException(
          "Error checking whether source " + sourceDelegate + " is splittable", e);
    }
  }

  /**
   * Validates that the delegate source is a valid source and that the channel factory is not null.
   */
  @Override
  public void validate() {
    super.validate();
    checkNotNull(sourceDelegate);
    sourceDelegate.validate();
    checkNotNull(channelFactory);
  }

  /**
   * Creates a {@code CompressedSource} for a subrange of a file. Called by superclass to create a
   * source for a single file.
   */
  @Override
  protected FileBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end) {
    return new CompressedSource<>(sourceDelegate.createForSubrangeOfFile(fileName, start, end),
        channelFactory, fileName, sourceDelegate.getMinBundleSize(), start, end);
  }

  /**
   * Determines whether a single file represented by this source is splittable. Returns true
   * if we are using the default decompression factory and it determines from the requested file
   * name that the file is not compressed.
   */
  @Override
  protected final boolean isSplittable() throws Exception {
    if (channelFactory instanceof FileNameBasedDecompressingChannelFactory) {
      FileNameBasedDecompressingChannelFactory fileNameBasedChannelFactory =
          (FileNameBasedDecompressingChannelFactory) channelFactory;
      return !fileNameBasedChannelFactory.isCompressed(getFileOrPatternSpec());
    }
    return false;
  }

  /**
   * Creates a {@code FileBasedReader} to read a single file.
   *
   * <p>Uses the delegate source to create a single file reader for the delegate source. When the
   * default file-name-based decompression channel factory determines that the file is not
   * compressed, the delegate reader is returned unwrapped, which allows the source to be split.
   */
  @Override
  protected final FileBasedReader<T> createSingleFileReader(PipelineOptions options) {
    if (channelFactory instanceof FileNameBasedDecompressingChannelFactory) {
      FileNameBasedDecompressingChannelFactory fileNameBasedChannelFactory =
          (FileNameBasedDecompressingChannelFactory) channelFactory;
      if (!fileNameBasedChannelFactory.isCompressed(getFileOrPatternSpec())) {
        return sourceDelegate.createSingleFileReader(options);
      }
    }
    return new CompressedReader<>(
        this, sourceDelegate.createSingleFileReader(options));
  }

  /**
   * Returns whether the delegate source produces sorted keys.
   */
  @Override
  public final boolean producesSortedKeys(PipelineOptions options) throws Exception {
    return sourceDelegate.producesSortedKeys(options);
  }

  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    // We explicitly do not register base-class data; instead we use the delegate inner source.
    builder
        .include(sourceDelegate)
        .add(DisplayData.item("source", sourceDelegate.getClass())
            .withLabel("Read Source"));

    if (channelFactory instanceof Enum) {
      // GZIP and BZIP2 are implemented as enum constants with bodies, which are anonymous
      // classes, so use the .name() value instead of the class.
      builder.add(DisplayData.item("compressionMode", ((Enum<?>) channelFactory).name())
          .withLabel("Compression Mode"));
    } else {
      builder.add(DisplayData.item("compressionMode", channelFactory.getClass())
          .withLabel("Compression Mode"));
    }
  }

  /**
   * Returns the delegate source's default output coder.
   */
  @Override
  public final Coder<T> getDefaultOutputCoder() {
    return sourceDelegate.getDefaultOutputCoder();
  }

  public final DecompressingChannelFactory getChannelFactory() {
    return channelFactory;
  }

  /**
   * Reader for a {@link CompressedSource}. Decompresses its input and uses a delegate
   * reader to read elements from the decompressed input.
   *
   * @param <T> The type of records read from the source.
   */
  public static class CompressedReader<T> extends FileBasedReader<T> {

    private final FileBasedReader<T> readerDelegate;
    private final CompressedSource<T> source;
    private final Object progressLock = new Object();

    @GuardedBy("progressLock")
    private int numRecordsRead;

    @GuardedBy("progressLock")
    private CountingChannel channel;

    /**
     * Create a {@code CompressedReader} from a {@code CompressedSource} and delegate reader.
     */
    public CompressedReader(CompressedSource<T> source, FileBasedReader<T> readerDelegate) {
      super(source);
      this.source = source;
      this.readerDelegate = readerDelegate;
    }

    /**
     * Gets the current record from the delegate reader.
     */
    @Override
    public T getCurrent() throws NoSuchElementException {
      return readerDelegate.getCurrent();
    }

    @Override
    public final long getSplitPointsConsumed() {
      synchronized (progressLock) {
        return (isDone() && numRecordsRead > 0) ? 1 : 0;
      }
    }

    @Override
    public final long getSplitPointsRemaining() {
      return isDone() ? 0 : 1;
    }

    /**
     * Returns true only for the first record; compressed sources cannot be split.
     */
    @Override
    protected final boolean isAtSplitPoint() {
      // We have to return true for the first record, but not for the state before reading it,
      // and not for the state after reading any other record. Hence == rather than >= or <=.
      // This is required because FileBasedReader is intended for readers that can read a range
      // of offsets in a file and where the range can be split in parts. CompressedReader,
      // however, is a degenerate case because it cannot be split, but it has to satisfy the
      // semantics of offsets and split points anyway.
      synchronized (progressLock) {
        return numRecordsRead == 1;
      }
    }

    private static class CountingChannel implements ReadableByteChannel {
      private long count;
      private final ReadableByteChannel inner;

      public CountingChannel(ReadableByteChannel inner, long count) {
        this.inner = inner;
        this.count = count;
      }

      public long getCount() {
        return count;
      }

      @Override
      public int read(ByteBuffer dst) throws IOException {
        int bytes = inner.read(dst);
        if (bytes > 0) { // Avoid the -1 returned at EOF.
          count += bytes;
        }
        return bytes;
      }

      @Override
      public boolean isOpen() {
        return inner.isOpen();
      }

      @Override
      public void close() throws IOException {
        inner.close();
      }
    }

    /**
     * Creates a decompressing channel from the input channel and passes it to its delegate
     * reader's {@link FileBasedReader#startReading(ReadableByteChannel)}.
     */
    @Override
    protected final void startReading(ReadableByteChannel channel) throws IOException {
      synchronized (progressLock) {
        this.channel = new CountingChannel(channel, getCurrentSource().getStartOffset());
        channel = this.channel;
      }

      if (source.getChannelFactory() instanceof FileNameBasedDecompressingChannelFactory) {
        FileNameBasedDecompressingChannelFactory channelFactory =
            (FileNameBasedDecompressingChannelFactory) source.getChannelFactory();
        readerDelegate.startReading(channelFactory.createDecompressingChannel(
            getCurrentSource().getFileOrPatternSpec(), channel));
      } else {
        readerDelegate.startReading(source.getChannelFactory().createDecompressingChannel(
            channel));
      }
    }

    /**
     * Reads the next record via the delegate reader.
     */
    @Override
    protected final boolean readNextRecord() throws IOException {
      if (!readerDelegate.readNextRecord()) {
        return false;
      }
      synchronized (progressLock) {
        ++numRecordsRead;
      }
      return true;
    }

    // Unsplittable: returns the offset in the input stream that has been read so far. These
    // positions are likely to be coarse-grained (in the event of buffering) and over-estimates
    // (because they reflect the number of bytes read to produce an element, not its start), but
    // both of these provide better data than, e.g., reporting the start of the file.
    @Override
    protected final long getCurrentOffset() throws NoSuchElementException {
      synchronized (progressLock) {
        if (numRecordsRead <= 1) {
          // Since the first record is at a split point, it should start at the beginning of the
          // file. This avoids the bad case where the decompressor read the entire file, which
          // would cause the file to be treated as empty when returning channel.getCount(), as it
          // is outside the valid range.
          return 0;
        }
        return channel.getCount();
      }
    }
  }
}
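
As the class Javadoc notes, user-defined compression types are supported by implementing DecompressingChannelFactory. Below is a minimal sketch of such a factory for zlib/DEFLATE input, using only the JDK's java.util.zip; the class name DeflateChannelFactory and the pipeline wiring are illustrative assumptions, not part of this artifact.

import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.util.zip.InflaterInputStream;
import com.google.cloud.dataflow.sdk.io.CompressedSource.DecompressingChannelFactory;

/** Hypothetical user-defined factory that inflates zlib/DEFLATE-compressed files. */
public class DeflateChannelFactory implements DecompressingChannelFactory {
  @Override
  public ReadableByteChannel createDecompressingChannel(ReadableByteChannel channel)
      throws IOException {
    // Bridge the raw channel to a stream, wrap it in an InflaterInputStream to
    // decompress zlib-wrapped DEFLATE data, and re-expose it as a channel.
    return Channels.newChannel(
        new InflaterInputStream(Channels.newInputStream(channel)));
  }
}

It would then be plugged in the same way the built-in modes are, e.g. (mySource stands in for any FileBasedSource<T> that reads the decompressed format):

PCollection<T> output = p.apply(Read.from(
    CompressedSource.from(mySource).withDecompression(new DeflateChannelFactory())));

Note that the factory must be serializable (DecompressingChannelFactory extends Serializable); a stateless class like this sketch satisfies that trivially.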




