All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.coders.Coder Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.coders;

import com.google.cloud.dataflow.sdk.annotations.Experimental;
import com.google.cloud.dataflow.sdk.annotations.Experimental.Kind;
import com.google.cloud.dataflow.sdk.util.CloudObject;
import com.google.cloud.dataflow.sdk.util.common.ElementByteSizeObserver;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import javax.annotation.Nullable;

/**
 * A {@code Coder} defines how to encode and decode values of type {@code T} into byte streams.
 *
 * 

All methods of a {@link Coder} are required to be thread safe. * *

{@link Coder} instances are serialized during job creation and deserialized * before use, via JSON serialization. * *

See {@link SerializableCoder} for an example of a {@code Coder} that adds a custom field to * the {@link Coder} serialization. It provides a constructor annotated with * {@link com.fasterxml.jackson.annotation.JsonCreator}, which is a factory method used when * deserializing a {@link Coder} instance. * *

See {@link KvCoder} for an example of a nested {@code Coder} type. * *

The binary format of a {@link Coder} is identified by {@link #getEncodingId()}; be sure to * understand the requirements for evolving coder formats. * * @param the type of the values being transcoded */ public interface Coder extends Serializable { /** The context in which encoding or decoding is being done. */ public static class Context { /** * The outer context. The value being encoded or decoded takes * up the remainder of the whole record/stream contents. */ public static final Context OUTER = new Context(true); /** * The nested context. The value being encoded or decoded is * (potentially) a part of a larger record/stream contents, and * may have other parts encoded or decoded after it. */ public static final Context NESTED = new Context(false); /** * Whether the encoded or decoded value fills the remainder of the * output or input (resp.) record/stream contents. If so, then * the size of the decoded value can be determined from the * remaining size of the record/stream contents, and so explicit * lengths aren't required. */ public final boolean isWholeStream; public Context(boolean isWholeStream) { this.isWholeStream = isWholeStream; } public Context nested() { return NESTED; } } /** * Encodes the given value of type {@code T} onto the given output stream * in the given context. * * @throws IOException if writing to the {@code OutputStream} fails * for some reason * @throws CoderException if the value could not be encoded for some reason */ public void encode(T value, OutputStream outStream, Context context) throws CoderException, IOException; /** * Decodes a value of type {@code T} from the given input stream in * the given context. Returns the decoded value. 
* * @throws IOException if reading from the {@code InputStream} fails * for some reason * @throws CoderException if the value could not be decoded for some reason */ public T decode(InputStream inStream, Context context) throws CoderException, IOException; /** * If this is a {@code Coder} for a parameterized type, returns the * list of {@code Coder}s being used for each of the parameters, or * returns {@code null} if this cannot be done or this is not a * parameterized type. */ public List> getCoderArguments(); /** * Returns the {@link CloudObject} that represents this {@code Coder}. */ public CloudObject asCloudObject(); /** * Throw {@link NonDeterministicException} if the coding is not deterministic. * *

In order for a {@code Coder} to be considered deterministic, * the following must be true: *

    *
  • two values that compare as equal (via {@code Object.equals()} * or {@code Comparable.compareTo()}, if supported) have the same * encoding. *
  • the {@code Coder} always produces a canonical encoding, which is the * same for an instance of an object even if produced on different * computers at different times. *
* * @throws Coder.NonDeterministicException if this coder is not deterministic. */ public void verifyDeterministic() throws Coder.NonDeterministicException; /** * Returns true if the encoded bytes of two objects are * equal only when they are also equal according to {@code Object.equals()}. * (and also implements a compatible {@code Object.hasCode()}) * *

This most notably false for arrays. It will generally * be false when {@code Object.equals()} compares object identity, * rather than performing a semantic/structural comparison. */ public boolean consistentWithEquals(); /** * Returns an object with an {@code Object.equals()} method * that represents structural equality on the argument. * (and also implements a compatible {@code Object.hashCode()}). * *

For any two objects of type T, if their encoded bytes * are the same, then their structural values are equal * according to {@code Object.equals()}. * *

Most notably, the structural value for an array coder * should perform a structural comparison of the contents of * the arrays, rather than the default behavior of * comparing according to object identity. * *

See also {@link #consistentWithEquals()}. */ public Object structuralValue(T value) throws Exception; /** * Returns whether {@link #registerByteSizeObserver} cheap enough to * call for every element, that is, if this {@code Coder} can * calculate the byte size of the element to be coded in roughly * constant time (or lazily). * *

Not intended to be called by user code, but instead by * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner} * implementations. */ public boolean isRegisterByteSizeObserverCheap(T value, Context context); /** * Notifies the {@code ElementByteSizeObserver} about the byte size * of the encoded value using this {@code Coder}. * *

Not intended to be called by user code, but instead by * {@link com.google.cloud.dataflow.sdk.runners.PipelineRunner} * implementations. */ public void registerByteSizeObserver( T value, ElementByteSizeObserver observer, Context context) throws Exception; /** * An identifier for the binary format written by {@link #encode}. * *

This value, along with the fully qualified class name, forms an identifier for the * binary format of this coder. Whenever this value changes, the new encoding is considered * incompatible with the prior format: It is presumed that the prior version of the coder will * be unable to correctly read the new format and the new version of the coder will be unable to * correctly read the old format. * *

If the format is changed in a backwards-compatible way (the Coder can still accept data from * the prior format), such as by adding optional fields to a Protocol Buffer or Avro definition, * and you want Dataflow to understand that the new coder is compatible with the prior coder, * this value must remain unchanged. It is then the responsibility of {@link #decode} to correctly * read data from the prior format. */ @Experimental(Kind.CODER_ENCODING_ID) public String getEncodingId(); /** * A collection of encodings supported by {@link #decode} in addition to the encoding * from {@link #getEncodingId()} (which is assumed supported). * *

This information is not currently used for any purpose. It is descriptive only, * and this method is subject to change. * * @see #getEncodingId() */ @Experimental(Kind.CODER_ENCODING_ID) public Collection getAllowedEncodings(); /** * Exception thrown by {@link Coder#verifyDeterministic()} if the encoding is * not deterministic. */ public static class NonDeterministicException extends Throwable { private static final long serialVersionUID = 0; private Coder coder; private List reasons; public NonDeterministicException( Coder coder, String reason, @Nullable NonDeterministicException e) { this(coder, Arrays.asList(reason), e); } public NonDeterministicException(Coder coder, String reason) { this(coder, Arrays.asList(reason), null); } public NonDeterministicException(Coder coder, List reasons) { this(coder, reasons, null); } public NonDeterministicException( Coder coder, List reasons, @Nullable NonDeterministicException cause) { super(cause); Preconditions.checkArgument(reasons.size() > 0, "Reasons must not be empty."); this.reasons = reasons; this.coder = coder; } public Iterable getReasons() { return reasons; } @Override public String getMessage() { return String.format("%s is not deterministic because:\n %s", coder, Joiner.on("\n ").join(reasons)); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy