All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.coders.StringUtf8Coder Maven / Gradle / Ivy

Go to download

Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google cloud resources. This artifact includes the entire Dataflow Java SDK.

There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.coders;

import com.google.cloud.dataflow.sdk.util.ExposedByteArrayOutputStream;
import com.google.cloud.dataflow.sdk.util.StreamUtils;
import com.google.cloud.dataflow.sdk.util.VarInt;
import com.google.common.base.Utf8;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;

import com.fasterxml.jackson.annotation.JsonCreator;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UTFDataFormatException;
import java.nio.charset.StandardCharsets;

/**
 * A {@link Coder} that encodes {@link String Strings} in UTF-8 encoding.
 *
 * <p>When the context is the whole stream, the encoding is just the UTF-8
 * bytes of the string. Otherwise (nested context) the UTF-8 bytes are
 * prefixed with their length, written with {@link VarInt}.
 */
public class StringUtf8Coder extends AtomicCoder<String> {

  /**
   * Returns the shared {@code StringUtf8Coder} instance.
   */
  @JsonCreator
  public static StringUtf8Coder of() {
    return INSTANCE;
  }

  /////////////////////////////////////////////////////////////////////////////

  private static final StringUtf8Coder INSTANCE = new StringUtf8Coder();

  /**
   * Writes {@code value} as a variable-length-encoded byte count followed by
   * its UTF-8 bytes.
   *
   * @param value the string to write; must not be {@code null}
   * @param dos the stream that receives the length prefix and the bytes
   * @throws IOException if writing to {@code dos} fails
   */
  private static void writeString(String value, DataOutputStream dos)
      throws IOException {
    byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
    VarInt.encode(bytes.length, dos);
    dos.write(bytes);
  }

  /**
   * Reads a string previously written by {@link #writeString}: a
   * variable-length-encoded byte count followed by that many UTF-8 bytes.
   *
   * @param dis the stream to read from
   * @return the decoded string
   * @throws CoderException if the decoded length is negative
   * @throws IOException if the stream ends early or reading otherwise fails
   */
  private static String readString(DataInputStream dis) throws IOException {
    int len = VarInt.decodeInt(dis);
    if (len < 0) {
      throw new CoderException("Invalid encoded string length: " + len);
    }
    byte[] bytes = new byte[len];
    // readFully throws EOFException if fewer than len bytes are available.
    dis.readFully(bytes);
    return new String(bytes, StandardCharsets.UTF_8);
  }

  /** Not instantiable from outside; use {@link #of()}. */
  private StringUtf8Coder() {}

  /**
   * Encodes {@code value} onto {@code outStream}.
   *
   * <p>In the whole-stream context only the UTF-8 bytes are written; in a
   * nested context the bytes are preceded by their length.
   *
   * @throws CoderException if {@code value} is {@code null}
   * @throws IOException if writing to {@code outStream} fails
   */
  @Override
  public void encode(String value, OutputStream outStream, Context context)
      throws IOException {
    if (value == null) {
      throw new CoderException("cannot encode a null String");
    }
    if (context.isWholeStream) {
      byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
      if (outStream instanceof ExposedByteArrayOutputStream) {
        // NOTE(review): writeAndOwn presumably lets the stream take over the
        // freshly allocated array without copying - confirm against
        // ExposedByteArrayOutputStream. The array is not used again here.
        ((ExposedByteArrayOutputStream) outStream).writeAndOwn(bytes);
      } else {
        outStream.write(bytes);
      }
    } else {
      writeString(value, new DataOutputStream(outStream));
    }
  }

  /**
   * Decodes a string from {@code inStream}.
   *
   * <p>In the whole-stream context every remaining byte of the stream is
   * interpreted as UTF-8; in a nested context a length prefix is read first
   * and exactly that many bytes are consumed.
   *
   * @throws CoderException if the nested encoding is truncated or carries an
   *     invalid length
   * @throws IOException if reading from {@code inStream} fails
   */
  @Override
  public String decode(InputStream inStream, Context context)
      throws IOException {
    if (context.isWholeStream) {
      byte[] bytes = StreamUtils.getBytes(inStream);
      return new String(bytes, StandardCharsets.UTF_8);
    } else {
      try {
        return readString(new DataInputStream(inStream));
      } catch (EOFException | UTFDataFormatException exn) {
        // These exceptions correspond to decoding problems, so change
        // what kind of exception they're branded as.
        throw new CoderException(exn);
      }
    }
  }

  /**
   * {@inheritDoc}
   *
   * @return {@code true}. This coder is injective.
   */
  @Override
  public boolean consistentWithEquals() {
    return true;
  }

  /**
   * {@inheritDoc}
   *
   * @return the byte size of the UTF-8 encoding of the string or, in a nested context,
   * the byte size of the encoding plus the encoded length prefix.
   */
  @Override
  protected long getEncodedElementByteSize(String value, Context context)
      throws Exception {
    if (value == null) {
      throw new CoderException("cannot encode a null String");
    }
    if (context.isWholeStream) {
      return Utf8.encodedLength(value);
    } else {
      // Run the real nested encoding against a discarding stream and count
      // the bytes, so the size always matches what encode() would produce.
      CountingOutputStream countingStream =
          new CountingOutputStream(ByteStreams.nullOutputStream());
      DataOutputStream stream = new DataOutputStream(countingStream);
      writeString(value, stream);
      return countingStream.getCount();
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy