All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.iceberg.flink.source.split.SerializerHelper Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg.flink.source.split;

import java.io.IOException;
import java.io.Serializable;
import java.io.UTFDataFormatException;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;

/**
 * Helper class to serialize and deserialize strings longer than 65K. The inspiration is mostly
 * taken from the class org.apache.flink.core.memory.DataInputSerializer.readUTF and
 * org.apache.flink.core.memory.DataOutputSerializer.writeUTF.
 */
class SerializerHelper implements Serializable {

  private SerializerHelper() {}

  /**
   * Similar to {@link DataOutputSerializer#writeUTF(String)}. Except this supports larger payloads
   * which is up to max integer value.
   *
   * 

Note: This method can be removed when the method which does similar thing within the {@link * DataOutputSerializer} already which does the same thing, so use that one instead once that is * released on Flink version 1.20. * *

See * FLINK-34228 * https://github.com/apache/flink/pull/24191 * * @param out the output stream to write the string to. * @param str the string value to be written. */ public static void writeLongUTF(DataOutputSerializer out, String str) throws IOException { int strlen = str.length(); long utflen = 0; int ch; /* use charAt instead of copying String to char array */ for (int i = 0; i < strlen; i++) { ch = str.charAt(i); utflen += getUTFBytesSize(ch); if (utflen > Integer.MAX_VALUE) { throw new UTFDataFormatException("Encoded string reached maximum length: " + utflen); } } if (utflen > Integer.MAX_VALUE - 4) { throw new UTFDataFormatException("Encoded string is too long: " + utflen); } out.writeInt((int) utflen); writeUTFBytes(out, str, (int) utflen); } /** * Similar to {@link DataInputDeserializer#readUTF()}. Except this supports larger payloads which * is up to max integer value. * *

Note: This method can be removed when the method which does similar thing within the {@link * DataOutputSerializer} already which does the same thing, so use that one instead once that is * released on Flink version 1.20. * *

See * FLINK-34228 * https://github.com/apache/flink/pull/24191 * * @param in the input stream to read the string from. * @return the string value read from the input stream. * @throws IOException if an I/O error occurs when reading from the input stream. */ public static String readLongUTF(DataInputDeserializer in) throws IOException { int utflen = in.readInt(); byte[] bytearr = new byte[utflen]; char[] chararr = new char[utflen]; int ch; int char2; int char3; int count = 0; int chararrCount = 0; in.readFully(bytearr, 0, utflen); while (count < utflen) { ch = (int) bytearr[count] & 0xff; if (ch > 127) { break; } count++; chararr[chararrCount++] = (char) ch; } while (count < utflen) { ch = (int) bytearr[count] & 0xff; switch (ch >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: /* 0xxxxxxx */ count++; chararr[chararrCount++] = (char) ch; break; case 12: case 13: /* 110x xxxx 10xx xxxx */ count += 2; if (count > utflen) { throw new UTFDataFormatException("malformed input: partial character at end"); } char2 = bytearr[count - 1]; if ((char2 & 0xC0) != 0x80) { throw new UTFDataFormatException("malformed input around byte " + count); } chararr[chararrCount++] = (char) (((ch & 0x1F) << 6) | (char2 & 0x3F)); break; case 14: /* 1110 xxxx 10xx xxxx 10xx xxxx */ count += 3; if (count > utflen) { throw new UTFDataFormatException("malformed input: partial character at end"); } char2 = bytearr[count - 2]; char3 = bytearr[count - 1]; if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { throw new UTFDataFormatException("malformed input around byte " + (count - 1)); } chararr[chararrCount++] = (char) (((ch & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F)); break; default: /* 10xx xxxx, 1111 xxxx */ throw new UTFDataFormatException("malformed input around byte " + count); } } // The number of chars produced may be less than utflen return new String(chararr, 0, chararrCount); } private static int getUTFBytesSize(int ch) { if ((ch >= 0x0001) && (ch <= 0x007F)) { return 1; } else if (ch > 0x07FF) { return 3; } else { return 2; } } private static void writeUTFBytes(DataOutputSerializer out, String str, int utflen) throws IOException { int strlen = str.length(); int ch; int len = Math.max(1024, utflen); byte[] bytearr = new byte[len]; int count = 0; int index; for (index = 0; index < strlen; index++) { ch = str.charAt(index); if (!((ch >= 0x0001) && (ch <= 0x007F))) { break; } bytearr[count++] = (byte) ch; } for (; index < strlen; index++) { ch = str.charAt(index); if ((ch >= 0x0001) && (ch <= 0x007F)) { bytearr[count++] = (byte) ch; } else if (ch > 0x07FF) { bytearr[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F)); bytearr[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F)); bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); } else { bytearr[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F)); bytearr[count++] = (byte) (0x80 | (ch & 0x3F)); } } out.write(bytearr, 0, count); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy