// org.apache.iceberg.flink.source.split.SerializerHelper (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.flink.source.split;
import java.io.IOException;
import java.io.Serializable;
import java.io.UTFDataFormatException;
import org.apache.flink.core.memory.DataInputDeserializer;
import org.apache.flink.core.memory.DataOutputSerializer;
/**
 * Helper class to serialize and deserialize strings whose encoded form is longer than 65K. The
 * implementation is modeled on {@link DataInputDeserializer#readUTF()} and {@link
 * DataOutputSerializer#writeUTF(String)}.
 *
 * <p>The wire format is a 4-byte length prefix (the number of encoded bytes) followed by the
 * characters in the same 1/2/3-byte encoding those methods use: {@code U+0001..U+007F} take one
 * byte, {@code U+0000} and {@code U+0080..U+07FF} take two bytes, everything else takes three.
 */
class SerializerHelper implements Serializable {

  private SerializerHelper() {}

  /**
   * Similar to {@link DataOutputSerializer#writeUTF(String)}, except that this supports larger
   * payloads: the encoded length is written as a 4-byte int instead of a 2-byte short.
   *
   * <p>Note: this method can be removed once the method that does the same thing within {@link
   * DataOutputSerializer} is released (Flink version 1.20); use that one instead from then on.
   *
   * <p>See FLINK-34228: <a
   * href="https://github.com/apache/flink/pull/24191">https://github.com/apache/flink/pull/24191</a>
   *
   * @param out the output stream to write the string to.
   * @param str the string value to be written.
   * @throws UTFDataFormatException if the encoded form of {@code str} is too long to be written.
   * @throws IOException if an I/O error occurs when writing to the output stream.
   */
  public static void writeLongUTF(DataOutputSerializer out, String str) throws IOException {
    int strlen = str.length();
    // Accumulate in a long so the running total cannot overflow before it is checked.
    long utflen = 0;

    /* use charAt instead of copying String to char array */
    for (int i = 0; i < strlen; i++) {
      utflen += getUTFBytesSize(str.charAt(i));

      // Fail fast instead of scanning the rest of an already oversized string.
      if (utflen > Integer.MAX_VALUE) {
        throw new UTFDataFormatException("Encoded string reached maximum length: " + utflen);
      }
    }

    // Leave room for the 4-byte length prefix written below.
    if (utflen > Integer.MAX_VALUE - 4) {
      throw new UTFDataFormatException("Encoded string is too long: " + utflen);
    }

    out.writeInt((int) utflen);
    writeUTFBytes(out, str, (int) utflen);
  }

  /**
   * Similar to {@link DataInputDeserializer#readUTF()}, except that this supports larger payloads:
   * the encoded length is read as a 4-byte int instead of a 2-byte short.
   *
   * <p>Note: this method can be removed once the equivalent method is released on the Flink side
   * (Flink version 1.20); use that one instead from then on.
   *
   * <p>See FLINK-34228: <a
   * href="https://github.com/apache/flink/pull/24191">https://github.com/apache/flink/pull/24191</a>
   *
   * @param in the input stream to read the string from.
   * @return the string value read from the input stream.
   * @throws UTFDataFormatException if the length prefix is negative or the encoded bytes are
   *     malformed.
   * @throws IOException if an I/O error occurs when reading from the input stream.
   */
  public static String readLongUTF(DataInputDeserializer in) throws IOException {
    int utflen = in.readInt();
    // A negative prefix can only come from corrupt input; report it as a data-format error
    // rather than letting the array allocation fail with NegativeArraySizeException.
    if (utflen < 0) {
      throw new UTFDataFormatException("malformed input: negative encoded length " + utflen);
    }

    byte[] bytearr = new byte[utflen];
    // Every char needs at least one encoded byte, so utflen is an upper bound on the char count.
    char[] chararr = new char[utflen];

    int ch;
    int char2;
    int char3;
    int count = 0;
    int chararrCount = 0;

    in.readFully(bytearr, 0, utflen);

    // Fast path: consume the leading run of single-byte characters.
    while (count < utflen) {
      ch = (int) bytearr[count] & 0xff;
      if (ch > 127) {
        break;
      }
      count++;
      chararr[chararrCount++] = (char) ch;
    }

    // General path: dispatch on the high nibble of the lead byte.
    while (count < utflen) {
      ch = (int) bytearr[count] & 0xff;
      switch (ch >> 4) {
        case 0:
        case 1:
        case 2:
        case 3:
        case 4:
        case 5:
        case 6:
        case 7:
          /* 0xxxxxxx */
          count++;
          chararr[chararrCount++] = (char) ch;
          break;
        case 12:
        case 13:
          /* 110x xxxx 10xx xxxx */
          count += 2;
          if (count > utflen) {
            throw new UTFDataFormatException("malformed input: partial character at end");
          }
          char2 = bytearr[count - 1];
          if ((char2 & 0xC0) != 0x80) {
            throw new UTFDataFormatException("malformed input around byte " + count);
          }
          chararr[chararrCount++] = (char) (((ch & 0x1F) << 6) | (char2 & 0x3F));
          break;
        case 14:
          /* 1110 xxxx 10xx xxxx 10xx xxxx */
          count += 3;
          if (count > utflen) {
            throw new UTFDataFormatException("malformed input: partial character at end");
          }
          char2 = bytearr[count - 2];
          char3 = bytearr[count - 1];
          if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
            throw new UTFDataFormatException("malformed input around byte " + (count - 1));
          }
          chararr[chararrCount++] =
              (char) (((ch & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
          break;
        default:
          /* 10xx xxxx, 1111 xxxx */
          throw new UTFDataFormatException("malformed input around byte " + count);
      }
    }
    // The number of chars produced may be less than utflen
    return new String(chararr, 0, chararrCount);
  }

  /**
   * Returns the number of bytes used to encode a single char: one for {@code U+0001..U+007F},
   * three for anything above {@code U+07FF}, and two otherwise (which includes {@code U+0000}).
   */
  private static int getUTFBytesSize(int ch) {
    if ((ch >= 0x0001) && (ch <= 0x007F)) {
      return 1;
    } else if (ch > 0x07FF) {
      return 3;
    } else {
      return 2;
    }
  }

  /**
   * Encodes {@code str} and writes the resulting bytes to {@code out} in a single call.
   *
   * @param out the output stream to write the encoded bytes to.
   * @param str the string to encode.
   * @param utflen the exact encoded size of {@code str} in bytes, i.e. the sum of {@link
   *     #getUTFBytesSize(int)} over all of its chars.
   * @throws IOException if an I/O error occurs when writing to the output stream.
   */
  private static void writeUTFBytes(DataOutputSerializer out, String str, int utflen)
      throws IOException {
    int strlen = str.length();
    int ch;

    // utflen is the exact encoded size, so the scratch buffer never needs to be any larger.
    byte[] bytearr = new byte[utflen];
    int count = 0;

    // Fast path: copy the leading run of single-byte characters.
    int index;
    for (index = 0; index < strlen; index++) {
      ch = str.charAt(index);
      if (!((ch >= 0x0001) && (ch <= 0x007F))) {
        break;
      }
      bytearr[count++] = (byte) ch;
    }

    // General path: emit one, two or three bytes per char.
    for (; index < strlen; index++) {
      ch = str.charAt(index);
      if ((ch >= 0x0001) && (ch <= 0x007F)) {
        bytearr[count++] = (byte) ch;
      } else if (ch > 0x07FF) {
        bytearr[count++] = (byte) (0xE0 | ((ch >> 12) & 0x0F));
        bytearr[count++] = (byte) (0x80 | ((ch >> 6) & 0x3F));
        bytearr[count++] = (byte) (0x80 | (ch & 0x3F));
      } else {
        bytearr[count++] = (byte) (0xC0 | ((ch >> 6) & 0x1F));
        bytearr[count++] = (byte) (0x80 | (ch & 0x3F));
      }
    }

    out.write(bytearr, 0, count);
  }
}