org.flywaydb.core.internal.resource.filesystem.EncodingDetector Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of flyway-core Show documentation
There is a newer version: 10.15.2
/*
 * Copyright (C) Red Gate Software Ltd 2010-2024
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.flywaydb.core.internal.resource.filesystem;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

public class EncodingDetector {
    private static final int UTF16BigEndianHeaderByte1 = 254;
    private static final int UTF16BigEndianHeaderByte2 = 255;

    private static final int UTF16LittleEndianHeaderByte1 = 255;
    private static final int UTF16LittleEndianHeaderByte2 = 254;

    private static final String[] detectableCharsets = {
            StandardCharsets.UTF_8.toString(),
            StandardCharsets.ISO_8859_1.toString(),
            StandardCharsets.UTF_16.toString()
    };

    public static Charset detectFileEncoding(Path path) throws FlywayEncodingDetectionException {
        try {
            byte[] buffer = readAllBytesFromPath(path);
            Charset charset = detectCharset(buffer);

            if (charset == null) {
                throw new FlywayEncodingDetectionException("Could not detect charset for " + path);
            }

            return charset;
        } catch (Exception e) {
            throw new FlywayEncodingDetectionException("Could not detect encoding for " + path, e);
        }
    }

    private static byte[] readAllBytesFromPath(Path path) {
        try {
            return Files.readAllBytes(path.toAbsolutePath());
        } catch (Exception e) {
            throw new FlywayEncodingDetectionException("Could not detect encoding for " +
                                                               path + " as the file could not be read", e);
        }
    }

    private static Charset detectCharset(byte[] buffer) {
        Charset charset = readCharsetHeader(buffer);

        if (charset != null) {
            return charset;
        }

        for (String charsetName : detectableCharsets) {
            Charset detectingCharset = Charset.forName(charsetName);

            if (canCorrectlyDecodeBufferWithNamedCharset(buffer, detectingCharset)) {
                charset = detectingCharset;
                break;
            }
        }

        if (charset != StandardCharsets.UTF_16 &&
                charset != StandardCharsets.UTF_16BE &&
                charset != StandardCharsets.UTF_16LE &&
                isLikelyToBeUTF16(buffer)) {
            charset = StandardCharsets.UTF_16;
        }

        return charset;
    }

    private static Charset readCharsetHeader(byte[] buffer) {
        if (buffer.length < 2) {
            // We assume two header bytes
            return null;
        }

        if (buffer.length % 2 != 0) {
            // UTF16 files cannot be an odd length
            return null;
        }

        int headerByte1 = Byte.toUnsignedInt(buffer[0]);
        int headerByte2 = Byte.toUnsignedInt(buffer[1]);

        if (UTF16BigEndianHeaderByte1 == headerByte1 && UTF16BigEndianHeaderByte2 == headerByte2) {
            return StandardCharsets.UTF_16BE;
        }

        if (UTF16LittleEndianHeaderByte1 == headerByte1 && UTF16LittleEndianHeaderByte2 == headerByte2) {
            return StandardCharsets.UTF_16LE;
        }

        return null;
    }

    /**
     * UTF16 documents tend to have most odd indexed bytes be a 0 byte. Non UTF16 this is very rare.
     * We use this fact to ascertain if a BOMLess file is infact UTF16.
     */
    private static boolean isLikelyToBeUTF16(byte[] buffer) {
        if (buffer.length % 2 != 0) {
            // UTF16 files cannot be an odd length
            return false;
        }

        int matchCount = 0;

        for (int i = 1; i < buffer.length; i = i + 2) {
            boolean oddIndexedByteIsZero = (Byte.toUnsignedInt(buffer[i]) == 0);

            if (oddIndexedByteIsZero) {
                matchCount++;
            } else {
                matchCount--;
            }
        }

        // if matchCount is greater than 0 (more than half of odd indexed bytes are zero) we can assume this is
        // BOMLess UTF16. A lower threshold may be applicable.
        return matchCount > 0;
    }

    private static boolean canCorrectlyDecodeBufferWithNamedCharset(byte[] buffer, Charset charset) {
        try {
            charset.newDecoder().decode(ByteBuffer.wrap(buffer));
            return true;
        } catch (CharacterCodingException e) {
            return false;
        }
    }
}