All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.facebook.presto.i18n.functions.I18nMyanmarFunctions Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.i18n.functions;

import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import com.google.myanmartools.TransliterateZ2U;
import com.google.myanmartools.ZawgyiDetector;
import io.airlift.slice.Slice;

import static io.airlift.slice.Slices.utf8Slice;

public final class I18nMyanmarFunctions
{
    /**
     * ZawgyiDetector returns a confidence score that a given string is Zawgyi encoded.
     * This constant defines the default confidence at which a string is treated as Zawgyi.
     */
    private static final double ZAWGYI_PROBABILITY_THRESHOLD = 0.9;

    private static final ZawgyiDetector detector = new ZawgyiDetector();
    private static final TransliterateZ2U z2uTransliterator = new TransliterateZ2U("Zawgyi to Unicode");

    private I18nMyanmarFunctions() {}

    @Description("labels whether input strings use Unicode or Zawgyi font encoding")
    @ScalarFunction
    @SqlType(StandardTypes.VARCHAR)
    public static Slice myanmarFontEncoding(@SqlType(StandardTypes.VARCHAR) Slice slice)
    {
        if (detector.getZawgyiProbability(slice.toStringUtf8()) > ZAWGYI_PROBABILITY_THRESHOLD) {
            return utf8Slice("zawgyi");
        }
        else {
            return utf8Slice("unicode");
        }
    }

    @Description("transforms strings using Myanmar characters to a normalized Unicode form")
    @ScalarFunction
    @SqlType(StandardTypes.VARCHAR)
    public static Slice myanmarNormalizeUnicode(@SqlType(StandardTypes.VARCHAR) Slice slice)
    {
        String[] rawInputPieces = slice.toStringUtf8().split("\n");
        String[] normalizedPieces = new String[rawInputPieces.length];
        for (int i = 0; i < rawInputPieces.length; i++) {
            if (detector.getZawgyiProbability(rawInputPieces[i]) > ZAWGYI_PROBABILITY_THRESHOLD) {
                normalizedPieces[i] = z2uTransliterator.convert(rawInputPieces[i]);
            }
            else {
                normalizedPieces[i] = rawInputPieces[i];
            }
        }
        return utf8Slice(String.join("\n", normalizedPieces));
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy