All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.camel.dataformat.bindy.UnicodeHelper Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.camel.dataformat.bindy;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import com.ibm.icu.text.BreakIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class replicates the essential parts of the {@link String} class in order to work properly with Unicode
 * characters in the presence of UTF-16. All indices and lengths are expressed in "chars" as defined by the chosen
 * {@link Method} (code points or graphemes) instead of UTF-16 code units. For the contract of each operation please
 * see the {@link String} method with the same signature.
 * <p>
 * The wrapped string and the method are never changed after construction; the segment boundaries are computed lazily
 * on first use and cached.
 */
public class UnicodeHelper implements Serializable {
    /**
     * Defines how the length of a string is determined, i.e. how "chars" are counted.
     */
    public enum Method {
        /**
         * One "char" is one Unicode code point, which is the standard case.
         */
        CODEPOINTS,

        /**
         * One "char" is one grapheme.
         */
        GRAPHEME;
    }

    private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelper.class);

    /** The wrapped string. */
    private String input;

    /**
     * Lazily computed segment boundaries as UTF-16 offsets into {@link #input}: entry {@code i} is the offset where
     * "char" {@code i} starts, and the last entry is {@code input.length()}. {@code null} until first use.
     */
    private List<Integer> splitted;

    /** The way "chars" are counted. */
    private Method method;

    /**
     * Create instance.
     *
     * @param input  String that is to be wrapped.
     * @param method Method that is used to determine the "chars" of the string.
     */
    public UnicodeHelper(final String input, final Method method) {
        this.input = input;
        this.method = method;
        this.splitted = null;
    }

    /**
     * For Serialization only!
     */
    protected UnicodeHelper() {
        // Empty
    }

    /**
     * @return Returns the method used to determine the string length.
     */
    public Method getMethod() {
        return method;
    }

    /**
     * Returns the tail of the wrapped string starting at the given "char" index.
     *
     * @param  beginIndex                index of the first "char" to include (in units of {@link #getMethod()}).
     * @return                           the substring from {@code beginIndex} to the end of the string.
     * @throws IndexOutOfBoundsException if {@code beginIndex} is negative or larger than {@link #length()}.
     * @see                              String#substring(int)
     */
    public String substring(final int beginIndex) {
        split();

        final int beginChar = splitted.get(beginIndex);
        return input.substring(beginChar);
    }

    /**
     * Returns the part of the wrapped string between the two given "char" indices.
     *
     * @param  beginIndex                index of the first "char" to include (in units of {@link #getMethod()}).
     * @param  endIndex                  index after the last "char" to include.
     * @return                           the substring covering {@code [beginIndex, endIndex[}.
     * @throws IndexOutOfBoundsException if an index is outside {@code [0, length()]} or {@code beginIndex} is larger
     *                                   than {@code endIndex}.
     * @see                              String#substring(int, int)
     */
    public String substring(final int beginIndex, final int endIndex) {
        split();

        final int beginChar = splitted.get(beginIndex);
        final int endChar = splitted.get(endIndex);
        return input.substring(beginChar, endChar);
    }

    /**
     * @return the number of "chars" (code points or graphemes, depending on {@link #getMethod()}) of the string.
     * @see    String#length()
     */
    public int length() {
        split();

        // The boundary list holds one entry per segment start plus the end offset.
        return splitted.size() - 1;
    }

    /**
     * @param  str the string to search for.
     * @return     the "char" index of the first occurrence of {@code str}, or {@code -1} if there is none.
     * @see        String#indexOf(String)
     */
    public int indexOf(final String str) {
        return indexOf(str, 0);
    }

    /**
     * Searches for the first occurrence of {@code str} that starts at a "char" boundary at or after
     * {@code fromIndex}. A candidate only matches if it covers exactly as many "chars" as {@code str} has.
     *
     * @param  str       the string to search for.
     * @param  fromIndex the "char" index to start the search from; as with {@link String#indexOf(String, int)}, a
     *                   negative value has the same effect as {@code 0}.
     * @return           the "char" index of the first occurrence, or {@code -1} if there is none.
     * @see              String#indexOf(String, int)
     */
    public int indexOf(final String str, final int fromIndex) {
        split();

        final int len = new UnicodeHelper(str, method).length();
        // Last start index at which a match of len "chars" still fits (inclusive), so that a match at the very
        // end of the string is found as well.
        final int lastStart = length() - len;

        for (int index = Math.max(fromIndex, 0); index <= lastStart; index++) {
            if (str.equals(input.substring(splitted.get(index), splitted.get(index + len)))) {
                return index;
            }
        }

        return -1;
    }

    /**
     * Computes the segment boundaries on first use according to the configured method; later calls are no-ops.
     */
    private void split() {
        if (this.splitted != null) {
            return;
        }

        if (method.equals(Method.CODEPOINTS)) {
            splitCodepoints();

        } else /* (method.equals(Method.GRAPHEME)) */ {
            splitGrapheme();
        }

        LOG.debug("\"{}\" is splitted into {} ({} {}).", input, splitted, splitted.size() - 1, method);
        if (LOG.isTraceEnabled()) {
            // size() - 1 segments exist, since the last list entry is the end offset of the final segment.
            for (int i = 0; i < splitted.size() - 1; i++) {
                LOG.trace("segment [{},{}[=\"{}\".", splitted.get(i), splitted.get(i + 1),
                        input.substring(splitted.get(i), splitted.get(i + 1)));
            }
        }
    }

    /**
     * Records the UTF-16 offset of every code point of the input, followed by the input length.
     */
    private void splitCodepoints() {
        final List<Integer> result = new ArrayList<>();

        int i = 0;
        final int len = input.length();
        while (i < len) {
            result.add(i);
            // Supplementary code points occupy two UTF-16 code units, all others one.
            i += Character.charCount(input.codePointAt(i));
        }
        result.add(len);

        this.splitted = result;
    }

    /**
     * Records the UTF-16 offset of every grapheme boundary of the input, including the start and the end offset.
     */
    private void splitGrapheme() {
        final List<Integer> result = new ArrayList<>();

        //
        // Caution: The BreakIterator of the ICU library (com.ibm.icu.text.BreakIterator; see dependencies) is used
        //          here, since the Java built-in one cannot handle modern Unicode (emoji with gender, skin tone,
        //          etc.) correctly.
        //
        final BreakIterator bit = BreakIterator.getCharacterInstance();
        bit.setText(input);

        result.add(bit.first());
        for (int end = bit.next(); end != BreakIterator.DONE; end = bit.next()) {
            result.add(end);
        }
        this.splitted = result;
    }

    @Override
    public String toString() {
        return "UnicodeHelper [input=" + input + ", splitted=" + splitted + ", method=" + method + "]";
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy