net.sf.sfac.string.StringUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sfac-utils Show documentation
This project is the model side of the Swing Framework and Components (SFaC). If you're doing a clean separation between the model (or business) and view (or GUI or rendering) parts of your application (as in the MVC pattern), then the only classes of SFaC your model can access are in this project. On the other hand, the classes in the sfac-core project are GUI-specific and should not be known by your model.
The newest version!
/*-------------------------------------------------------------------------
 Copyright 2009 Olivier Berlanger

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -------------------------------------------------------------------------*/
package net.sf.sfac.string;


import java.util.StringTokenizer;


/**
 * String manipulation/comparison utility class.
 * 
 * @author Olivier Berlanger
 */
public abstract class StringUtils {


    // @formatter:off
    /** 
     * Character conversion map to characters without diacritic marks (unicode range = 00C0 -> 021F).
     * <p>
     * The entry at index <code>i</code> is the replacement for the character with code <code>0x00C0 + i</code>; each row
     * below holds 16 consecutive code points (the row's first code point is given in the trailing comment). Characters
     * that have no plain equivalent (ligatures, mathematical signs, ...) are mapped to themselves.
     * <p>
     * Every replacement keeps the case of the source character: U+00F0 (small eth) maps to 'd', like its uppercase
     * U+00D0 maps to 'D', and U+014A (capital eng) maps to 'N'.
     */
    private static final char[] CHAR_MAP = {
        'A',     'A',     'A',     'A',     'A',     'A',     '\u00C6','C',     'E',     'E',     'E',     'E',     'I',     'I',     'I',     'I',      // 0x00Cx
        'D',     'N',     'O',     'O',     'O',     'O',     'O',     '\u00D7','O',     'U',     'U',     'U',     'U',     'Y',     '\u00DE','\u00DF', // 0x00Dx
        'a',     'a',     'a',     'a',     'a',     'a',     '\u00E6','c',     'e',     'e',     'e',     'e',     'i',     'i',     'i',     'i',      // 0x00Ex
        'd',     'n',     'o',     'o',     'o',     'o',     'o',     '\u00F7','o',     'u',     'u',     'u',     'u',     'y',     '\u00FE','y',      // 0x00Fx
        'A',     'a',     'A',     'a',     'A',     'a',     'C',     'c',     'C',     'c',     'C',     'c',     'C',     'c',     'D',     'd',      // 0x010x
        'D',     'd',     'E',     'e',     'E',     'e',     'E',     'e',     'E',     'e',     'E',     'e',     'G',     'g',     'G',     'g',      // 0x011x
        'G',     'g',     'G',     'g',     'H',     'h',     'H',     'h',     'I',     'i',     'I',     'i',     'I',     'i',     'I',     'i',      // 0x012x
        'I',     'i',     '\u0132','\u0133','J',     'j',     'K',     'k',     'k',     'L',     'l',     'L',     'l',     'L',     'l',     'L',      // 0x013x
        'l',     'L',     'l',     'N',     'n',     'N',     'n',     'N',     'n',     'n',     'N',     'n',     'O',     'o',     'O',     'o',      // 0x014x
        'O',     'o',     '\u0152','\u0153','R',     'r',     'R',     'r',     'R',     'r',     'S',     's',     'S',     's',     'S',     's',      // 0x015x
        'S',     's',     'T',     't',     'T',     't',     'T',     't',     'U',     'u',     'U',     'u',     'U',     'u',     'U',     'u',      // 0x016x
        'U',     'u',     'U',     'u',     'W',     'w',     'Y',     'y',     'Y',     'Z',     'z',     'Z',     'z',     'Z',     'z',     '\u017F', // 0x017x
        '\u0180','\u0181','\u0182','\u0183','\u0184','\u0185','\u0186','\u0187','\u0188','\u0189','\u018A','\u018B','\u018C','\u018D','\u018E','\u018F', // 0x018x
        '\u0190','\u0191','f'     ,'\u0193','\u0194','\u0195','\u0196','\u0197','\u0198','\u0199','\u019A','\u019B','\u019C','\u019D','\u019E','\u019F', // 0x019x
        'O',     'o',     '\u01A2','\u01A3','\u01A4','\u01A5','\u01A6','\u01A7','\u01A8','\u01A9','\u01AA','\u01AB','\u01AC','\u01AD','\u01AE','U',      // 0x01Ax
        'u',     '\u01B1','\u01B2','\u01B3','\u01B4','\u01B5','\u01B6','\u01B7','\u01B8','\u01B9','\u01BA','\u01BB','\u01BC','\u01BD','\u01BE','\u01BF', // 0x01Bx
        '\u01C0','\u01C1','\u01C2','\u01C3','\u01C4','\u01C5','\u01C6','\u01C7','\u01C8','\u01C9','\u01CA','\u01CB','\u01CC','A',     'a',     'I',      // 0x01Cx
        'i',     'O',     'o',     'U',     'u',     'U',     'u',     'U',     'u',     'U',     'u',     'U',     'u',     '\u01DD','\u01DE','\u01DF', // 0x01Dx
        '\u01E0','\u01E1','\u01E2','\u01E3','\u01E4','\u01E5','\u01E6','\u01E7','\u01E8','\u01E9','\u01EA','\u01EB','\u01EC','\u01ED','\u01EE','\u01EF', // 0x01Ex
        '\u01F0','\u01F1','\u01F2','\u01F3','\u01F4','\u01F5','\u01F6','\u01F7','\u01F8','\u01F9','A',     'a',     '\u01FC','\u01FD','O',     'o',      // 0x01Fx
        '\u0200','\u0201','\u0202','\u0203','\u0204','\u0205','\u0206','\u0207','\u0208','\u0209','\u020A','\u020B','\u020C','\u020D','\u020E','\u020F', // 0x020x
        '\u0210','\u0211','\u0212','\u0213','\u0214','\u0215','\u0216','\u0217','S',     's',     'T',     't',     '\u021C','\u021D','\u021E','\u021F', // 0x021x
    };
    // @formatter:on

    /**
     * Char iterator used to normalize strings.
     * <p>
     * It is created lazily by the first call to {@link #getNormalizedString(String)} and re-targeted to the new source
     * string on every later call. Being a single static instance, it is shared by every caller of that method.
     */
    private static StringCharIterator strIterator;


    /**
     * Check if the string is contained in the iterator. The match will be strict (including whitespace and non-letter chars).
     * <p>
     * The text is read once, char by char, until the pattern is found or the iterator returns <code>'\0'</code>. The scan
     * uses the Knuth-Morris-Pratt fallback table so that patterns overlapping with themselves are found too (for example
     * "aab" inside "aaab"), which a simple "restart at the first char" strategy misses.
     * <p>
     * A null or empty pattern always matches, and in that case nothing is read from the iterator.
     * 
     * @param pattern
     *            the pattern to find in the char iterator.
     * @param src
     *            A CharIterator on the text to search.
     * @param ignoreCase
     *            true to compare the lowercase form of each char (of both the pattern and the text) instead of the raw
     *            chars.
     * @return true iff the pattern was found in the char iterator.
     */
    public static boolean matchString(String pattern, CharIterator src, boolean ignoreCase) {
        int patternLen = (pattern == null) ? 0 : pattern.length();
        if (patternLen == 0) return true;
        char[] pat = pattern.toCharArray();
        if (ignoreCase) {
            // Fold the pattern char by char, exactly like the text chars below, so both sides always agree
            // (String.toLowerCase() depends on the default locale and may even change the string length).
            for (int i = 0; i < patternLen; i++) {
                pat[i] = Character.toLowerCase(pat[i]);
            }
        }
        // fallback[i] = length of the longest proper prefix of the pattern that is also a suffix of pat[0..i]
        int[] fallback = new int[patternLen];
        for (int i = 1, k = 0; i < patternLen; i++) {
            while ((k > 0) && (pat[i] != pat[k])) k = fallback[k - 1];
            if (pat[i] == pat[k]) k++;
            fallback[i] = k;
        }
        // number of pattern chars matched by the most recently read text chars
        int matchedIndex = 0;
        for (char ch = src.nextChar(); ch != '\0'; ch = src.nextChar()) {
            if (ignoreCase) ch = Character.toLowerCase(ch);
            // on mismatch, fall back to the longest shorter prefix that is still compatible with the text read so far
            while ((matchedIndex > 0) && (pat[matchedIndex] != ch)) matchedIndex = fallback[matchedIndex - 1];
            if (pat[matchedIndex] == ch) matchedIndex++;
            if (matchedIndex == patternLen) return true;
        }
        return false;
    }


    /**
     * Check if the given pattern is contained in the iterator. The pattern will be normalized (as the CharIterator content) before
     * comparison.
     * <p>
     * The normalized text is read once, char by char, until the pattern is found or the iterator returns
     * <code>'\0'</code>. The scan uses the Knuth-Morris-Pratt fallback table so that patterns overlapping with themselves
     * are found too (for example "aab" inside "aaab"), which a simple "restart at the first char" strategy misses.
     * <p>
     * A pattern that is empty once normalized always matches, and in that case nothing is read from the iterator.
     * 
     * @param pattern
     *            the pattern to find in the char iterator.
     * @param src
     *            A CharIterator on the text to search.
     * @return true iff the pattern was found in the char iterator.
     */
    public static boolean matchPattern(String pattern, CharIterator src) {
        // format the pattern the same way the iterator formats its content
        String normalized = getNormalizedString(pattern);
        int patternLen = normalized.length();
        if (patternLen == 0) return true;
        // fallback[i] = length of the longest proper prefix of the pattern that is also a suffix of its first i+1 chars
        int[] fallback = new int[patternLen];
        for (int i = 1, k = 0; i < patternLen; i++) {
            while ((k > 0) && (normalized.charAt(i) != normalized.charAt(k))) k = fallback[k - 1];
            if (normalized.charAt(i) == normalized.charAt(k)) k++;
            fallback[i] = k;
        }
        // number of pattern chars matched by the most recently read text chars
        int matchedIndex = 0;
        for (char ch = src.nextNormalizedChar(); ch != '\0'; ch = src.nextNormalizedChar()) {
            // on mismatch, fall back to the longest shorter prefix that is still compatible with the text read so far
            while ((matchedIndex > 0) && (normalized.charAt(matchedIndex) != ch)) matchedIndex = fallback[matchedIndex - 1];
            if (normalized.charAt(matchedIndex) == ch) matchedIndex++;
            if (matchedIndex == patternLen) return true;
        }
        return false;
    }


    /**
     * Compare the content of two iterators, char by char.
     * <p>
     * In plain mode the raw chars are compared. In normalized mode the normalized chars are compared, and spaces found
     * before the first compared char, or after the point where one of the iterators is exhausted, are skipped.
     * An iterator is considered exhausted when it returns <code>'\0'</code>.
     * 
     * @param it1
     *            first char iterator.
     * @param it2
     *            second char iterator.
     * @param normalized
     *            true if the content should be normalized (and trimmed) for comparison.
     * @return true iff the content of the two iterators is the same.
     */
    public static boolean areEquals(CharIterator it1, CharIterator it2, boolean normalized) {
        if (!normalized) {
            // strict comparison: advance both iterators while they agree and have not reached their end
            char left = it1.nextChar();
            char right = it2.nextChar();
            while ((left == right) && (left != '\0')) {
                left = it1.nextChar();
                right = it2.nextChar();
            }
            return left == right;
        }

        char left = it1.nextNormalizedChar();
        char right = it2.nextNormalizedChar();
        // skip the leading spaces of each side
        while (left == ' ') {
            left = it1.nextNormalizedChar();
        }
        while (right == ' ') {
            right = it2.nextNormalizedChar();
        }
        // advance both iterators while they agree and have not reached their end
        while ((left == right) && (left != '\0')) {
            left = it1.nextNormalizedChar();
            right = it2.nextNormalizedChar();
        }
        // once one side is exhausted, the other one may still hold trailing spaces to ignore
        boolean oneSideEnded = (left == '\0') || (right == '\0');
        if (oneSideEnded) {
            while (left == ' ') {
                left = it1.nextNormalizedChar();
            }
            while (right == ' ') {
                right = it2.nextNormalizedChar();
            }
        }
        return left == right;
    }


    /**
     * Check if all/any of the given keywords are contained in the iterator. The keyword string is first normalized and
     * split into individual keywords (see {@link #getNormalizedKeywords(String)}), then the search itself is delegated to
     * {@link #matchNormalizedKeywords(String[], boolean, CharIterator)}.
     * 
     * @param keywords
     *            String containing the list of keywords to look for.
     * @param matchAll
     *            true if all the keywords have to be matched, false if matching a single keyword is enough.
     * @param src
     *            A CharIterator on the text to search.
     * @return true iff all/any of the given keywords were found in the char iterator.
     */
    public static boolean matchKeywords(String keywords, boolean matchAll, CharIterator src) {
        return matchNormalizedKeywords(getNormalizedKeywords(keywords), matchAll, src);
    }


    /**
     * Split a keyword string into an array of normalized keywords.
     * <p>
     * The whole string is normalized with {@link #getNormalizedString(String)} and then cut at each space; the resulting
     * tokens are returned in their order of appearance.
     * 
     * @param keywordString
     *            String containing the list of keywords.
     * @return the normalized keywords (an empty array when the normalized string holds no token).
     */
    public static String[] getNormalizedKeywords(String keywordString) {
        StringTokenizer tokens = new StringTokenizer(getNormalizedString(keywordString), " ");
        String[] result = new String[tokens.countTokens()];
        int next = 0;
        while (tokens.hasMoreTokens()) {
            result[next++] = tokens.nextToken();
        }
        return result;
    }


    /**
     * Check if all/any of the given keywords are contained in the iterator. The keywords are compared as-is with the
     * normalized chars of the iterator, so they must already be normalized (for example by
     * {@link #getNormalizedKeywords(String)}).
     * <p>
     * The text is read once and every keyword is tracked in parallel with its own Knuth-Morris-Pratt fallback table, so
     * keywords overlapping with themselves are found too (for example "aab" inside "aaab").
     * <p>
     * Special cases:
     * <ul>
     * <li>A null or empty keyword array always matches, without reading the iterator.</li>
     * <li>A null or empty keyword inside the array is considered as found.</li>
     * <li>When <code>matchAll</code> is false the scan stops at the first keyword found; when it is true the iterator is
     * read up to its end.</li>
     * </ul>
     * 
     * @param keywords
     *            the already normalized keywords to look for.
     * @param matchAll
     *            true if all the keywords have to be matched, false if matching a single keyword is enough.
     * @param src
     *            A CharIterator on the text to search.
     * @return true iff all/any of the given keywords were found in the char iterator.
     */
    public static boolean matchNormalizedKeywords(String[] keywords, boolean matchAll, CharIterator src) {
        int nbrKey = (keywords == null) ? 0 : keywords.length;
        if (nbrKey == 0) return true;

        // fallback[i][j] = length of the longest proper prefix of keyword i that is also a suffix of its first j+1 chars
        int[][] fallback = new int[nbrKey][];
        int[] matchedIndex = new int[nbrKey];
        boolean[] matchedKey = new boolean[nbrKey];
        for (int i = 0; i < nbrKey; i++) {
            String key = keywords[i];
            int keyLen = (key == null) ? 0 : key.length();
            if (keyLen == 0) {
                // nothing to look for: the keyword is found by definition
                if (!matchAll) return true;
                matchedKey[i] = true;
                continue;
            }
            int[] table = new int[keyLen];
            for (int j = 1, k = 0; j < keyLen; j++) {
                while ((k > 0) && (key.charAt(j) != key.charAt(k))) k = table[k - 1];
                if (key.charAt(j) == key.charAt(k)) k++;
                table[j] = k;
            }
            fallback[i] = table;
        }

        for (char ch = src.nextNormalizedChar(); ch != '\0'; ch = src.nextNormalizedChar()) {
            for (int i = 0; i < nbrKey; i++) {
                // a keyword already found does not need to be tracked anymore
                if (matchedKey[i]) continue;
                String key = keywords[i];
                int state = matchedIndex[i];
                // on mismatch, fall back to the longest shorter prefix that is still compatible with the text read so far
                while ((state > 0) && (key.charAt(state) != ch)) state = fallback[i][state - 1];
                if (key.charAt(state) == ch) state++;
                if (state == key.length()) {
                    if (!matchAll) return true;
                    matchedKey[i] = true;
                    state = 0;
                }
                matchedIndex[i] = state;
            }
        }

        // end of text reached: in "any" mode nothing was found, in "all" mode every keyword must have been seen
        if (!matchAll) return false;
        for (int i = 0; i < nbrKey; i++) {
            if (!matchedKey[i]) return false;
        }
        return true;
    }


    /**
     * Normalize a string.
     * <p>
     * The result will be:
     * <ul>
     * <li>Diacritic marks are removed.</li>
     * <li>All lowercase.</li>
     * <li>All non-letter or digit chars replaced by space.</li>
     * <li>Any suite of white chars replaced by a single space.</li>
     * <li>Trimmed.</li>
     * </ul>
     * The normalization itself is performed by a <code>StringCharIterator</code> that is created on the first call and
     * reused by the following ones. This method is synchronized because that iterator is a single static instance: without
     * the lock, two threads normalizing at the same time would overwrite each other's source string.
     * 
     * @param src
     *            Source string
     * @return normalized string.
     */
    public static synchronized String getNormalizedString(String src) {
        if (strIterator == null) {
            strIterator = new StringCharIterator(src);
        } else {
            strIterator.setData(src);
        }
        return strIterator.getNormalizedString();
    }


    /**
     * Get the equivalent char with removed diacritic marks (like accents, cedillas, dots, tildes ...).
     * <p>
     * The character case is preserved. Only the chars in the range 00C0-021F are looked up in the conversion table (this
     * range holds the accented chars of the unicode blocks "Latin 1 supplement", "Latin extended A" and "Latin extended
     * B"); any char outside that range, and any char of the range that has no plain equivalent, is returned without
     * change.
     * 
     * @param ch
     *            the possibly accented char to convert.
     * @return the corresponding non-accented char.
     */
    public static final char removeDiacritic(char ch) {
        boolean inTable = (ch >= 0x00C0) && (ch <= 0x021F);
        return inTable ? CHAR_MAP[ch - 0x00C0] : ch;
    }


    /**
     * Get the uppercase char corresponding to the given character with removed diacritic mark.
     * <p>
     * So this method will transform '&agrave;' to 'A', '&eacute;' to 'E' ... while the default
     * <code>Character.toUpperCase</code> implementation transforms '&agrave;' to '&Agrave;', '&eacute;' to '&Eacute;' ...
     * 
     * @param ch
     *            the character.
     * @return Corresponding uppercase character with any diacritic mark removed.
     */
    public static final char getUppercaseChar(char ch) {
        return Character.toUpperCase(removeDiacritic(ch));
    }


    /**
     * Put the first character of the string in uppercase, removing its diacritic mark if any (see
     * {@link #getUppercaseChar(char)}). The other characters are left as-is.
     * 
     * @param src
     *            source string (can be null).
     * @return the transformed string; null or empty input is returned unchanged.
     */
    public static final String firstToUpperCase(String src) {
        if ((src == null) || (src.length() == 0)) return src;
        StringBuilder sb = new StringBuilder(src);
        sb.setCharAt(0, getUppercaseChar(src.charAt(0)));
        return sb.toString();
    }


    /**
     * Put the first character of the string in lowercase (using <code>Character.toLowerCase</code>). The other
     * characters are left as-is.
     * 
     * @param src
     *            source string (can be null).
     * @return the transformed string; null or empty input is returned unchanged.
     */
    public static final String firstToLowerCase(String src) {
        if ((src == null) || (src.length() == 0)) return src;
        StringBuilder sb = new StringBuilder(src);
        sb.setCharAt(0, Character.toLowerCase(src.charAt(0)));
        return sb.toString();
    }


    /**
     * Transform the string to have the first character of each word in uppercase.
     * <p>
     * Only the first character of each word is touched by this variant: all the other characters keep their current case.
     * Use {@link #firstOfWordsUpperCase(String, boolean)} to also force them to lowercase.
     * 
     * @param src
     *            source string
     * @return transformed string.
     */
    public static final String firstOfWordsUpperCase(String src) {
        final boolean othersToLowercase = false;
        return firstOfWordsUpperCase(src, othersToLowercase);
    }


    /**
     * Transform the string to have the first character of each word in uppercase.
     * <p>
     * A word starts at the beginning of the string and after every white character (any char <code>&lt;= ' '</code>).
     * The first character of a word is converted with {@link #getUppercaseChar(char)}, so its diacritic mark (if any) is
     * removed as well.
     * 
     * @param src
     *            source string (can be null).
     * @param othersToLowercase
     *            if true, the characters that are not the first of a word are forced to lowercase, otherwise they are left
     *            unchanged.
     * @return transformed string, or null if the source string is null.
     */
    public static final String firstOfWordsUpperCase(String src, boolean othersToLowercase) {
        // same null contract as the other string transformations of this class
        if (src == null) return null;
        StringBuilder sb = new StringBuilder(src);
        boolean previousIsWhite = true;
        int len = src.length();
        for (int i = 0; i < len; i++) {
            char ch = src.charAt(i);
            if (previousIsWhite) sb.setCharAt(i, getUppercaseChar(ch));
            else if (othersToLowercase) sb.setCharAt(i, Character.toLowerCase(ch));
            // the test uses the original char, so the conversions above never change the word boundaries
            previousIsWhite = (ch <= ' ');
        }
        return sb.toString();
    }


    /**
     * Encode a string to avoid spaces and non-alphanumeric characters. It's used to generate file names supported on all
     * platforms.
     * <p>
     * Diacritic marks are removed, every run of ASCII letters and digits becomes a word whose first char is in uppercase
     * and whose other letters are in lowercase, and all the other chars are dropped. The ligatures AE and OE are expanded
     * to the two letters "ae" and "oe".
     * <p>
     * Examples:
     * <ul>
     * <li><code>"hello world"</code> gives <code>"HelloWorld"</code></li>
     * <li><code>"1, 2, 3 hop"</code> gives <code>"123Hop"</code></li>
     * <li><code>"H&eacute; &ccedil;i|&ocirc;l&ouml;pr&eacute;( haha"</code> gives <code>"HeCiOlopreHaha"</code></li>
     * </ul>
     * 
     * @param src
     *            the source string
     * @return the string encoded (null if the source string is null).
     */
    public static final String getEncodedString(String src) {
        if (src == null) return null;
        StringBuilder encoded = new StringBuilder();
        // true while the next kept char is the first one of a word
        boolean wordStart = true;
        for (int pos = 0, end = src.length(); pos < end; pos++) {
            char plain = removeDiacritic(src.charAt(pos));
            boolean asciiLetter = ((plain >= 'a') && (plain <= 'z')) || ((plain >= 'A') && (plain <= 'Z'));
            boolean digit = (plain >= '0') && (plain <= '9');
            if (asciiLetter || digit) {
                encoded.append(wordStart ? Character.toUpperCase(plain) : Character.toLowerCase(plain));
                wordStart = false;
            } else if ((plain == '\u00C6') || (plain == '\u00E6')) {
                // AE ligature (either case): written as two letters, capitalized only at a word start
                encoded.append(wordStart ? "Ae" : "ae");
                wordStart = false;
            } else if ((plain == '\u0152') || (plain == '\u0153')) {
                // OE ligature (either case): written as two letters, capitalized only at a word start
                encoded.append(wordStart ? "Oe" : "oe");
                wordStart = false;
            } else {
                // any other char is dropped and acts as a word separator
                wordStart = true;
            }
        }
        return encoded.toString();
    }


    /**
     * Remove XML or HTML tags and normalize whitespaces.
     * <p>
     * Everything from a <code>&lt;</code> up to the next <code>&gt;</code> (or up to the end of the string when the tag is
     * never closed) is dropped. In the remaining text, each run of white characters (any char <code>&lt;= ' '</code>) is
     * replaced by a single space, and the leading and trailing white characters are removed. A tag is not a word
     * separator by itself: the texts on both sides of a tag are joined unless a white character stands between them.
     * 
     * @param src
     *            the source string (can be null).
     * @return the text without tags, or null if the source string is null.
     */
    public static final String removeTags(String src) {
        if (src == null) return null;
        StringBuilder text = new StringBuilder();
        boolean insideTag = false;
        // true when white chars were skipped since the last kept char
        boolean pendingSpace = false;
        for (int pos = 0, end = src.length(); pos < end; pos++) {
            char ch = src.charAt(pos);
            if (insideTag) {
                // swallow the tag content up to (and including) its closing bracket
                if (ch == '>') insideTag = false;
            } else if (ch == '<') {
                insideTag = true;
            } else if (ch <= ' ') {
                pendingSpace = true;
            } else {
                // a separator is written only between two kept chars, never at the start of the result
                if (pendingSpace && (text.length() > 0)) text.append(' ');
                text.append(ch);
                pendingSpace = false;
            }
        }
        return text.toString();
    }

}