com.causecode.seo.friendlyurl.FriendlyUrlService.groovy Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of content Show documentation
A plugin for managing content such as static pages and menus in one place. It also provides shortened, user-friendly URLs.
There is a newer version: 2.6.2
Show newest version
/*
 * Copyright (c) 2011, CauseCode Technologies Pvt Ltd, India.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are not permitted.
 */
package com.causecode.seo.friendlyurl

import java.util.regex.Pattern

/**
 * This service provides methods to create friendly URLs.
 */
class FriendlyUrlService {

    static transactional = false

    /**
     * Transforms the given text into a URL-friendly slug: accents are replaced by plain ASCII
     * letters, the text is lower-cased, HTML entities and dots are removed, every character other
     * than %, a-z, A-Z, 0-9, space, _ and - is dropped, runs of whitespace become a single dash and
     * repeated dashes are collapsed. Escaped octets of the form %XX are preserved; any other '%'
     * is removed. Trailing characters that are not a letter or a digit are stripped.
     *
     * Borrowed from Wordpress: file wp-includes/formatting.php, function sanitize_title_with_dashes
     * http://core.svn.wordpress.org/trunk/wp-includes/formatting.php
     *
     * @param text The text to sanitize; may be null or empty.
     * @return The sanitized text, or an empty string when the input is null or empty or when
     *         nothing usable is left after sanitizing.
     */
    def sanitizeWithDashes(String text) {
        // Nothing to sanitize; also guards the regex calls below against a null input
        if (!text) {
            return ''
        }

        // To avoid ParameterReassignment
        String localText = text

        // Preserve escaped octets (%XX) while dropping every other '%'
        localText = localText.replaceAll('%([a-fA-F0-9][a-fA-F0-9])', '---$1---')
        localText = localText.replaceAll('%', '')
        localText = localText.replaceAll('---([a-fA-F0-9][a-fA-F0-9])---', '%$1')

        // Remove accents
        localText = removeAccents(localText)

        // To lower case; Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. a Turkish locale would turn 'I' into a dotless i that is stripped further down)
        localText = localText.toLowerCase(Locale.ROOT)

        // Kill entities
        localText = localText.replaceAll('&.+?;', '')

        // Dots -> ''
        localText = localText.replaceAll('\\.', '')

        // Remove any character except %a-zA-Z0-9 _-
        localText = localText.replaceAll('[^%a-zA-Z0-9 _-]', '')

        // Trim
        localText = localText.trim()

        // Spaces -> dashes
        localText = localText.replaceAll('\\s+', '-')

        // Dashes -> dash
        localText = localText.replaceAll('-+', '-')

        // It must end in a letter or digit. Only ASCII characters are left at this point, so an
        // anchored regex strips every trailing '-' or '_' and is a no-op on an empty string.
        localText = localText.replaceAll('[^a-zA-Z0-9]+$', '')

        return localText
    }

    /**
     * Converts accent characters to ASCII characters.
     *
     * If the text is null, empty, or contains no character in the range 0x80-0xFF, it is
     * returned unchanged. Otherwise, when the text fits the byte pattern checked by
     * {@code seemsUtf8}, every key of {@code charMapFitsUTF8Model} found in the text is replaced
     * by its value; if it does not, the text is treated as ISO-8859-1 and the characters listed
     * below are replaced one by one.
     *
     * Borrowed from Wordpress, file wp-includes/formatting.php, function remove_accents
     * http://core.svn.wordpress.org/trunk/wp-includes/formatting.php
     *
     * @param text The text to convert; coerced to a String.
     * @return The text with the known accent characters replaced.
     */
    @SuppressWarnings('ElseBlockBraces')
    def removeAccents(text) {
        // To avoid ParameterReassignment
        String localText = text

        // find() instead of matches('.*[..].*'): '.' does not match line terminators, so the
        // anchored form never detected accents in text containing a line break.
        if (!localText || !Pattern.compile('[\\x80-\\xFF]').matcher(localText).find()) {
            return localText
        }

        if (seemsUtf8(localText)) {
            // 'text' fits a UTF-8 model: replace each mapped sequence by its ASCII counterpart
            charMapFitsUTF8Model.each { from, to -> localText = localText.replace(from, to) }
            return localText
        }

        // Assume ISO-8859-1 if not UTF-8.
        // singleCodes[i] is replaced by the i-th character of singleReplacements.
        List<Integer> singleCodes = [128, 131, 138, 142, 154, 158, 159, 162, 165, 181, 192, 193,
            194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206,
            207, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220,
            221, 224, 225, 226, 227, 228, 229, 231, 232, 233, 234, 235,
            236, 237, 238, 239, 241, 242, 243, 244, 245, 246, 248, 249,
            250, 251, 252, 253, 255]
        String singleReplacements = 'EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'

        // String.valueOf is used because java.lang.String has no constructor taking a single char
        singleCodes.eachWithIndex { code, index ->
            localText = localText.replace(String.valueOf((char) code),
                    String.valueOf(singleReplacements.charAt(index)))
        }

        // Characters that expand to two letters
        Map<Integer, String> doubleReplacements = [140: 'OE', 156: 'oe', 198: 'AE', 208: 'DH', 222: 'TH',
            223: 'ss', 230: 'ae', 240: 'dh', 254: 'th']
        doubleReplacements.each { code, replacement ->
            localText = localText.replace(String.valueOf((char) code), replacement)
        }

        return localText
    }

    /**
     * Builds the replacement table used by {@code removeAccents} for text that fits the UTF-8
     * model. Each key is a sequence of characters whose code points equal the bytes of one UTF-8
     * encoded character (for example C3 A9 for U+00E9); each value is its ASCII replacement.
     *
     * @return A new map from byte-as-character sequences to their ASCII replacement.
     */
    @SuppressWarnings('DuplicateStringLiteral')
    def getCharMapFitsUTF8Model() {
        def chars = [
                // Decomposition for Latin-1 Supplement
                '\u00C3\u0080': 'A', '\u00C3\u0081': 'A', '\u00C3\u0082': 'A',
                '\u00C3\u0083': 'A', '\u00C3\u0084': 'A', '\u00C3\u0085': 'A',
                '\u00C3\u0087': 'C', '\u00C3\u0088': 'E', '\u00C3\u0089': 'E',
                '\u00C3\u008A': 'E', '\u00C3\u008B': 'E', '\u00C3\u008C': 'I',
                '\u00C3\u008D': 'I', '\u00C3\u008E': 'I', '\u00C3\u008F': 'I',
                '\u00C3\u0091': 'N', '\u00C3\u0092': 'O', '\u00C3\u0093': 'O',
                '\u00C3\u0094': 'O', '\u00C3\u0095': 'O', '\u00C3\u0096': 'O',
                '\u00C3\u0099': 'U', '\u00C3\u009A': 'U', '\u00C3\u009B': 'U',
                '\u00C3\u009C': 'U', '\u00C3\u009D': 'Y', '\u00C3\u009F': 's',
                '\u00C3\u00A0': 'a', '\u00C3\u00A1': 'a', '\u00C3\u00A2': 'a',
                '\u00C3\u00A3': 'a', '\u00C3\u00A4': 'a', '\u00C3\u00A5': 'a',
                '\u00C3\u00A7': 'c', '\u00C3\u00A8': 'e', '\u00C3\u00A9': 'e',
                '\u00C3\u00AA': 'e', '\u00C3\u00AB': 'e', '\u00C3\u00AC': 'i',
                '\u00C3\u00AD': 'i', '\u00C3\u00AE': 'i', '\u00C3\u00AF': 'i',
                '\u00C3\u00B1': 'n', '\u00C3\u00B2': 'o', '\u00C3\u00B3': 'o',
                '\u00C3\u00B4': 'o', '\u00C3\u00B5': 'o', '\u00C3\u00B6': 'o',
                '\u00C3\u00B9': 'u', '\u00C3\u00BA': 'u', '\u00C3\u00BB': 'u',
                // C3 BD is U+00FD (y acute) and C3 BE is U+00FE (thorn); same replacements as
                // the ISO-8859-1 branch of removeAccents uses for 253 and 254
                '\u00C3\u00BC': 'u', '\u00C3\u00BD': 'y', '\u00C3\u00BE': 'th',
                '\u00C3\u00BF': 'y',

                // Decomposition for Latin Extended-A
                '\u00C4\u0080': 'A', '\u00C4\u0081': 'a', '\u00C4\u0082': 'A',
                '\u00C4\u0083': 'a', '\u00C4\u0084': 'A', '\u00C4\u0085': 'a',
                '\u00C4\u0086': 'C', '\u00C4\u0087': 'c', '\u00C4\u0088': 'C',
                '\u00C4\u0089': 'c', '\u00C4\u008A': 'C', '\u00C4\u008B': 'c',
                '\u00C4\u008C': 'C', '\u00C4\u008D': 'c', '\u00C4\u008E': 'D',
                '\u00C4\u008F': 'd', '\u00C4\u0090': 'D', '\u00C4\u0091': 'd',
                '\u00C4\u0092': 'E', '\u00C4\u0093': 'e', '\u00C4\u0094': 'E',
                '\u00C4\u0095': 'e', '\u00C4\u0096': 'E', '\u00C4\u0097': 'e',
                '\u00C4\u0098': 'E', '\u00C4\u0099': 'e', '\u00C4\u009A': 'E',
                '\u00C4\u009B': 'e', '\u00C4\u009C': 'G', '\u00C4\u009D': 'g',
                '\u00C4\u009E': 'G', '\u00C4\u009F': 'g', '\u00C4\u00A0': 'G',
                '\u00C4\u00A1': 'g', '\u00C4\u00A2': 'G', '\u00C4\u00A3': 'g',
                '\u00C4\u00A4': 'H', '\u00C4\u00A5': 'h', '\u00C4\u00A6': 'H',
                '\u00C4\u00A7': 'h', '\u00C4\u00A8': 'I', '\u00C4\u00A9': 'i',
                '\u00C4\u00AA': 'I', '\u00C4\u00AB': 'i', '\u00C4\u00AC': 'I',
                '\u00C4\u00AD': 'i', '\u00C4\u00AE': 'I', '\u00C4\u00AF': 'i',
                '\u00C4\u00B0': 'I', '\u00C4\u00B1': 'i', '\u00C4\u00B2': 'IJ',
                '\u00C4\u00B3': 'ij', '\u00C4\u00B4': 'J', '\u00C4\u00B5': 'j',
                '\u00C4\u00B6': 'K', '\u00C4\u00B7': 'k', '\u00C4\u00B8': 'k',
                '\u00C4\u00B9': 'L', '\u00C4\u00BA': 'l', '\u00C4\u00BB': 'L',
                '\u00C4\u00BC': 'l', '\u00C4\u00BD': 'L', '\u00C4\u00BE': 'l',
                '\u00C4\u00BF': 'L',
                '\u00C5\u0080': 'l', '\u00C5\u0081': 'L', '\u00C5\u0082': 'l',
                '\u00C5\u0083': 'N', '\u00C5\u0084': 'n', '\u00C5\u0085': 'N',
                '\u00C5\u0086': 'n', '\u00C5\u0087': 'N', '\u00C5\u0088': 'n',
                // U+0149 is a lower-case n, U+014A / U+014B are upper / lower case eng
                '\u00C5\u0089': 'n', '\u00C5\u008A': 'N', '\u00C5\u008B': 'n',
                '\u00C5\u008C': 'O', '\u00C5\u008D': 'o', '\u00C5\u008E': 'O',
                '\u00C5\u008F': 'o', '\u00C5\u0090': 'O', '\u00C5\u0091': 'o',
                '\u00C5\u0092': 'OE', '\u00C5\u0093': 'oe', '\u00C5\u0094': 'R',
                '\u00C5\u0095': 'r', '\u00C5\u0096': 'R', '\u00C5\u0097': 'r',
                '\u00C5\u0098': 'R', '\u00C5\u0099': 'r', '\u00C5\u009A': 'S',
                '\u00C5\u009B': 's', '\u00C5\u009C': 'S', '\u00C5\u009D': 's',
                '\u00C5\u009E': 'S', '\u00C5\u009F': 's', '\u00C5\u00A0': 'S',
                '\u00C5\u00A1': 's', '\u00C5\u00A2': 'T', '\u00C5\u00A3': 't',
                '\u00C5\u00A4': 'T', '\u00C5\u00A5': 't', '\u00C5\u00A6': 'T',
                '\u00C5\u00A7': 't', '\u00C5\u00A8': 'U', '\u00C5\u00A9': 'u',
                '\u00C5\u00AA': 'U', '\u00C5\u00AB': 'u', '\u00C5\u00AC': 'U',
                '\u00C5\u00AD': 'u', '\u00C5\u00AE': 'U', '\u00C5\u00AF': 'u',
                '\u00C5\u00B0': 'U', '\u00C5\u00B1': 'u', '\u00C5\u00B2': 'U',
                '\u00C5\u00B3': 'u', '\u00C5\u00B4': 'W', '\u00C5\u00B5': 'w',
                '\u00C5\u00B6': 'Y', '\u00C5\u00B7': 'y', '\u00C5\u00B8': 'Y',
                '\u00C5\u00B9': 'Z', '\u00C5\u00BA': 'z', '\u00C5\u00BB': 'Z',
                '\u00C5\u00BC': 'z', '\u00C5\u00BD': 'Z', '\u00C5\u00BE': 'z',
                '\u00C5\u00BF': 's',

                // Euro sign: U+20AC is encoded in UTF-8 as E2 82 AC
                '\u00E2\u0082\u00AC': 'E',

                // GBP (Pound) sign
                '\u00C2\u00A3': '']

        return chars
    }

    /**
     * Tells whether a string looks like UTF-8 encoded data, treating every character of the
     * string as one byte value.
     *
     * A character below 0x80 stands alone. Any other character must match one of the UTF-8
     * lead-byte forms, which announce between one and five continuation characters, and must be
     * followed by exactly that many characters of the form 10bbbbbb.
     *
     * NOTE: Lead bytes announcing four or five continuation bytes are accepted here, although
     *       UTF-8 byte sequences have a maximum length of 4.
     *
     * Borrowed from Wordpress, file wp-includes/formatting.php, function seems_utf8
     * http://core.svn.wordpress.org/trunk/wp-includes/formatting.php
     *
     * @author bmorel at ssi dot fr (modified)
     *
     * @param string str The string to be checked
     * @return bool True if str fits a UTF-8 model, false otherwise.
     */
    @SuppressWarnings(['NestedForLoop', 'BitwiseOperatorInConditional', 'ElseBlockBraces', 'DuplicateNumberLiteral'])
    private def seemsUtf8(str) {
        // Each entry: [mask, expected value after masking, number of continuation characters]
        def leadForms = [
                [0xE0, 0xC0, 1], // 110bbbbb
                [0xF0, 0xE0, 2], // 1110bbbb
                [0xF8, 0xF0, 3], // 11110bbb
                [0xFC, 0xF8, 4], // 111110bb
                [0xFE, 0xFC, 5], // 1111110b
        ]

        int length = str.size()
        int position = 0

        while (position < length) {
            int lead = (int) str.charAt(position)
            int continuationCount = 0

            // 0bbbbbbb needs no continuation; everything else must match a lead form
            if (lead >= 0x80) {
                def form = leadForms.find { (lead & it[0]) == it[1] }
                if (form == null) {
                    // Does not match any model
                    return false
                }
                continuationCount = form[2]
            }

            // The announced number of 10bbbbbb characters must follow
            for (int remaining = continuationCount; remaining > 0; remaining--) {
                position++
                if (position == length) {
                    return false
                }
                if ((((int) str.charAt(position)) & 0xC0) != 0x80) {
                    return false
                }
            }

            position++
        }

        return true
    }
}