All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.jena.atlas.lib.EscapeStr Maven / Gradle / Ivy

There is a newer version: 5.2.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.atlas.lib;

import org.apache.jena.atlas.AtlasException ;
import org.apache.jena.atlas.io.AWriter ;
import org.apache.jena.atlas.io.OutputUtils ;
import org.apache.jena.atlas.io.StringWriterI ;

/**
 * String escape and unescape utilities.
 * <p>
 * The {@code stringEsc} / {@code write...} methods write an escaped form of a
 * string to an {@link AWriter}; the {@code unescape...} methods reverse
 * backslash-style escapes, including the 4-digit (\-u) and 8-digit (\-U)
 * hexadecimal forms.
 */
public class EscapeStr
{
    /**
     * Escape characters in a string according to Turtle rules.
     * <p>
     * Convenience form: escapes {@code s} as a single-line string using
     * {@code Chars.CH_QUOTE2} as the quote character and {@code CharSpace.UTF8}
     * as the output character space.
     *
     * @param s the string to escape
     * @return the escaped text (without surrounding quotes)
     */
    public static String stringEsc(String s) {
        AWriter w = new StringWriterI();
        stringEsc(w, s, Chars.CH_QUOTE2, true, CharSpace.UTF8);
        return w.toString();
    }

    /**
     * Write a string - basic escaping, no quote escaping.
     * <p>
     * A backslash is doubled; newline, tab, carriage return and form feed are
     * written as two-character escapes. Every other character is written as-is,
     * or via {@link #writeCharAsASCII(AWriter, char)} when {@code asciiOnly} is set.
     *
     * @param out       destination writer
     * @param s         the string to write
     * @param asciiOnly if true, characters outside printable ASCII are written as \-u escapes
     */
    public static void stringEsc(AWriter out, String s, boolean asciiOnly) {
        int len = s.length();
        for (int i = 0; i < len; i++) {
            char c = s.charAt(i);
            // \\ Escape always possible.
            if (c == '\\') {
                out.print('\\');
                out.print(c);
                continue;
            }
            if (writeControlEscape(out, c))
                continue;
            if (asciiOnly)
                writeCharAsASCII(out, c);
            else
                out.print(c);
        }
    }

    /**
     * String escape, with quote escaping, including option for multi-line 3 quote form.
     * Output uses {@code CharSpace.UTF8}.
     *
     * @param out              destination writer
     * @param s                the string to write
     * @param quoteChar        the quote character that delimits the string
     * @param singleLineString true for the single-line form, false for the multi-line (3 quote) form
     */
    public static void stringEsc(AWriter out, String s, char quoteChar, boolean singleLineString) {
        stringEsc(out, s, quoteChar, singleLineString, CharSpace.UTF8);
    }

    /**
     * String escape, with quote escaping, for either the single-line or the
     * multi-line (3 quote) form.
     * <ul>
     * <li>A backslash is always doubled.</li>
     * <li>Single-line: every {@code quoteChar} is escaped, and newline, tab,
     *     carriage return and form feed are written as two-character escapes.</li>
     * <li>Multi-line: control characters are left alone; a {@code quoteChar} is
     *     escaped only when it is the third in a row or the last character of
     *     the string.</li>
     * <li>With {@code CharSpace.ASCII}, remaining characters go through
     *     {@link #writeCharAsASCII(AWriter, char)}; otherwise they are written
     *     directly, except U+FFFD which is written as a \-u escape.</li>
     * </ul>
     *
     * @param out              destination writer
     * @param s                the string to write
     * @param quoteChar        the quote character that delimits the string
     * @param singleLineString true for the single-line form, false for the multi-line form
     * @param charSpace        {@code CharSpace.ASCII} forces \-u escapes for non-ASCII characters
     */
    public static void stringEsc(AWriter out, String s, char quoteChar, boolean singleLineString, CharSpace charSpace) {
        boolean ascii = ( CharSpace.ASCII == charSpace );
        int len = s.length();
        int quotesInARow = 0;
        for (int i = 0; i < len; i++) {
            char c = s.charAt(i);
            // \\ Escape always possible.
            // (A backslash does not reset the quote run counter.)
            if (c == '\\') {
                out.print('\\');
                out.print(c);
                continue;
            }
            if ( singleLineString ) {
                // Single line: every quote and the basic control characters are escaped.
                if ( c == quoteChar ) {
                    out.print("\\");
                    out.print(c);
                    continue;
                }
                if ( writeControlEscape(out, c) )
                    continue;
            } else if ( c == quoteChar ) {
                // Multiline string.
                quotesInARow++;
                if ( quotesInARow == 3 || i == len - 1 ) {
                    // Break up a run of 3 quotes, and always quote the final
                    // character for multiline use otherwise it will run into
                    // the wrapping 3 quotes.
                    out.print("\\");
                    out.print(quoteChar);
                    quotesInARow = 0;
                    continue;
                }
                // Otherwise the quote is written unescaped below.
            } else {
                quotesInARow = 0;
            }

            if ( ascii ) {
                writeCharAsASCII(out, c);
                continue;
            }

            if ( c == '\uFFFD' ) {
                // Unicode replacement character: write as \-u escape
                // The text tokenizer raises warnings on raw U+FFFD. A replacement character is generated
                // if a decoding error occurs (e.g. ISO-8859-1 passed into UTF-8); there is no literal U+FFFD
                // in the original input. Written as a unicode escape is not treated as a warning.
                out.print("\\uFFFD");
                continue;
            }

            // Normal case!
            out.print(c);
        }
    }

    /**
     * Write the two-character escape for newline, tab, carriage return or form feed.
     *
     * @return true if {@code c} was one of those characters and has been written;
     *         false if nothing was written
     */
    private static boolean writeControlEscape(AWriter out, char c) {
        switch (c) {
            case '\n':  out.print("\\n"); return true;
            case '\t':  out.print("\\t"); return true;
            case '\r':  out.print("\\r"); return true;
            case '\f':  out.print("\\f"); return true;
            default:    return false;
        }
    }

    /**
     * Write a string with Unicode to ASCII conversion using \-u escapes.
     * Each {@code char} is passed to {@link #writeCharAsASCII(AWriter, char)};
     * no other escaping (backslash, quotes, control characters by name) is done.
     *
     * @param out destination writer
     * @param s   the string to write
     */
    public static void writeASCII(AWriter out, String s) {
        int len = s.length();
        for (int i = 0; i < len; i++)
            writeCharAsASCII(out, s.charAt(i));
    }

    /**
     * Write a character with Unicode to ASCII conversion using \-u escapes.
     * Characters in the range 32 to 126 inclusive are written unchanged; any
     * other character is written as a backslash, 'u' and the output of
     * {@code OutputUtils.printHex(out, c, 4)}.
     *
     * @param out destination writer
     * @param c   the UTF-16 code unit to write
     */
    public static void writeCharAsASCII(AWriter out, char c) {
        if ( c >= 32 && c < 127 )
            out.print(c);
        else {
            // Outside the charset range.
            // This works per UTF-16 code unit: it never emits the 8-digit \-U form.
            // A character beyond 16 bits arrives here as two surrogate chars, and
            // each surrogate is written as its own \-u escape.
            out.print("\\u");
            OutputUtils.printHex(out, c, 4);
        }
    }

    // Utilities to remove escapes

    /**
     * Replace \ escapes (\\u, \t, \n etc) in a string, using backslash as the escape character.
     *
     * @param s the string to process
     * @return the string with escapes replaced
     * @throws AtlasException if the string contains a malformed or unknown escape
     */
    public static String unescapeStr(String s) {
        return unescapeStr(s, '\\');
    }

    /**
     * Replace escapes (\\u, \t, \n etc) in a string, using the given escape character.
     *
     * @param s          the string to process
     * @param escapeChar the character that introduces an escape
     * @return the string with escapes replaced
     * @throws AtlasException if the string contains a malformed or unknown escape
     */
    public static String unescapeStr(String s, char escapeChar) {
        return unescape(s, escapeChar, false);
    }

    /**
     * Unicode escapes \-u and \-U only. Any other backslash sequence is left in place.
     *
     * @param s the string to process
     * @return the string with Unicode escapes replaced
     * @throws AtlasException if a Unicode escape is malformed or the string ends in a backslash
     */
    public static String unescapeUnicode(String s) {
        return unescape(s, '\\', true);
    }

    /**
     * Main worker function for unescaping strings.
     * <p>
     * Recognises the 4-hex-digit (\-u) and 8-hex-digit (\-U) forms always. When
     * {@code pointCodeOnly} is false it additionally recognises
     * {@code n t r b f ' " \} after the escape character and rejects anything
     * else; when true, any other escaped character is copied through as a
     * backslash followed by that character.
     *
     * @param s             the string to process; returned unchanged if it has no escape character
     * @param escape        the character that introduces an escape
     * @param pointCodeOnly if true, only Unicode escapes are replaced
     * @return the unescaped string
     * @throws AtlasException if the string ends with the escape character, a
     *         Unicode escape is too short, an 8-digit escape is not a valid
     *         Unicode code point, or (when {@code pointCodeOnly} is false) the
     *         escape is not recognised. The hex parsing helper may also raise
     *         an error on non-hex digits (not verified here).
     */
    public static String unescape(String s, char escape, boolean pointCodeOnly) {
        int i = s.indexOf(escape);

        if ( i == -1 )
            return s;

        int len = s.length();
        // Dump the initial part straight into the string buffer.
        // The result is never longer than the input, so presize to the input length.
        StringBuilder sb = new StringBuilder(len);
        sb.append(s, 0, i);

        for ( ; i < len ; i++ ) {
            char ch = s.charAt(i);

            if ( ch != escape ) {
                sb.append(ch);
                continue;
            }

            // Escape
            if ( i >= len-1 )
                throw new AtlasException("Illegal escape at end of string");
            char ch2 = s.charAt(i+1);
            i = i + 1;

            // 4 hex digit form. i is at the 'u'; the digits are at i+1 .. i+4.
            if ( ch2 == 'u' ) {
                if ( i+4 >= len )
                    throw new AtlasException("\\u escape too short");
                int x4 = Hex.hexStringToInt(s, i+1, 4);
                sb.append((char)x4);
                // Jump 1 2 3 4 -- already skipped \ and u
                i = i+4;
                continue;
            }
            // 8 hex digit form. i is at the 'U'; the digits are at i+1 .. i+8.
            if ( ch2 == 'U' ) {
                if ( i+8 >= len )
                    throw new AtlasException("\\U escape too short");
                int ch8 = Hex.hexStringToInt(s, i+1, 8);
                // Validate before appending. Eight hex digits can express values
                // above Character.MAX_CODE_POINT, and a value with the top bit set
                // may come back negative (depends on the hex helper - not verified
                // here). A negative value must not be narrowed to a char.
                if ( !Character.isValidCodePoint(ch8) )
                    throw new AtlasException(String.format("Illegal codepoint: 0x%04X", ch8));
                // See also TokenerText.insertCodepoint and TokenerText.readUnicodeEscape
                // Appends one char for the BMP, a surrogate pair otherwise.
                // Note that the rest of any system this is used in must also
                // respect codepoints and surrogate pairs.
                sb.appendCodePoint(ch8);
                // Jump 1 2 3 4 5 6 7 8 -- already skipped \ and U
                i = i+8;
                continue;
            }

            // Are we doing just point code escapes?
            // If so, \X-anything else is legal as a literal "\" and "X"

            if ( pointCodeOnly ) {
                // NOTE(review): this writes a backslash even when 'escape' is some
                // other character - confirm that is intended for non-default escapes.
                sb.append('\\');
                sb.append(ch2);
                continue;
            }

            // Not just codepoints.  Must be a legal escape.
            char ch3;
            switch (ch2) {
                case 'n':  ch3 = '\n'; break;
                case 't':  ch3 = '\t'; break;
                case 'r':  ch3 = '\r'; break;
                case 'b':  ch3 = '\b'; break;
                case 'f':  ch3 = '\f'; break;
                case '\'': ch3 = '\''; break;
                case '\"': ch3 = '\"'; break;
                case '\\': ch3 = '\\'; break;
                default:
                    throw new AtlasException("Unknown escape: \\"+ch2);
            }
            sb.append(ch3);
        }
        return sb.toString();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy