ucar.nc2.util.EscapeStrings Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cdm Show documentation
The NetCDF-Java Library is a Java interface to NetCDF files, as well as to many other types of scientific data formats.
The newest version!
/////////////////////////////////////////////////////////////////////////////
// This file is part of the "Java-DAP" project, a Java implementation
// of the OPeNDAP Data Access Protocol.
//
// Copyright (c) 2010, OPeNDAP, Inc.
// Copyright (c) 2002,2003 OPeNDAP, Inc.
// 
// Author: James Gallagher 
// 
// All rights reserved.
// 
// Redistribution and use in source and binary forms,
// with or without modification, are permitted provided
// that the following conditions are met:
// 
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
// 
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
// 
// - Neither the name of the OPeNDAP nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
// 
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, O
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/////////////////////////////////////////////////////////////////////////////


package ucar.nc2.util;

import ucar.nc2.constants.CDM;
import ucar.httpservices.HTTPSession;

import java.net.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * User: ndp
 * Date: Jul 7, 2006
 * Time: 10:23:19 AM
 */
public class EscapeStrings
{

    //////////////////////////////////////////////////////////////////////////
    static public org.slf4j.Logger log
                = org.slf4j.LoggerFactory.getLogger(HTTPSession.class);
    //////////////////////////////////////////////////////////////////////////

/*
    // Sets of ascii characters
    static final String alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    static final String numeric = "0123456789";
    static final String alphaNumeric = alpha + numeric;

    // Experimentally determined url and query
    // legal and illegal chars as defined by apache httpclient3
    // Sets are larger than strictly necessary
    static final String httpclient_urllegal   = "!#$&'()*+,-./:;=?@_~";
    static final String httpclient_querylegal = "!#$&'()*+,-./:;=?@_~%"; // % is difference
    static final String httpclient_urlillegal   = " \"<>[\\]^`{|}%";
    static final String httpclient_queryillegal = " \"<>[\\]^`{|}";  // % is difference


    // Set of all ascii printable non-alphanumeric (aka nan) characters
    static private final String nonAlphaNumeric = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~" ;

    static private final String queryReserved
        = httpclient_queryillegal; // " ?&=,+;#"; // special parsing meaning in queries
    static private final String urlReserved
        = httpclient_urlillegal; // ":/#?"; // special parsing meaning in url

    // We assume that whoever constructs a url (minus the query)
    // has properly percent encoded whatever characters need to be encoded.
    // Sets of characters absolutely DIS-allowed in url.
    static private final String urlDisallowed
                                = stringUnion(urlReserved); // what else?
    // Complement of urlDisallowed
    static private final String urlAllowed = stringDiff(nonAlphaNumeric,urlDisallowed);

    // This is set of legal characters that can appear unescaped in a url
    static private final String _allowableInUrl= alphaNumeric + urlAllowed;

    // Sets of characters absolutely DIS-allowed in query string identifiers.
    // Basically, this set is determined by what kind of query parsing needs to occur.
    static private final String queryIdentDisallowed
                                = stringUnion(queryReserved+"\"\\^`|<>[]{}");

    // Complement of queryIdentDisallowed
    static private final String queryIdentAllowed = stringDiff(nonAlphaNumeric,queryIdentDisallowed);

    // This is set of legal characters that can appear unescaped in a url query.
    static private final String _allowableInUrlQuery = alphaNumeric + queryIdentAllowed;

    // Define the set of characters allowable in DAP Identifiers
    static private final String dapSpecAllowed = "_!~*'-\"" ; //as specified in dap2 spec
    static private final String _namAllowedInDAP = dapSpecAllowed
                                                   + "./"; // for groups and structure names
    static private final String _allowableInDAP = alphaNumeric + _namAllowedInDAP;


    // This is set of legal characters that can appear unescaped in an OGC query.
    // See OGC Web Services Common 2.0.0 section 11.3
    static private final String _namAllowedInOGC = "-_.!~*'()";
    static private final String _disallowedInOGC
			        = stringUnion(queryReserved+stringDiff(nonAlphaNumeric,_namAllowedInOGC));
    static private final String _allowableInOGC = alphaNumeric + _namAllowedInOGC;

    static private final char _URIEscape = '%';

    // These are the DEFINITIVE set of non-alphanumeric characters that are legal
    // in opendap identifiers (according to DAP2 protocol spec).
    // Add '/' to support group names

    static private String opendap_identifier_special_characters = "_!~*'-\"" // see dap spec.
                                                                  + "./" ; // less strict

    // The complete set of legal opendap identifier characters
    public static String opendap_identifier_characters
	 	         = alphaNumeric + opendap_identifier_special_characters;
*/

    static protected final String alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    static protected final String numeric = "0123456789";
    static protected final String alphaNumeric = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
    static protected final String httpclient_urllegal   = "!#$&'()*+,-./:;=?@_~";
    static protected final String httpclient_querylegal = "!#$&'()*+,-./:;=?@_~%";
    static protected final String httpclient_urlillegal   = " \"<>[\\]^`{|}%";
    static protected final String httpclient_queryillegal = " \"<>[\\]^`{|}";
    static protected final String nonAlphaNumeric = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~";
    static protected final String queryReserved = " \"<>[\\]^`{|}";
    static protected final String urlReserved = " \"<>[\\]^`{|}%";
    static protected final String urlDisallowed = " \"<>[\\]^`{|}%";
    static protected final String urlAllowed = "!#$&'()*+,-./:;=?@_~";
    static protected final String _allowableInUrl = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!#$&'()*+,-./:;=?@_~";
    static protected final String queryIdentDisallowed = " \"\\^`|<>[]{}";
    static protected final String queryIdentAllowed = "!#$%&'()*+,-./:;=?@_~";
    static protected final String _allowableInUrlQuery = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!#$%&'()*+,-./:;=?@_~";
    static protected final String dapSpecAllowed = "_!~*'-\"";
    static protected final String _namAllowedInDAP = "_!~*'-\"./";
    static protected final String _allowableInDAP = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_!~*'-\"./";
    static protected final String _namAllowedInOGC = "-_.!~*'()";
    static protected final String _disallowedInOGC = " \"#$%&+,/:;<=>?@[]\\^`|{}";
    static protected final String _allowableInOGC = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.!~*'()";
    static protected final String opendap_identifier_special_characters = "_!~*'-\"./";
    static protected final String opendap_identifier_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_!~*'-\"./";
    static protected final char _URIEscape = '%';

    /*
     * s1 union s2
     */
    static private String stringUnion(String s)
    {
        StringBuilder union = new StringBuilder();
        for(int i=0;i= 0) continue; // later occurrence
            union.append(c);
        }
        return union.toString();
    }

    /**
     * Replace all characters in the String in not present in the String allowable with
     * their hexidecimal values (encoded as UTF8) and preceeded by the String esc
     * 
     * The esc character may not appear on the allowable list, as if it did it would break the 1:1
     * and onto mapping between the unescaped character space and the escaped characater space.
     *
     * @param in        The string in which to replace characters.
     * @param allowable The set of allowable characters.
     * @param esc       The escape String (typically "%" for a URI or "\" for a regular expression).
     * @param spaceplus True if spaces should be replaced by '+'.
     * @return The modified identifier.
     */

    // Useful constants
    static final byte blank = ((byte)' ');
    static final byte plus = ((byte)'+');

    private static String xescapeString(String in, String allowable, char esc, boolean spaceplus)
    {
        try {
            StringBuffer out = new StringBuffer();
            if (in == null) return null;
            byte[] utf8 = in.getBytes(CDM.utf8Charset);
            byte[] allow8 = allowable.getBytes(CDM.utf8Charset);
            for(byte b: utf8) {
                if(b == blank && spaceplus) {
                   out.append('+');
                } else {
                    // search allow8
                    boolean found = false;
                    for(byte a: allow8) {
                        if(a == b) {found = true; break;}
                    }
                    if(found) {out.append((char)b);}
                    else {
                        String c = Integer.toHexString(b);
                        out.append(esc);
                        if (c.length() < 2) out.append('0');
                        out.append(c);
                    }
                }
            }

            return out.toString();

        } catch (Exception e) {
            return in;
        }
    }

    private static String escapeString(String in, String allowable)
   {return xescapeString(in,allowable,_URIEscape,false);}

    /**
     * Given a string that contains WWW escape sequences, translate those escape
     * sequences back into ASCII characters. Return the modified string.
     *
     * @param in     The string to modify.
     * @param escape The character used to signal the begining of an escape sequence.
     * param except If there is some escape code that should not be removed by
     *               this call (e.g., you might not want to remove spaces, %20) use this
     *               parameter to specify that code. The function will then transform all
     *               escapes except that one.
     * @param spaceplus True if spaces should be replaced by '+'.
     * @return The modified string.
     */

    private static String xunescapeString(String in, char escape, boolean spaceplus)
    {
        try {
            if (in == null) return null;

            byte[] utf8 = in.getBytes(CDM.utf8Charset);
            byte escape8 = (byte)escape;
            byte[] out = new byte[utf8.length]; // Should be max we need

            int index8 = 0;
            for(int i=0;i= hex0 && b <= hex9) return (byte)(b - hex0);
        if(b >= hexa && b <= hexf) return (byte)(ten + (b - hexa));
        if(b >= hexA && b <= hexF) return (byte)(ten + (b - hexA));
        throw new NumberFormatException("Illegal hex character: "+b);
    }


    /**
    * Define the DEFINITIVE opendap identifier unescape function.
    * @param id The identifier to unescape.
    * @return The unescaped identifier.
    */
    public static String unescapeDAPIdentifier(String id)
    {
        String s;
        try {
            s = unescapeString(id);
        } catch (Exception e) {
            s = null;
        }
        return s;
    }

    /**
     * Define the DEFINITIVE URL escape function.
     * Beware, this is a rather complex operation
     *
     * @param url The url string
     * @return The escaped expression.
     */
     static private final Pattern p
            = Pattern.compile("([\\w]+)://([.\\w]+(:[\\d]+)?)([/][^?#])?([?][^#]*)?([#].*)?");

     public static String escapeURL(String url)
     {
        String protocol;
        String authority;
        String path;
        String query;
        String fragment;
        if(false) {
            // We split the url ourselves to minimize character dependencies
            Matcher m = p.matcher(url);
            boolean match = m.matches();
            if(!match) return null;
            protocol = m.group(1);
            authority = m.group(2);
            path = m.group(3);
            query = m.group(4);
            fragment = m.group(5);
        } else {// faster, but may not work quite right
            URL u;
            try {u = new URL(url);} catch (MalformedURLException e) {
                return null;
            }
            protocol = u.getProtocol();
            authority = u.getAuthority();
            path = u.getPath();
            query = u.getQuery();
            fragment = u.getRef();
        }
        // Reassemble
        StringBuilder ret = new StringBuilder(protocol);
        ret.append("://");
        ret.append(authority);
        if(path != null && path.length() > 0) {
            // Encode pieces between '/'
            String pieces[] = path.split("[/]",-1);
            for(int i=0;i 0) ret.append("/");
                ret.append(urlEncode(p));
            }
        }
        if(query != null && query.length() > 0) {
            ret.append("?");
            ret.append(escapeURLQuery(query));
        }
        if(fragment != null && fragment.length() > 0) {
            ret.append("#");
            ret.append(urlEncode(fragment));
        }
        return ret.toString();
     }

     static int nextpiece(String s, int index, String sep)
     {
         index = s.indexOf(sep,index);
         if(index < 0)
             index = s.length();
         return index;
     }

    /**
     * Define the DEFINITIVE URL constraint expression escape function.
     *
     * @param ce The expression to modify.
     * @return The escaped expression.
     */
     public static String escapeURLQuery(String ce)
     {
        try {
            ce = escapeString(ce, _allowableInUrlQuery);
        } catch(Exception e) {ce = null;}
        return ce;
     }

    /**
     * Define the DEFINITIVE URL constraint expression unescape function.
     *
     * @param ce The expression to unescape.
     * @return The unescaped expression.
     */
     public static String unescapeURLQuery(String ce)
     {
        try {
            ce = unescapeString(ce);
        } catch(Exception e) {ce = null;}
        return ce;
     }

    /**
     * Define the DEFINITIVE URL escape function.
     * Note that the whole string is escaped, so
     * be careful what you pass into this procedure.
     *
     * @param s The string to modify.
     * @return The escaped expression.
     */
     private static String urlEncode(String s)
     {
        //try {s = URLEncoder.encode(s,"UTF-8");} catch(Exception e) {s = null;}
        s = escapeString(s, _allowableInUrl);
        return s;
     }

    /**
     * Define the DEFINITIVE URL unescape function.
     *
     * @param s The string to unescape.
     * @return The unescaped expression.
     */
     public static String urlDecode(String s)
     {
        try {
            //s = unescapeString(s, _URIEscape, "", false);
            s = URLDecoder.decode(s,"UTF-8");
        } catch(Exception e) {s = null;}
        return s;
     }

    /**
     *  Decode all of the parts of the url including query and fragment
     * @param url  the url to encode
     */
     public static String unescapeURL(String url)
     {
        String newurl;
        newurl = urlDecode(url);
        return newurl;
     }


    /**
     * Define the DAP escape identifier function.
     *
     * @param s The string to encode.
     * @return The escaped string.
     */
     public static String escapeDAPIdentifier(String s)
     {
        return escapeString(s, _allowableInDAP);
     }

    /**
     * Define the OGC Web Services escape function.
     *
     * @param s The string to encode.
     * @return The escaped string.
     */
     public static String escapeOGC(String s)
     {
        return escapeString(s, _allowableInOGC);
     }

    static public void testOGC()
    {
        for (char c : (alphaNumeric + " !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~").toCharArray()) {
            String encoded = EscapeStrings.escapeOGC("" + c);
            System.err.printf("|%c|=|%s|%n", c, encoded);
        }
    }

    /**
     * Define the OGC unescape function.
     *
     * @param s The string to unescape.                                                                 b
     * @return The unescaped string.
     */
     public static String unescapeOGC(String s)
     {
        return urlDecode(s);
     }


  ///////////////////////////////////////////////////////////////

  /**
   * backslash escape a string
   * @param x escape this; may be null
   * @param reservedChars these chars get a backslash in front of them
   * @return escaped string
   */
  static public String backslashEscape(String x, String reservedChars) {
    if (x == null) {
      return null;
    } else if (reservedChars == null) {
      return x;
    }

    boolean ok = true;
    for (int pos = 0; pos < x.length(); pos++) {
      char c = x.charAt(pos);
      if (reservedChars.indexOf(c) >= 0) {
        ok = false;
        break;
      }
    }
    if (ok) return x;

    // gotta do it
    StringBuilder sb = new StringBuilder(x);
    for (int pos = 0; pos < sb.length(); pos++) {
      char c = sb.charAt(pos);
      if (reservedChars.indexOf(c) < 0) {
        continue;
      }

      sb.setCharAt(pos, '\\');
      pos++;
      sb.insert(pos, c);
      pos++;
    }

    return sb.toString();
  }

  /**
   * backslash unescape a string
   * @param x unescape this
   * @return string with \c -> c
   */
  static public String backslashUnescape(String x) {
    if (!x.contains("\\")) return x;

    // gotta do it
    StringBuilder sb = new StringBuilder(x.length());
    for (int pos = 0; pos < x.length(); pos++) {
      char c = x.charAt(pos);
      if (c == '\\') {
        c = x.charAt(++pos); // skip backslash, get next cha
      }
      sb.append(c);
    }

    return sb.toString();
  }


  /**
   * Tokenize an escaped name using "." as delimiter, skipping "\."
   *
   * @param escapedName an escaped name
   * @return list of tokens
   */
  public static List tokenizeEscapedName(String escapedName) {
    List result = new ArrayList<>();
    int pos = 0;
    int start = 0;
    while (true) {
      pos = escapedName.indexOf(sep, pos+1);
      if (pos <= 0) break;
      if ((pos > 0) && escapedName.charAt(pos-1) != '\\') {
        result.add(escapedName.substring(start, pos));
        start = pos+1;
      }
    }
    result.add(escapedName.substring(start, escapedName.length())); // remaining
    return result;
  }
  static private final int sep = '.';

  /**
   * Find first occurence of char c in escapedName, excluding escaped c.
   * @param escapedName search in this string
   * @param c for this char but not \\cha
   * @return  pos in string, or -1
   */
  public static int indexOf(String escapedName, char c) {
    int pos = 0;
    while (true) {
      pos = escapedName.indexOf(c, pos+1);
      if (pos <= 0) return pos;
      if ((pos > 0) && escapedName.charAt(pos-1) != '\\') return pos;
    }
  }

  public static void main2(String[] args) {
    String s = "http://thredds.ucar.edu/thredds/dodsC/fmrc/NCEP/GFS/Global_0p5deg/runs/NCEP-GFS-Global_0p5deg_RUN_2011-07-15T00:00:00Z.html.asc?Total_cloud_cover_low_cloud%5B1:1:1%5D%5B0:1:360%5D%5B0:1:719%5D";
    log.debug("%s%n", s);
    log.debug("%s%n", unescapeURL(s));
  }

  public static void main(String[] args) {
    String s = "https://localhost:8443/thredds/admin/log/access/";
    System.out.printf("%s%n", s);
    System.out.printf("%s%n", escapeURL(s));
  }

    public static void mainOld(String[] args) throws Exception
    {
        /* Ignore

        if (args.length > 0) {
            for (String s : args) {
                LogStream.out.println("id2www - Input: \"" + s + "\"   Output: \"" + id2www(s) + "\"   recaptured: " + www2id(id2www(s)));
            }
            for (String s : args) {
                String out = id2www(s);
                LogStream.out.println("www2id - Input: \"" + out + "\"   Output: \"" + www2id(out) + "\" recaptured: " + id2www(www2id(out)));
            }

        } else {
            char[] allBytes = new char[256];

            for (int b = 0; b < 256; b++)
                allBytes[b] = (char) b;
            String allChars = new String(allBytes);
            LogStream.out.println("id2www All Characters");
            LogStream.out.println("Input String:      \"" + allChars + "\"");
            LogStream.out.println("Output String:     \"" + id2www(allChars) + "\"");
            LogStream.out.println("Recaptured String: \"" + www2id(id2www(allChars)) + "\" ");
        }
        */
    }



/**
 * This method is used to normalize strings prio
 * to their inclusion in XML documents. XML has certain parsing requirements
 * around reserved characters. These reserved characters must be replaced with
 * symbols recognized by the XML parser as place holder for the actual symbol.
 * 

 * The rule for this normalization is as follows:
 * 

 * 

 *  The < (less than) character is replaced with &lt;
 * 
 The > (greater than) character is replaced with &gt;
 * 
 The & (ampersand) character is replaced with &amp;
 * 
 The ' (apostrophe) character is replaced with &apos;
 * 
 The " (double quote) character is replaced with &quot;
 * 
 *
 * @param s The String to be normalized.
 * @return The normalized String.
 */

    // Some handy definitons.
    static final String xmlGT = ">";
    static final String xmlLT = "<";
    static final String xmlAmp = "&";
    static final String xmlApos = "'";
    static final String xmlQuote = """;

public static String normalizeToXML(String s)
{
    StringBuffer sb = new StringBuffer(s);
    for (int offset = 0; offset < sb.length(); offset++) {
        char c = sb.charAt(offset);
        switch (c) {
            case '>': // GreaterThan
                sb.replace(offset, offset + 1, xmlGT);
                break;
            case '<': // Less Than
                sb.replace(offset, offset + 1, xmlLT);
                break;
            case '&': // Ampersand
                sb.replace(offset, offset + 1, xmlAmp);
                break;
            case '\'': // Single Quote
                sb.replace(offset, offset + 1, xmlApos);
                break;
            case '\"': // Double Quote
                sb.replace(offset, offset + 1, xmlQuote);
                break;
            default: break;
        }
    }
    return (sb.toString());

}

    /**
     * Given a backslash escaped name,
     * convert to a DAP escaped name
     * 
     * @param bs the string to DAP encode; may have backslash escapes
     * @return escaped string
    */

    public static String backslashToDAP(String bs)
    {
	StringBuilder buf = new StringBuilder();
	int len = bs.length();	
	for(int i=0;i