org.simpleframework.http.parse.PathParser Maven / Gradle / Ivy

Go to download
/*
 * PathParser.java February 2001
 *
 * Copyright (C) 2001, Niall Gallagher 
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General 
 * Public License along with this library; if not, write to the 
 * Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
 * Boston, MA  02111-1307  USA
 */
 
package org.simpleframework.http.parse;

import java.io.Serializable;

import org.simpleframework.http.Path;
import org.simpleframework.util.parse.Parser;

/**
 * This is used to parse a path given as part of a URI. This will  read the
 * path, normalize it, and break it up into its components. The normalization
 * of the path is the conversion of the path given into it's actual path by
 * removing the references to the parent directorys and to the current dir.
 * 
 * If the path that this represents is /usr/bin/../etc/./README
 * then the actual path, normalized, is /usr/etc/README. Once
 * the path has been normalized it is possible to acquire the segments as
 * an array of strings, which allows simple manipulation of the path.
 * 

 * Although RFC 2396 defines the path within a URI to have parameters this
 * does not extract those parameters this will simply normalize the path and
 * include the path parameters in the path. If the path is to be converted
 * into a OS specific file system path that has the parameters extracted 
 * then the AddressParser should be used. 
 *
 * @author Niall Gallagher
 */
public class PathParser extends Parser implements Path{

   /**
    * Used to store the individual path segments.
    */
   private TokenList list;

   /**
    * Used to store consumed name characters.
    */
   private Token name;

   /**
    * Used to store consumed file extension.
    */
   private Token ext;

   /**
    * Used to store the highest directory path.
    */
   private Token dir;

   /**
    * Used to store consumed normalized path name.
    */
   private Token path;

   /**
    * The default constructor will create a PathParser that
    * contains no specifics. The instance will return null
    * for all the get methods. The PathParser's get methods
    * may be populated by using the parse method.
    */
   public PathParser() {
      this.list = new TokenList();
      this.ext = new Token();
      this.dir = new Token();
      this.path = new Token();
      this.name = new Token();   
   }

   /**
    * This is primarily a convineance constructor. This will parse the
    * String given to extract the specifics. This could be
    * achived by calling the default no-arg constructor and then using
    * the instance to invoke the parse method on that
    * String to extract the parts.
    *
    * @param path a String containing a path value
    */
   public PathParser(String path){
      this();
      parse(path);
   }

   /**
    * This will parse the path in such a way that it ensures that at no
    * stage there are trailing back references, using path normalization.  
    * The need to remove the back references is so that this
    * PathParser will create the same String
    * path given a set of paths that have different back references. For
    * example the paths /path/../path and /path
    * are the same path but different String's.
    * 

    * This will NOT parse an immediate back reference as this signifies
    * a path that cannot exist. So a path such as /../ will
    * result in a null for all methods. Paths such as ../bin
    * will not be allowed.
    */
   protected void parse() {
      normalize();
      path();
      segments();
      name();
      extension();
   }

   /**
    * This will initialize the parser so that it is in a ready state.
    * This allows the parser to be used to parse many paths. This will
    * clear the parse buffer objects and reset the offset to point to
    * the start of the char buffer. The count variable is reset by the
    * Parser.parse method.
    */
   protected void init() {
      list.clear();
      ext.clear();
      dir.clear();
      name.clear();
      path.clear();
      off = 0;
   }

   /**
    * This will return the extension that the file name contains.
    * For example a file name file.en_US.extension
    * will produce an extension of extension. This 
    * will return null if the path contains no file extension.
    *
    * @return this will return the extension this path contains
    */
   public String getExtension() {
      return ext.toString();
   }

   /**
    * This will return the full name of the file without the path.
    * As regargs the definition of the path in RFC 2396 the name
    * would be considered the last path segment. So if the path 
    * was /usr/README the name is README.
    * Also for directorys the name of the directory in the last
    * path segment is returned. This returns the name without any
    * of the path parameters. As RFC 2396 defines the path to have
    * path parameters after the path segments.
    *
    * @return this will return the name of the file in the path
    */ 
   public String getName(){
      return name.toString();
   }

   /**
    * This will return the normalized path. The normalized path is
    * the path without any references to its parent or itself. So
    * if the path to be parsed is /usr/../etc/./ the
    * path is /etc/. If the path that this represents
    * is a path with an immediate back reference then this will
    * return null. This is the path with all its information even
    * the parameter information if it was defined in the path.
    *
    * @return this returns the normalize path without
    *    ../ or ./
    */
   public String getPath() {
      return path.toString();
   }

   /**
    * This will return the highest directory that exists within 
    * the path. This is used to that files within the same path
    * can be acquired. An example of that this would do given
    * the path /pub/./bin/README would be to return
    * the highest directory path /pub/bin/. The "/"
    * character will allways be the last character in the path.
    *
    * @return this method will return the highest directory
    */
   public String getDirectory(){
      return dir.toString();
   }

   /**
    * This method is used to break the path into individual parts
    * called segments, see RFC 2396. This can be used as an easy
    * way to compare paths and to examine the directory tree that
    * the path points to. For example, if an path was broken from
    * the string /usr/bin/../etc then the segments
    * returned would be usr and etc as
    * the path is normalized before the segments are extracted.
    *
    * @return return all the path segments within the directory
    */
   public String[] getSegments(){
      return list.list();
   }

   /**
    * This will return the path as it is relative to the issued
    * path. This in effect will chop the start of this path if
    * it's start matches the highest directory of the given path
    * as of getDirectory. This is useful if paths 
    * that are relative to a specific location are required. To
    * illustrate what this method will do the following example
    * is provided. If this object represented the path string
    * /usr/share/rfc/rfc2396.txt and the issued
    * path was /usr/share/text.txt then this will
    * return the path string /rfc/rfc2396.txt.
    *
    * @param path the path prefix to acquire a relative path
    *
    * @return returns a path relative to the one it is given
    * otherwize this method will return null 
    */
   public String getRelative(String path){
      return getRelative(new PathParser(path));
   }

   /**
    * This is used by the getRelative(String) to
    * normalize the path string and determine if it contains a
    * highest directory which is shared with the path that is
    * represented by this object. If the path has leading back
    * references, such as ../, then the result of
    * this is null. The returned path begins with a '/'.
    *
    * @param path the path prefix to acquire a relative path
    *
    * @return returns a path relative to the one it is given
    * otherwize this method will return null 
    */
   private String getRelative(PathParser path){
      char[] text = path.buf;
      int off = path.dir.off;
      int len = path.dir.len;

      return getRelative(text, off, len);
   }

   /**
    * This will return the path as it is relative to the issued
    * path. This in effect will chop the start of this path if
    * it's start matches the highest directory of the given path
    * as of getDirectory. This is useful if paths 
    * that are relative to a specific location are required. To
    * illustrate what this method will do the following example
    * is provided. If this object represented the path string
    * /usr/share/rfc/rfc2396.txt and the issued
    * path was /usr/share/text.txt then this will
    * return the path string /rfc/rfc2396.txt.
    *
    * @param text the path prefix to acquire a relative path   
    * @param off this is the offset within the text to read
    * @param len this is the number of characters in the path
    *
    * @return returns a path relative to the one it is given
    * otherwize this method will return null 
    */
   private String getRelative(char[] text, int off, int len){
      int size = path.len - len + 1; /* '/' */
      int pos = path.off + len - 1;

      for(int i = 0; i < len; i++){
         if(text[off++] != buf[path.off+i]){
            return null;
         }
      }
      if(pos < 0) { /* ../ */
         return null;
      } 
      return new String(buf,pos,size);
   }

   /**
    * This will extract the path of the given String
    * after it has been normalized. If the path can not be normalized
    * then the count is set to -1 and the path cannot be extracted.
    * When this happens then the path parameter is null.
    */
   private void path() {
      if(count > 0){
         path.len = count;      
         path.off = 0;
      }
   }

   /**
    * This will simply read the characters from the end of the
    * buffer until it encounters the first peroid character. When
    * this is read it will store the file extension and remove the
    * characters from the buffer.
    */
   private void extension() {
      int pos = off + count; /* index.html[]*/
      int len = 0;

      while(pos-1 >= off) { /* index.htm[l]*/
         if(buf[--pos]=='.'){ /* index[.]html*/
            ext.off = pos+1;
            ext.len = len;
            count = pos;
            break;
         }
         len++;
      }
   }

   /**
    * This wil extract each individual segment from the path and
    * also extract the highest directory. The path segments are
    * basically the strings delimited by the '/' character of a
    * normalized path. As well as extracting the path segments
    * this will also extract the directory of path, that is, the
    * the path up to the last occurance of the '/' character. 
    */
   private void segments() {
      int pos = count - 1;
      int len = 1;

      if(count > 0){
         if(buf[pos] == '/'){ /* /pub/bin[/] */
            dir.len = pos+1;
            dir.off = 0;
            pos--; /* /pub/bi[n]/ */
         }
         while(pos >= off){
            if(buf[pos] == '/'){ /* /pub[/]bin/*/
               if(dir.len == 0){
                  dir.len = pos+1; /* [/] is 0*/
                  dir.off = 0;
               }
               list.add(pos+1,len-1); 
               len = 0;
            }
            len++;
            pos--;
         }
      }
   }

   /**
    * The normalization of the path is the conversion of the path
    * given into it's actual path by removing the references to
    * the parent directorys and to the current dir. So if the path
    * given was /usr/bin/../etc/./README then the actual
    * path, the normalized path, is /usr/etc/README.
    * 
    * This method ensures the if there are an illegal number of back
    * references that the path will be evaluated as empty. This can
    * evaluate any path configuration, this includes any references
    * like ../ or /.. within the path.
    * This will also remove empty segments like //.
    */
   private void normalize(){
      int size = count + off;
      int pos = off;
  
      for(off = count = 0; pos < size; pos++) {
         buf[count++] = buf[pos];

         if(buf[pos] == '/') {
            if(count -1 > 0){               
               if(buf[count -2] == '/') /* [/]/./path/ */
                  count--; /* /[/]./path/ */
            }
         } else if(buf[pos] == '.') { /* //[.]/path/ */
            if(count -1 > 0) { /* /[/]./path/ */
               if(buf[count - 2] !='/') /* /[/]./path./ */
                  continue; /* /path.[/] */
            }     
            if(pos + 2 > size){ /* /path/[.] */
               count--; 
            } else {
               if(buf[pos + 1] =='/'){ /* /.[/]path */ 
                  pos++;/* /[/]. */
                  count--; /* /.[/]path */ 
               }
               if(buf[pos] !='.'){ /* /.[/]path */
                  continue;            
               } 
               if(pos + 2< size){
                  if(buf[pos + 2]!='/') /* /..[p]ath */
                     continue; /* /[.].path */
               }  
               if(count - 2 > 0) {
                  for(count -= 2; count - 1 > 0;){ /* /path[/]..*/
                     if(buf[count - 1]=='/') { /* [/]path/..*/
                        break;           
                     }
                     count--; 
                  }
               }else { /* /../ */
                  count = 0;
                  off = 0;
                  break;
               }
               pos += 2; /* /path/.[.]/ */
            }
         }
      }
   }
   
   /**
    * This will extract the full name of the file without the path.
    * As regards the definition of the path in RFC 2396 the name
    * would be considered the last path segment. So if the path 
    * was /usr/README the name is README.
    * Also for directorys the name of the directory in the last
    * path segment is returned. This returns the name without any
    * of the path parameters. As RFC 2396 defines the path to have
    * path parameters after the path segments. So the path for the
    * directory "/usr/bin;param=value/;param=value" would result 
    * in the name "bin". If the path given was "/" then there will
    * be nothing in the buffer because extract will
    * have removed it.
    */    
   private void name(){
      int pos = count;
      int len = 0;
      
      while(pos-- > off) { /* /usr/bin/;para[m] */
         if(buf[pos]==';'){ /* /usr/bin/[;]param */
            if(buf[pos-1]=='/'){ /* /usr/bin[/];param */
               pos--;   /* /usr/bin[/];param */
            }
            len = 0;  /* /usr/bin[/]*/
         }else if(buf[pos]=='/'){ /* /usr[/]bin*/
            off = pos + 1; /* /usr/[b]in*/
            count = len; /* [b]in */
            break;
         }else{
            len++;
         }
      }
      name.len = count;
      name.off = off;
   }

   /**
    * This will return the normalized path. The normalized path is
    * the path without any references to its parent or itself. So
    * if the path to be parsed is /usr/../etc/./ the
    * path is /etc/. If the path that this represents
    * is a path with an immediate back reference then this will
    * return null. This is the path with all its information even
    * the parameter information if it was defined in the path.
    *
    * @return this returns the normalize path without
    *    ../ or ./
    */
   public String toString(){
      return getPath();
   }

   /**
    * This is used so that the PathParser can speed
    * up the parsing of the data. Rather than using a buffer like
    * a ParseBuffer or worse a StringBuffer
    * this just keeps an index into the character array from the
    * start and end of the token. Also this enables a cache to be
    * kept so that a String does not need to be made
    * again after the first time it is created.
    */ 
   private class Token implements Serializable {

      /**
       * Provides a quick retrival of the token value. 
       */
      public String value;

      /**
       * Offset within the buffer that the token starts.
       */
      public int off;

      /**
       * Length of the region that the token consumes.
       */
      public int len;

      /**
       * If the Token is to be reused this will clear
       * all previous data. Clearing the buffer allows it to be
       * reused if there is a new URI to be parsed. This ensures
       * that a null is returned if the token length is zero.
       */
      public void clear() {
         value = null;
         len = 0;
      }
      
      /**
       * This method will convert the Token into it's
       * String equivelant. This will firstly check
       * to see if there is a value, for the string representation,
       * if there is the value is returned, otherwise the region
       * is converted into a String and returned.
       *
       * @return this returns a value representing the token
       */
      public String toString() {
         if(value != null) {
            return value;
         }
         if(len > 0) {
            value = new String(buf,off,len);
         }
         return value;
      }
   }

   /**
    * The TokenList class is used to store a list of
    * tokens. This provides an add method which can
    * be used to store an offset and length of a token within 
    * the buffer. Once the tokens have been added to they can be
    * examined, in the order they were added, using the provided
    * list method. This has a scalable capacity.
    */    
   private class TokenList implements Serializable {
   
      /** 
       * Contains the offsets and lengths of the tokens.
       */
      private int[] list;

      /**
       * Determines the write offset into the array.
       */
      private int count;

      /**
       * Constructor for the TokenList is used to
       * create a scalable list to store tokens. The initial
       * list is created with an array of sixteen ints, which 
       * is enough to store eight tokens. 
       */
      private TokenList(){
         list = new int[16];
      }
      
      /**
       * This is used to add a new token to the list. Tokens
       * will be available from the list method in
       * the order it was added, so the first to be added will
       * at index zero and the last with be in the last index.
       *
       * @param off this is the read offset within the buffer
       * @param len the number of characters within the token
       */
      public void add(int off, int len){
         if(count+1 > list.length) {
            resize(count *2);
         }
         list[count++] = off;
         list[count++] = len;
      }

      /**
       * This is used to retrieve the list of tokens inserted
       * to this list using the add method. The
       * indexes of the tokens represents the order that the
       * tokens were added to the list.
       *
       * @return returns an ordered list of token strings 
       */
      public String[] list(){
         String[] value= new String[count/2];
         
         for(int i =0, j=count/2; i< count; i+=2){
            value[j-(i/2)-1] = new String(buf,
               list[i],list[i+1]);
         } 
         return value;
      }

      /**
       * This is used to clear all tokens previously stored
       * in the list. This is required so that initialization
       * of the parser with the init method can 
       * ensure that there are no tokens from previous data.
       */
      public void clear(){
         count =0;
      }

      /**
       * Scales the internal array used should the number of
       * tokens exceed the initial capacity. This will just
       * copy across the ints used to represent the token. 
       *
       * @param size length the capacity is to increase to 
       */     
      private void resize(int size){
         int[] copy = new int[size];
         System.arraycopy(list,0,copy,0,count);
         list = copy; 
      }
   }
}