com.oreilly.servlet.multipart.MultipartParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cos Show documentation
Cos
The newest version!
// Copyright (C) 1998-2001 by Jason Hunter .
// All rights reserved.  Use of this class is limited.
// Please see the LICENSE for more information.

package com.oreilly.servlet.multipart;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Locale;
import java.util.Vector;

import javax.servlet.ServletInputStream;
import javax.servlet.http.HttpServletRequest;

/** 
 * A utility class to handle multipart/form-data requests,
 * the kind of requests that support file uploads.  This class uses a 
 * "pull" model where the reading of incoming files and parameters is 
 * controlled by the client code, which allows incoming files to be stored 
 * into any OutputStream.  If you wish to use an API which 
 * resembles HttpServletRequest, use the "push" model 
 * MultipartRequest instead.  It's an easy-to-use wrapper 
 * around this class.
 * 
 * This class can receive arbitrarily large files (up to an artificial limit 
 * you can set), and fairly efficiently too.  
 * It cannot handle nested data (multipart content within multipart content).
 * As of the latest release it can also handle internationalized content
 * (such as non Latin-1 filenames).
 * 

 * It also optionally includes enhanced buffering and Content-Length
 * limitation.  Buffering is only required if your servlet container is 
 * poorly implemented (many are, including Tomcat 3.2),
 * but it is generally recommended because it will make a slow servlet 
 * container a lot faster, and will only make a fast servlet container a 
 * little slower.  Content-Length limiting is usually only required if you find 
 * that your servlet is hanging trying to read the input stream from the POST, 
 * and it is similarly recommended because it only has a minimal impact on 
 * performance.
 * 

 * See the included upload.war for an example of how to use this class.
 * 

 * The full file upload specification is contained in experimental RFC 1867,
 * available at 
 * http://www.ietf.org/rfc/rfc1867.txt.
 * 
 * @see com.oreilly.servlet.MultipartRequest
 * 
 * @author Jason Hunter
 * @author Geoff Soutter
 * @version 1.13, 2004/09/01, added workaround if content-length is -1
 * @version 1.12, 2004/05/17, added trim on disposition
 * @version 1.11, 2002/11/01, added constructor that takes an encoding, to
 *                            make sure chars are always read correctly
 * @version 1.10, 2002/11/01, added support for a preamble before the first
 *                            boundary marker
 * @version 1.9, 2002/11/01, added support to parse odd Opera Content-Type
 * @version 1.8, 2002/11/01, added support for lynx with unquoted param vals
 * @version 1.7, 2002/04/30, fixed bug if a line was '\n' alone
 * @version 1.6, 2002/04/30, added better internationalization support, thanks
 *                           to Changshin Lee
 * @version 1.5, 2002/04/30, added Opera header fix, thanks to Nic Ferrier
 * @version 1.4, 2001/03/23, added IE5 bug workaround supporting \n as line
 *                           ending, thanks to Michael Alyn Miller
 * @version 1.3, 2001/01/22, added support for boundaries surrounded by quotes
 *                           and content-disposition after content-type,
 *                           thanks to Scott Stark
 * @version 1.2, 2001/01/22, getFilePath() support thanks to Stefan Eissing
 * @version 1.1, 2000/10/29, integrating old WebSphere fix
 * @version 1.0, 2000/10/27, initial revision
 */
public class MultipartParser {

  /** default encoding */
  private static final String DEFAULT_ENCODING = "ISO-8859-1";

  /** content type prefix that every accepted request must carry */
  private static final String MULTIPART_FORM_DATA = "multipart/form-data";

  /** input stream to read parts from */
  private final ServletInputStream in;

  /** MIME boundary that delimits parts, including its leading "--" */
  private final String boundary;

  /** reference to the last file part we returned */
  private FilePart lastFilePart;

  /** buffer for readLine method */
  private final byte[] buf = new byte[8 * 1024];

  /** preferred encoding */
  private String encoding = DEFAULT_ENCODING;

  /**
   * Creates a MultipartParser from the specified request,
   * which limits the upload size to the specified length, buffers for 
   * performance and prevent attempts to read past the amount specified 
   * by the Content-Length.
   * 
   * @param req   the servlet request.
   * @param maxSize the maximum size of the POST content.
   * @exception IOException if the request is not a usable multipart/form-data
   *                        request or the stream cannot be read.
   */
  public MultipartParser(HttpServletRequest req, 
                         long maxSize) throws IOException {
    this(req, maxSize, true, true);
  }

  /**
   * Creates a MultipartParser from the specified request,
   * which limits the upload size to the specified length, and optionally 
   * buffers for performance and prevents attempts to read past the amount 
   * specified by the Content-Length. 
   * 
   * @param req   the servlet request.
   * @param maxSize the maximum size of the POST content.
   * @param buffer whether to do internal buffering or let the server buffer,
   *               useful for servers that don't buffer
   * @param limitLength boolean flag to indicate if we need to filter 
   *                    the request's input stream to prevent trying to 
   *                    read past the end of the stream.
   * @exception IOException if the request is not a usable multipart/form-data
   *                        request or the stream cannot be read.
   */
  public MultipartParser(HttpServletRequest req, long maxSize, boolean buffer,
                         boolean limitLength) throws IOException {
    this(req, maxSize, buffer, limitLength, null);
  }

  /**
   * Creates a MultipartParser from the specified request,
   * which limits the upload size to the specified length, and optionally 
   * buffers for performance and prevents attempts to read past the amount 
   * specified by the Content-Length, and with a specified encoding. 
   * On return the stream is positioned just after the first boundary line;
   * any preamble before it has been skipped.
   * 
   * @param req   the servlet request.
   * @param maxSize the maximum size of the POST content.
   * @param buffer whether to do internal buffering or let the server buffer,
   *               useful for servers that don't buffer
   * @param limitLength boolean flag to indicate if we need to filter 
   *                    the request's input stream to prevent trying to 
   *                    read past the end of the stream.
   * @param encoding the encoding to use for parsing, default is ISO-8859-1.
   * @exception IOException if the content type is not multipart/form-data,
   *                        the declared content length exceeds maxSize, no
   *                        boundary is specified, or the stream ends before
   *                        the first boundary line.
   */
  public MultipartParser(HttpServletRequest req, long maxSize, boolean buffer,
                         boolean limitLength, String encoding)
                                                throws IOException {
    // First make sure we know the encoding to handle chars correctly.
    // Thanks to Andreas Granzer for pointing out the need to have this
    // in the constructor.
    if (encoding != null) {
      setEncoding(encoding);
    }

    // Check the content type to make sure it's "multipart/form-data"
    // Access header two ways to work around WebSphere oddities
    String type = chooseContentType(req.getHeader("Content-Type"),
                                    req.getContentType());
    if (type == null || !startsWithIgnoreCase(type, MULTIPART_FORM_DATA)) {
      throw new IOException("Posted content type isn't multipart/form-data");
    }

    // Check the content length to prevent denial of service attacks
    long length = req.getContentLengthLong();
    if (length > maxSize) {
      throw new ExceededSizeException("Posted content length of " + length + " exceeds limit of " + maxSize);
    }

    // Get the boundary string; it's included in the content type.
    // Should look something like "------------------------12012133613061"
    String parsedBoundary = extractBoundary(type);
    if (parsedBoundary == null) {
      throw new IOException("Separation boundary was not specified");
    }

    // If required, wrap the real input stream with classes that 
    // "enhance" its behaviour for performance and stability
    ServletInputStream stream = req.getInputStream();
    if (buffer) {
      stream = new BufferedServletInputStream(stream);
    }
    // The limit is skipped when the length is not positive
    // (for example -1 when the content length is unknown).
    if (limitLength && length > 0) {
      stream = new LimitedServletInputStream(stream, length);
    }

    // Save our values for later
    this.in = stream;
    this.boundary = parsedBoundary;

    // Read until we hit the boundary
    // Some clients send a preamble (per RFC 2046), so ignore that
    // Thanks to Ben Johnson for pointing out the need for preamble support.
    while (true) {
      String line = readLine();
      if (line == null) {
        throw new IOException("Corrupt form data: premature ending");
      }
      if (line.startsWith(parsedBoundary)) {
        break;  // success
      }
    }
  }

  /**
   * Sets the encoding used to parse from here onward.  The default is
   * ISO-8859-1.  Encodings are actually best passed into the constructor,
   * so even the initial line reads are correct.
   *
   * @param encoding The encoding to use for parsing; null restores the
   *                 default rather than failing later while decoding.
   */
  public void setEncoding(String encoding) {
    this.encoding = (encoding != null) ? encoding : DEFAULT_ENCODING;
  }

  /**
   * Read the next part arriving in the stream. Will be either a 
   * FilePart or a ParamPart, or null
   * to indicate there are no more parts to read. The order of arrival 
   * corresponds to the order of the form elements in the submitted form.
   * 
   * @return either a FilePart, a ParamPart or
   *        null if there are no more parts to read.
   * @exception IOException	if an input or output exception has occurred.
   * 
   * @see FilePart
   * @see ParamPart
   */
  public Part readNextPart() throws IOException {
    // Make sure the last file was entirely read from the input
    if (lastFilePart != null) {
      lastFilePart.getInputStream().close();
      lastFilePart = null;
    }

    String line = readLine();
    if (line == null || line.length() == 0) {
      // Either no parts are left, or we hit the empty line that IE4 on Mac
      // sends at the end; treat both as the end.
      // Thanks to Daniel Lemire and Henri Tourigny for this fix.
      return null;
    }

    // Read the headers; they look like this (not all may be present):
    // Content-Disposition: form-data; name="field1"; filename="file1.txt"
    // Content-Type: type/subtype
    // Content-Transfer-Encoding: binary
    //
    // Read the following header lines until we hit an empty line.
    // A line starting with whitespace is considered a continuation;
    // that requires a little special logic.  Thanks to Nic Ferrier for
    // identifying a good fix.
    List<String> headers = new ArrayList<>();
    while (line != null && line.length() > 0) {
      String nextLine = readLine();
      while (nextLine != null
             && (nextLine.startsWith(" ") || nextLine.startsWith("\t"))) {
        line = line + nextLine;
        nextLine = readLine();
      }
      headers.add(line);
      line = nextLine;
    }

    // If we got a null above, it's the end
    if (line == null) {
      return null;
    }

    String name = null;
    String filename = null;
    String origname = null;
    String contentType = "text/plain";  // rfc1867 says this is the default

    for (String headerline : headers) {
      if (startsWithIgnoreCase(headerline, "content-disposition:")) {
        // Parse the content-disposition line
        String[] dispInfo = extractDispositionInfo(headerline);
        // dispInfo[0] is the disposition itself, not currently used
        name = dispInfo[1];
        filename = dispInfo[2];
        origname = dispInfo[3];
      }
      else if (startsWithIgnoreCase(headerline, "content-type:")) {
        // Get the content type, or null if none specified
        String type = extractContentType(headerline);
        if (type != null) {
          contentType = type;
        }
      }
    }

    // Now, finally, we read the content (end after reading the boundary)
    if (filename == null) {
      // This is a parameter
      // The encoding is needed to help parse the value
      return new ParamPart(name, in, boundary, encoding);
    }

    // This is a file
    if (filename.equals("")) {
      filename = null; // empty filename, probably an "empty" file param
    }
    lastFilePart = new FilePart(name, in, boundary,
                                contentType, filename, origname);
    return lastFilePart;
  }

  /**
   * Picks the content type to trust when the header and the container
   * disagree: the non-null one, or the longer one when both are present.
   *
   * @param fromHeader the raw "Content-Type" header value, may be null.
   * @param fromRequest the container-reported content type, may be null.
   * @return the chosen value, or null if both are null.
   */
  private static String chooseContentType(String fromHeader,
                                          String fromRequest) {
    if (fromHeader == null) {
      return fromRequest;
    }
    if (fromRequest == null) {
      return fromHeader;
    }
    return (fromHeader.length() > fromRequest.length()
            ? fromHeader : fromRequest);
  }

  /**
   * Extracts and returns the boundary token from a line.
   * 
   * @param line the content type value holding the "boundary=" parameter.
   * @return the boundary token prefixed with "--", or null if the line holds
   *         no usable boundary (missing, empty, or with an unclosed quote).
   */
  private static String extractBoundary(String line) {
    final String key = "boundary=";

    // Use lastIndexOf() because IE 4.01 on Win98 has been known to send the
    // "boundary=" string multiple times.  Thanks to David Wall for this fix.
    int index = line.lastIndexOf(key);
    if (index == -1) {
      return null;
    }

    String value = line.substring(index + key.length()).trim();
    if (value.startsWith("\"")) {
      // The boundary is enclosed in quotes, strip them.  Anything after the
      // closing quote (such as a following parameter) is not part of it.
      int closing = value.indexOf('"', 1);
      if (closing == -1) {
        return null;
      }
      value = value.substring(1, closing);
    }
    else {
      // An unquoted value ends at the next parameter separator, if any
      int separator = value.indexOf(';');
      if (separator != -1) {
        value = value.substring(0, separator).trim();
      }
    }

    if (value.length() == 0) {
      return null;
    }

    // The real boundary is always preceded by an extra "--"
    return "--" + value;
  }

  /**
   * Extracts and returns disposition info from a line, as a String
   * array with elements: disposition, name, filename, original filename.
   * Keywords are matched case-insensitively against the line itself, so
   * the values are returned with their original case.
   * 
   * @param line a header line starting with "content-disposition:".
   * @return String[] of elements: disposition (lowercased), name, filename
   *         (path stripped, null if none given), filename as sent (null if
   *         none given).
   * @exception  IOException if the line is malformatted.
   */
  private static String[] extractDispositionInfo(String line)
                                                throws IOException {
    final String headerKey = "content-disposition:";
    final String quotedNameKey = "name=\"";
    final String bareNameKey = "name=";
    final String filenameKey = "filename=\"";

    // Get the content disposition, should be "form-data"
    int start = indexOfIgnoreCase(line, headerKey, 0);
    int valueStart = start + headerKey.length();
    int end = (start == -1) ? -1 : line.indexOf(';', valueStart);
    if (start == -1 || end == -1) {
      throw new IOException("Content disposition corrupt: " + line);
    }
    String disposition =
      line.substring(valueStart, end).trim().toLowerCase(Locale.ROOT);
    if (!disposition.equals("form-data")) {
      throw new IOException("Invalid content disposition: " + disposition);
    }

    // Get the field name, searching from the semicolon after the disposition
    int nameStart = indexOfParameter(line, quotedNameKey, end);
    int nameEnd = (nameStart == -1)
      ? -1 : line.indexOf('"', nameStart + quotedNameKey.length());
    int valueOffset = quotedNameKey.length();
    if (nameStart == -1 || nameEnd == -1) {
      // Some browsers like lynx don't surround with ""
      // Thanks to Deon van der Merwe for noticing
      nameStart = indexOfParameter(line, bareNameKey, end);
      if (nameStart == -1) {
        throw new IOException("Content disposition corrupt: " + line);
      }
      nameEnd = line.indexOf(';', nameStart + bareNameKey.length());
      if (nameEnd == -1) {
        nameEnd = line.length();
      }
      valueOffset = bareNameKey.length();
    }
    String name = line.substring(nameStart + valueOffset, nameEnd);

    // Get the filename, if given; it is looked for after the name
    String filename = null;
    String origname = null;
    int fileStart = indexOfIgnoreCase(line, filenameKey, nameEnd);
    int fileEnd = (fileStart == -1)
      ? -1 : line.indexOf('"', fileStart + filenameKey.length());
    if (fileStart != -1 && fileEnd != -1) {
      origname = line.substring(fileStart + filenameKey.length(), fileEnd);
      // The filename may contain a full path.  Cut to just the filename.
      int slash =
        Math.max(origname.lastIndexOf('/'), origname.lastIndexOf('\\'));
      filename = (slash > -1) ? origname.substring(slash + 1) : origname;
    }

    // empty filename denotes no file posted!
    return new String[] { disposition, name, filename, origname };
  }

  /**
   * Extracts and returns the content type from a line, or null if the
   * line carries no value.
   * 
   * @param line a header line starting with "content-type:".
   * @return the lowercased content type without any trailing parameters,
   *         or null if it is empty.
   */
  private static String extractContentType(String line) {
    final int valueStart = "content-type:".length();

    // Get the content type, if any
    // Note that Opera at least puts extra info after the type, so handle
    // that.  For example:  Content-Type: text/plain; name="foo"
    // Thanks to Leon Poyyayil for noticing this.
    int end = line.indexOf(';');
    if (end == -1) {
      end = line.length();
    }
    if (end < valueStart) {
      return null;
    }

    String type = line.substring(valueStart, end).trim();
    return (type.length() == 0) ? null : type.toLowerCase(Locale.ROOT);
  }

  /**
   * Tests whether text begins with prefix, ignoring case and independent
   * of the default locale.
   */
  private static boolean startsWithIgnoreCase(String text, String prefix) {
    return text.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  /**
   * Finds the first case-insensitive occurrence of needle in text at or
   * after from.  The returned index refers to text itself, so it is safe
   * to use with text.substring().
   *
   * @return the index of the match, or -1 if there is none.
   */
  private static int indexOfIgnoreCase(String text, String needle, int from) {
    int last = text.length() - needle.length();
    for (int i = Math.max(from, 0); i <= last; i++) {
      if (text.regionMatches(true, i, needle, 0, needle.length())) {
        return i;
      }
    }
    return -1;
  }

  /**
   * Like indexOfIgnoreCase(), but skips matches that are the tail of a
   * longer word, so that "name=" is not found inside "filename=".
   *
   * @return the index of the match, or -1 if there is none.
   */
  private static int indexOfParameter(String line, String key, int from) {
    int index = indexOfIgnoreCase(line, key, from);
    while (index > 0 && Character.isLetterOrDigit(line.charAt(index - 1))) {
      index = indexOfIgnoreCase(line, key, index + 1);
    }
    return index;
  }

  /**
   * Read the next line of input.
   * 
   * @return     a String containing the next line of input from the stream,
   *        without its line terminator,
   *        or null to indicate the end of the stream.
   * @exception IOException	if an input or output exception has occurred.
   */
  private String readLine() throws IOException {
    // Collect the raw bytes of the whole line before decoding, so that a
    // multi-byte character is never split between two buffer fills.
    ByteArrayOutputStream lineBytes = new ByteArrayOutputStream(256);
    int count;
    do {
      count = in.readLine(buf, 0, buf.length);
      if (count > 0) {
        lineBytes.write(buf, 0, count);
      }
      // Loop only if the buffer was filled without reaching the end of the
      // line; a full buffer that ends in \n is a complete line.
    } while (count == buf.length && buf[count - 1] != '\n');

    if (lineBytes.size() == 0) {
      return null;  // nothing read, must be at the end of stream
    }

    String line = lineBytes.toString(encoding);

    // Cut off the trailing \n or \r\n
    // It should always be \r\n but IE5 sometimes does just \n
    // Thanks to Luke Blaikie for helping make this work with \n
    if (line.endsWith("\r\n")) {
      return line.substring(0, line.length() - 2);
    }
    if (line.endsWith("\n")) {
      return line.substring(0, line.length() - 1);
    }
    return line;
  }
}