// com.oreilly.servlet.multipart.MultipartParser (Maven / Gradle / Ivy)
// Copyright (C) 1998-2001 by Jason Hunter .
// All rights reserved. Use of this class is limited.
// Please see the LICENSE for more information.
package com.oreilly.servlet.multipart;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Enumeration;
import java.util.Locale;
import java.util.Vector;
import javax.servlet.ServletInputStream;
import javax.servlet.http.HttpServletRequest;
/**
* A utility class to handle <code>multipart/form-data</code> requests,
* the kind of requests that support file uploads. This class uses a
* "pull" model where the reading of incoming files and parameters is
* controlled by the client code, which allows incoming files to be stored
* into any <code>OutputStream</code>. If you wish to use an API which
* resembles <code>HttpServletRequest</code>, use the "push" model
* <code>MultipartRequest</code> instead. It's an easy-to-use wrapper
* around this class.
*
* This class can receive arbitrarily large files (up to an artificial limit
* you can set), and fairly efficiently too.
* It cannot handle nested data (multipart content within multipart content).
* It can now with the latest release handle internationalized content
* (such as non Latin-1 filenames).
*
* It also optionally includes enhanced buffering and Content-Length
* limitation. Buffering is only required if your servlet container is
* poorly implemented (many are, including Tomcat 3.2),
* but it is generally recommended because it will make a slow servlet
* container a lot faster, and will only make a fast servlet container a
* little slower. Content-Length limiting is usually only required if you find
* that your servlet is hanging trying to read the input stream from the POST,
* and it is similarly recommended because it only has a minimal impact on
* performance.
*
* See the included upload.war for an example of how to use this class.
*
* The full file upload specification is contained in experimental RFC 1867,
* available at
* http://www.ietf.org/rfc/rfc1867.txt.
*
* @see com.oreilly.servlet.MultipartRequest
*
* @author Jason Hunter
* @author Geoff Soutter
* @version 1.13, 2004/09/01, added workaround if content-length is -1
* @version 1.12, 2004/05/17, added trim on disposition
* @version 1.11, 2002/11/01, added constructor that takes an encoding, to
* make sure chars are always read correctly
* @version 1.10, 2002/11/01, added support for a preamble before the first
* boundary marker
* @version 1.9, 2002/11/01, added support to parse odd Opera Content-Type
* @version 1.8, 2002/11/01, added support for lynx with unquoted param vals
* @version 1.7, 2002/04/30, fixed bug if a line was '\n' alone
* @version 1.6, 2002/04/30, added better internationalization support, thanks
* to Changshin Lee
* @version 1.5, 2002/04/30, added Opera header fix, thanks to Nic Ferrier
* @version 1.4, 2001/03/23, added IE5 bug workaround supporting \n as line
* ending, thanks to Michael Alyn Miller
* @version 1.3, 2001/01/22, added support for boundaries surrounded by quotes
* and content-disposition after content-type,
* thanks to Scott Stark
* @version 1.2, 2001/01/22, getFilePath() support thanks to Stefan Eissing
* @version 1.1, 2000/10/29, integrating old WebSphere fix
* @version 1.0, 2000/10/27, initial revision
*/
public class MultipartParser {

  /** default encoding */
  private static final String DEFAULT_ENCODING = "ISO-8859-1";

  /** input stream to read parts from */
  private final ServletInputStream in;

  /** MIME boundary that delimits parts, including its leading "--" */
  private final String boundary;

  /** reference to the last file part we returned */
  private FilePart lastFilePart;

  /** buffer for readLine method */
  private final byte[] buf = new byte[8 * 1024];

  /** preferred encoding */
  private String encoding = DEFAULT_ENCODING;

  /**
   * Creates a <code>MultipartParser</code> from the specified request,
   * which limits the upload size to the specified length, buffers for
   * performance and prevents attempts to read past the amount specified
   * by the Content-Length.
   *
   * @param req the servlet request.
   * @param maxSize the maximum size of the POST content.
   * @exception IOException if the request cannot be parsed as
   *            multipart/form-data.
   */
  public MultipartParser(HttpServletRequest req,
                         int maxSize) throws IOException {
    this(req, maxSize, true, true);
  }

  /**
   * Creates a <code>MultipartParser</code> from the specified request,
   * which limits the upload size to the specified length, and optionally
   * buffers for performance and prevents attempts to read past the amount
   * specified by the Content-Length.
   *
   * @param req the servlet request.
   * @param maxSize the maximum size of the POST content.
   * @param buffer whether to do internal buffering or let the server buffer,
   *               useful for servers that don't buffer
   * @param limitLength boolean flag to indicate if we need to filter
   *                    the request's input stream to prevent trying to
   *                    read past the end of the stream.
   * @exception IOException if the request cannot be parsed as
   *            multipart/form-data.
   */
  public MultipartParser(HttpServletRequest req, int maxSize, boolean buffer,
                         boolean limitLength) throws IOException {
    this(req, maxSize, buffer, limitLength, null);
  }

  /**
   * Creates a <code>MultipartParser</code> from the specified request,
   * which limits the upload size to the specified length, and optionally
   * buffers for performance and prevents attempts to read past the amount
   * specified by the Content-Length, and with a specified encoding.
   * <p>
   * The constructor consumes the stream up to and including the first
   * boundary line, so the parser is positioned at the headers of the
   * first part when it returns.
   *
   * @param req the servlet request.
   * @param maxSize the maximum size of the POST content.
   * @param buffer whether to do internal buffering or let the server buffer,
   *               useful for servers that don't buffer
   * @param limitLength boolean flag to indicate if we need to filter
   *                    the request's input stream to prevent trying to
   *                    read past the end of the stream.
   * @param encoding the encoding to use for parsing, default is ISO-8859-1;
   *                 <code>null</code> keeps the default.
   * @exception IOException if the content type is not multipart/form-data,
   *            no boundary is specified, or the stream ends before the
   *            first boundary line. An <code>ExceededSizeException</code>
   *            is thrown when the declared content length exceeds
   *            <code>maxSize</code>.
   */
  public MultipartParser(HttpServletRequest req, int maxSize, boolean buffer,
                         boolean limitLength, String encoding)
      throws IOException {
    // First make sure we know the encoding to handle chars correctly.
    // Thanks to Andreas Granzer for pointing out the need to have this
    // in the constructor.
    if (encoding != null) {
      setEncoding(encoding);
    }

    // Check the content type to make sure it's "multipart/form-data"
    // Access header two ways to work around WebSphere oddities
    String type = null;
    String type1 = req.getHeader("Content-Type");
    String type2 = req.getContentType();
    if (type1 == null && type2 != null) {
      // If one value is null, choose the other value
      type = type2;
    }
    else if (type2 == null && type1 != null) {
      type = type1;
    }
    else if (type1 != null && type2 != null) {
      // If neither value is null, choose the longer value
      type = (type1.length() > type2.length() ? type1 : type2);
    }

    // Compare without relying on the default locale's case rules
    if (type == null || !startsWithIgnoreCase(type, "multipart/form-data")) {
      throw new IOException("Posted content type isn't multipart/form-data");
    }

    // Check the content length to prevent denial of service attacks
    int length = req.getContentLength();
    if (length > maxSize) {
      throw new ExceededSizeException("Posted content length of " + length + " exceeds limit of " + maxSize);
    }

    // Get the boundary string; it's included in the content type.
    // Should look something like "------------------------12012133613061"
    String boundary = extractBoundary(type);
    if (boundary == null) {
      throw new IOException("Separation boundary was not specified");
    }

    // If required, wrap the real input stream with classes that
    // "enhance" its behaviour for performance and stability
    ServletInputStream stream = req.getInputStream();
    if (buffer) {
      stream = new BufferedServletInputStream(stream);
    }
    // A non-positive length (e.g. -1 for "unknown") cannot be used as a limit
    if (limitLength && length > 0) {
      stream = new LimitedServletInputStream(stream, length);
    }

    // Save our values for later
    this.in = stream;
    this.boundary = boundary;

    // Read until we hit the boundary
    // Some clients send a preamble (per RFC 2046), so ignore that
    // Thanks to Ben Johnson for pointing out the need for preamble support.
    while (true) {
      String line = readLine();
      if (line == null) {
        throw new IOException("Corrupt form data: premature ending");
      }
      if (line.startsWith(boundary)) {
        break;  // success
      }
    }
  }

  /**
   * Sets the encoding used to parse from here onward.  The default is
   * ISO-8859-1.  Encodings are actually best passed into the constructor,
   * so even the initial line reads are correct.
   *
   * @param encoding The encoding to use for parsing
   */
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }

  /**
   * Read the next part arriving in the stream. Will be either a
   * <code>FilePart</code> or a <code>ParamPart</code>, or <code>null</code>
   * to indicate there are no more parts to read. The order of arrival
   * corresponds to the order of the form elements in the submitted form.
   *
   * @return either a <code>FilePart</code>, a <code>ParamPart</code> or
   *         <code>null</code> if there are no more parts to read.
   * @exception IOException if an input or output exception has occurred.
   *
   * @see FilePart
   * @see ParamPart
   */
  public Part readNextPart() throws IOException {
    // Make sure the last file was entirely read from the input
    if (lastFilePart != null) {
      lastFilePart.getInputStream().close();
      lastFilePart = null;
    }

    // Read the headers; they look like this (not all may be present):
    // Content-Disposition: form-data; name="field1"; filename="file1.txt"
    // Content-Type: type/subtype
    // Content-Transfer-Encoding: binary
    Vector headers = new Vector();

    String line = readLine();
    if (line == null) {
      // No parts left, we're done
      return null;
    }
    else if (line.length() == 0) {
      // IE4 on Mac sends an empty line at the end; treat that as the end.
      // Thanks to Daniel Lemire and Henri Tourigny for this fix.
      return null;
    }

    // Read the following header lines until we hit an empty line.
    // A line starting with whitespace is considered a continuation;
    // that requires a little special logic.  Thanks to Nic Ferrier for
    // identifying a good fix.
    while (line != null && line.length() > 0) {
      String nextLine = readLine();
      while (nextLine != null
             && (nextLine.startsWith(" ") || nextLine.startsWith("\t"))) {
        line = line + nextLine;
        nextLine = readLine();
      }
      // Add the (possibly unfolded) line to the header list
      headers.addElement(line);
      line = nextLine;
    }

    // If we got a null above, the stream ended inside the headers
    if (line == null) {
      return null;
    }

    String name = null;
    String filename = null;
    String origname = null;
    String contentType = "text/plain";  // rfc1867 says this is the default

    Enumeration enu = headers.elements();
    while (enu.hasMoreElements()) {
      String headerline = (String) enu.nextElement();
      if (startsWithIgnoreCase(headerline, "content-disposition:")) {
        // Parse the content-disposition line
        String[] dispInfo = extractDispositionInfo(headerline);
        // dispInfo[0] is the disposition itself; not currently used
        name = dispInfo[1];
        filename = dispInfo[2];
        origname = dispInfo[3];
      }
      else if (startsWithIgnoreCase(headerline, "content-type:")) {
        // Get the content type, or null if none specified
        String type = extractContentType(headerline);
        if (type != null) {
          contentType = type;
        }
      }
    }

    // Now, finally, we read the content (end after reading the boundary)
    if (filename == null) {
      // This is a parameter
      // The encoding is needed to help parse the value
      return new ParamPart(name, in, boundary, encoding);
    }

    // This is a file
    if (filename.equals("")) {
      filename = null; // empty filename, probably an "empty" file param
    }
    lastFilePart = new FilePart(name, in, boundary,
                                contentType, filename, origname);
    return lastFilePart;
  }

  /**
   * Extracts and returns the boundary token from a Content-Type value.
   * Surrounding quotes are stripped, and for an unquoted boundary any
   * further parameters following a ';' are ignored.
   *
   * @param line the Content-Type value.
   * @return the boundary token prefixed with "--", or <code>null</code> if
   *         no usable boundary is present (missing, empty, or with an
   *         unterminated quote).
   */
  private String extractBoundary(String line) {
    // Use lastIndexOf() because IE 4.01 on Win98 has been known to send the
    // "boundary=" string multiple times.  Thanks to David Wall for this fix.
    int index = line.lastIndexOf("boundary=");
    if (index == -1) {
      return null;
    }
    String boundary = line.substring(index + 9);  // 9 for "boundary="

    if (boundary.startsWith("\"")) {
      // The boundary is enclosed in quotes, strip them.  A boundary cannot
      // itself contain a quote, so the first closing quote ends it.
      int close = boundary.indexOf('"', 1);
      if (close == -1) {
        return null;
      }
      boundary = boundary.substring(1, close);
    }
    else {
      // Unquoted: stop at the next parameter, if any
      int semi = boundary.indexOf(';');
      if (semi != -1) {
        boundary = boundary.substring(0, semi);
      }
      boundary = boundary.trim();
    }

    if (boundary.length() == 0) {
      return null;
    }

    // The real boundary is always preceded by an extra "--"
    return "--" + boundary;
  }

  /**
   * Extracts and returns disposition info from a line, as a
   * <code>String</code> array with elements: disposition, name, filename,
   * original filename.
   * <p>
   * Parameters are scanned in order after the first ';'. Values may be
   * quoted or (as sent by browsers like lynx) unquoted. The filename
   * element has any leading path removed, the original filename element
   * keeps the value exactly as sent. Both are <code>null</code> when no
   * filename parameter is present.
   *
   * @param line the complete Content-Disposition header line.
   * @return String[] of elements: disposition, name, filename, original
   *         filename.
   * @exception IOException if the line is malformatted.
   */
  private String[] extractDispositionInfo(String line) throws IOException {
    final String headerName = "content-disposition:";

    // Get the content disposition, should be "form-data".  It sits between
    // the header name and the first semicolon.
    int end = line.indexOf(';');
    if (!startsWithIgnoreCase(line, headerName) || end == -1) {
      throw new IOException("Content disposition corrupt: " + line);
    }
    String disposition = line.substring(headerName.length(), end)
                             .trim().toLowerCase(Locale.ENGLISH);
    if (!disposition.equals("form-data")) {
      throw new IOException("Invalid content disposition: " + disposition);
    }

    // Walk the "key=value" parameters.  Working on the original line (not
    // a lowercased copy) keeps every index valid for the values we cut out.
    String name = null;
    String origname = null;
    int len = line.length();
    int pos = end + 1;
    while (pos < len) {
      char c = line.charAt(pos);
      if (c == ';' || c == ' ' || c == '\t') {
        pos++;  // separator between parameters
        continue;
      }

      int eq = line.indexOf('=', pos);
      int semi = line.indexOf(';', pos);
      if (eq == -1 || (semi != -1 && semi < eq)) {
        // An attribute without a value; nothing to extract from it
        pos = (semi == -1) ? len : semi + 1;
        continue;
      }

      String key = line.substring(pos, eq).trim();
      int valueStart = eq + 1;
      int closeQuote = -1;
      if (valueStart < len && line.charAt(valueStart) == '"') {
        closeQuote = line.indexOf('"', valueStart + 1);
      }

      String value;
      if (closeQuote != -1) {
        // Quoted value: everything between the quotes, ';' included
        value = line.substring(valueStart + 1, closeQuote);
        int next = line.indexOf(';', closeQuote + 1);
        pos = (next == -1) ? len : next + 1;
      }
      else {
        // Some browsers like lynx don't surround with ""
        // Thanks to Deon van der Merwe for noticing
        int next = line.indexOf(';', valueStart);
        int valueEnd = (next == -1) ? len : next;
        value = line.substring(valueStart, valueEnd);
        pos = valueEnd + 1;
      }

      // Only the first occurrence of each parameter is used
      if (name == null && key.equalsIgnoreCase("name")) {
        name = value;
      }
      else if (origname == null && key.equalsIgnoreCase("filename")) {
        origname = value;
      }
    }

    if (name == null) {
      throw new IOException("Content disposition corrupt: " + line);
    }

    // The filename may contain a full path.  Cut to just the filename.
    String filename = origname;
    if (filename != null) {
      int slash =
        Math.max(filename.lastIndexOf('/'), filename.lastIndexOf('\\'));
      if (slash > -1) {
        filename = filename.substring(slash + 1);  // past last slash
      }
    }

    // Return a String array: disposition, name, filename, original filename
    // empty filename denotes no file posted!
    String[] retval = new String[4];
    retval[0] = disposition;
    retval[1] = name;
    retval[2] = filename;
    retval[3] = origname;
    return retval;
  }

  /**
   * Extracts and returns the content type from a Content-Type header line,
   * or null if the header carries no value.
   *
   * @param line a header line starting with "content-type:" (any case).
   * @return the lowercased content type without parameters, or null if the
   *         header value is empty.
   * @exception IOException if the line is malformatted.
   */
  private static String extractContentType(String line) throws IOException {
    // Get the content type, if any
    // Note that Opera at least puts extra info after the type, so handle
    // that.  For example:  Content-Type: text/plain; name="foo"
    // Thanks to Leon Poyyayil for noticing this.
    int end = line.indexOf(';');
    if (end == -1) {
      end = line.length();
    }

    String type = line.substring("content-type:".length(), end)
                      .trim().toLowerCase(Locale.ENGLISH);
    return (type.length() == 0) ? null : type;
  }

  /**
   * Tests whether a string starts with a prefix, ignoring case and
   * independent of the default locale.
   *
   * @param s the string to test.
   * @param prefix the prefix to look for.
   * @return true if <code>s</code> starts with <code>prefix</code>.
   */
  private static boolean startsWithIgnoreCase(String s, String prefix) {
    return s.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  /**
   * Read the next line of input.  The raw bytes of the whole line are
   * collected before decoding, so a line longer than the internal buffer
   * is decoded in one piece.
   *
   * @return a String containing the next line of input from the stream,
   *         without its trailing line terminator, or null to indicate the
   *         end of the stream.
   * @exception IOException if an input or output exception has occurred.
   */
  private String readLine() throws IOException {
    ByteArrayOutputStream lineBytes = new ByteArrayOutputStream(128);
    int result;

    do {
      result = in.readLine(buf, 0, buf.length);
      if (result > 0) {
        lineBytes.write(buf, 0, result);
      }
      // Keep reading only if the buffer was filled without reaching the
      // end of the line; a full buffer ending in '\n' is a complete line.
    } while (result == buf.length && buf[result - 1] != '\n');

    if (lineBytes.size() == 0) {
      return null;  // nothing read, must be at the end of stream
    }

    String line = lineBytes.toString(encoding);

    // Cut off the trailing \n or \r\n
    // It should always be \r\n but IE5 sometimes does just \n
    // Thanks to Luke Blaikie for helping make this work with \n
    int len = line.length();
    if (len >= 2 && line.charAt(len - 2) == '\r'
                 && line.charAt(len - 1) == '\n') {
      return line.substring(0, len - 2);  // cut \r\n
    }
    if (len >= 1 && line.charAt(len - 1) == '\n') {
      return line.substring(0, len - 1);  // cut \n
    }
    return line;
  }
}