org.simplejavamail.jakarta.mail.internet.HeaderTokenizer Maven / Gradle / Ivy
Show all versions of outlook-message-parser Show documentation
/*
* Copyright (c) 1997, 2020 Oracle and/or its affiliates. All rights reserved.
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License v. 2.0, which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* This Source Code may also be made available under the following Secondary
* Licenses when the conditions for such availability set forth in the
* Eclipse Public License v. 2.0 are satisfied: GNU General Public License,
* version 2 with the GNU Classpath Exception, which is available at
* https://www.gnu.org/software/classpath/license.html.
*
* SPDX-License-Identifier: EPL-2.0 OR GPL-2.0 WITH Classpath-exception-2.0
*/
package org.simplejavamail.jakarta.mail.internet;
import java.util.*;
/**
* This class tokenizes RFC822 and MIME headers into the basic
* symbols specified by RFC822 and MIME.
*
* This class handles folded headers (ie headers with embedded
* CRLF SPACE sequences). The folds are removed in the returned
* tokens.
*
* @author John Mani
* @author Bill Shannon
*/
public class HeaderTokenizer {
/**
* The Token class represents tokens returned by the
* HeaderTokenizer.
*/
public static class Token {
private int type;
private String value;
/**
* Token type indicating an ATOM.
*/
public static final int ATOM = -1;
/**
* Token type indicating a quoted string. The value
* field contains the string without the quotes.
*/
public static final int QUOTEDSTRING = -2;
/**
* Token type indicating a comment. The value field
* contains the comment string without the comment
* start and end symbols.
*/
public static final int COMMENT = -3;
/**
* Token type indicating end of input.
*/
public static final int EOF = -4;
/**
* Constructor.
* @param type Token type
* @param value Token value
*/
public Token(int type, String value) {
this.type = type;
this.value = value;
}
/**
* Return the type of the token. If the token represents a
* delimiter or a control character, the type is that character
* itself, converted to an integer. Otherwise, it's value is
* one of the following:
*
* ATOM
A sequence of ASCII characters
* delimited by either SPACE, CTL, "(", <"> or the
* specified SPECIALS
* QUOTEDSTRING
A sequence of ASCII characters
* within quotes
* COMMENT
A sequence of ASCII characters
* within "(" and ")".
* EOF
End of header
*
*
* @return the token type
*/
public int getType() {
return type;
}
/**
* Returns the value of the token just read. When the current
* token is a quoted string, this field contains the body of the
* string, without the quotes. When the current token is a comment,
* this field contains the body of the comment.
*
* @return token value
*/
public String getValue() {
return value;
}
}
private String string; // the string to be tokenized
private boolean skipComments; // should comments be skipped ?
private String delimiters; // delimiter string
private int currentPos; // current parse position
private int maxPos; // string length
private int nextPos; // track start of next Token for next()
private int peekPos; // track start of next Token for peek()
/**
* RFC822 specials
*/
public final static String RFC822 = "()<>@,;:\\\"\t .[]";
/**
* MIME specials
*/
public final static String MIME = "()<>@,;:\\\"\t []/?=";
// The EOF Token
private final static Token EOFToken = new Token(Token.EOF, null);
/**
* Constructor that takes a rfc822 style header.
*
* @param header The rfc822 header to be tokenized
* @param delimiters Set of delimiter characters
* to be used to delimit ATOMS. These
* are usually RFC822
or
* MIME
* @param skipComments If true, comments are skipped and
* not returned as tokens
*/
public HeaderTokenizer(String header, String delimiters,
boolean skipComments) {
string = (header == null) ? "" : header; // paranoia ?!
this.skipComments = skipComments;
this.delimiters = delimiters;
currentPos = nextPos = peekPos = 0;
maxPos = string.length();
}
/**
* Constructor. Comments are ignored and not returned as tokens
*
* @param header The header that is tokenized
* @param delimiters The delimiters to be used
*/
public HeaderTokenizer(String header, String delimiters) {
this(header, delimiters, true);
}
/**
* Constructor. The RFC822 defined delimiters - RFC822 - are
* used to delimit ATOMS. Also comments are skipped and not
* returned as tokens
*
* @param header the header string
*/
public HeaderTokenizer(String header) {
this(header, RFC822);
}
/**
* Parses the next token from this String.
*
* Clients sit in a loop calling next() to parse successive
* tokens until an EOF Token is returned.
*
* @return the next Token
* @exception ParseException if the parse fails
*/
public Token next() throws ParseException {
return next('\0', false);
}
/**
* Parses the next token from this String.
* If endOfAtom is not NUL, the token extends until the
* endOfAtom character is seen, or to the end of the header.
* This method is useful when parsing headers that don't
* obey the MIME specification, e.g., by failing to quote
* parameter values that contain spaces.
*
* @param endOfAtom if not NUL, character marking end of token
* @return the next Token
* @exception ParseException if the parse fails
* @since JavaMail 1.5
*/
public Token next(char endOfAtom) throws ParseException {
return next(endOfAtom, false);
}
/**
* Parses the next token from this String.
* endOfAtom is handled as above. If keepEscapes is true,
* any backslash escapes are preserved in the returned string.
* This method is useful when parsing headers that don't
* obey the MIME specification, e.g., by failing to escape
* backslashes in the filename parameter.
*
* @param endOfAtom if not NUL, character marking end of token
* @param keepEscapes keep all backslashes in returned string?
* @return the next Token
* @exception ParseException if the parse fails
* @since JavaMail 1.5
*/
public Token next(char endOfAtom, boolean keepEscapes)
throws ParseException {
Token tk;
currentPos = nextPos; // setup currentPos
tk = getNext(endOfAtom, keepEscapes);
nextPos = peekPos = currentPos; // update currentPos and peekPos
return tk;
}
/**
* Peek at the next token, without actually removing the token
* from the parse stream. Invoking this method multiple times
* will return successive tokens, until next()
is
* called.
*
* @return the next Token
* @exception ParseException if the parse fails
*/
public Token peek() throws ParseException {
Token tk;
currentPos = peekPos; // setup currentPos
tk = getNext('\0', false);
peekPos = currentPos; // update peekPos
return tk;
}
/**
* Return the rest of the Header.
*
* @return String rest of header. null is returned if we are
* already at end of header
*/
public String getRemainder() {
if (nextPos >= string.length())
return null;
return string.substring(nextPos);
}
/*
* Return the next token starting from 'currentPos'. After the
* parse, 'currentPos' is updated to point to the start of the
* next token.
*/
private Token getNext(char endOfAtom, boolean keepEscapes)
throws ParseException {
// If we're already at end of string, return EOF
if (currentPos >= maxPos)
return EOFToken;
// Skip white-space, position currentPos beyond the space
if (skipWhiteSpace() == Token.EOF)
return EOFToken;
char c;
int start;
boolean filter = false;
c = string.charAt(currentPos);
// Check or Skip comments and position currentPos
// beyond the comment
while (c == '(') {
// Parsing comment ..
int nesting;
for (start = ++currentPos, nesting = 1;
nesting > 0 && currentPos < maxPos;
currentPos++) {
c = string.charAt(currentPos);
if (c == '\\') { // Escape sequence
currentPos++; // skip the escaped character
filter = true;
} else if (c == '\r')
filter = true;
else if (c == '(')
nesting++;
else if (c == ')')
nesting--;
}
if (nesting != 0)
throw new ParseException("Unbalanced comments");
if (!skipComments) {
// Return the comment, if we are asked to.
// Note that the comment start & end markers are ignored.
String s;
if (filter) // need to go thru the token again.
s = filterToken(string, start, currentPos-1, keepEscapes);
else
s = string.substring(start,currentPos-1);
return new Token(Token.COMMENT, s);
}
// Skip any whitespace after the comment.
if (skipWhiteSpace() == Token.EOF)
return EOFToken;
c = string.charAt(currentPos);
}
// Check for quoted-string and position currentPos
// beyond the terminating quote
if (c == '"') {
currentPos++; // skip initial quote
return collectString('"', keepEscapes);
}
// Check for SPECIAL or CTL
if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
if (endOfAtom > 0 && c != endOfAtom) {
// not expecting a special character here,
// pretend it's a quoted string
return collectString(endOfAtom, keepEscapes);
}
currentPos++; // re-position currentPos
char ch[] = new char[1];
ch[0] = c;
return new Token((int)c, new String(ch));
}
// Check for ATOM
for (start = currentPos; currentPos < maxPos; currentPos++) {
c = string.charAt(currentPos);
// ATOM is delimited by either SPACE, CTL, "(", <">
// or the specified SPECIALS
if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
c == '"' || delimiters.indexOf(c) >= 0) {
if (endOfAtom > 0 && c != endOfAtom) {
// not the expected atom after all;
// back up and pretend it's a quoted string
currentPos = start;
return collectString(endOfAtom, keepEscapes);
}
break;
}
}
return new Token(Token.ATOM, string.substring(start, currentPos));
}
private Token collectString(char eos, boolean keepEscapes)
throws ParseException {
int start;
boolean filter = false;
for (start = currentPos; currentPos < maxPos; currentPos++) {
char c = string.charAt(currentPos);
if (c == '\\') { // Escape sequence
currentPos++;
filter = true;
} else if (c == '\r')
filter = true;
else if (c == eos) {
currentPos++;
String s;
if (filter)
s = filterToken(string, start, currentPos-1, keepEscapes);
else
s = string.substring(start, currentPos-1);
if (c != '"') { // not a real quoted string
s = trimWhiteSpace(s);
currentPos--; // back up before the eos char
}
return new Token(Token.QUOTEDSTRING, s);
}
}
// ran off the end of the string
// if we're looking for a matching quote, that's an error
if (eos == '"')
throw new ParseException("Unbalanced quoted string");
// otherwise, just return whatever's left
String s;
if (filter)
s = filterToken(string, start, currentPos, keepEscapes);
else
s = string.substring(start, currentPos);
s = trimWhiteSpace(s);
return new Token(Token.QUOTEDSTRING, s);
}
// Skip SPACE, HT, CR and NL
private int skipWhiteSpace() {
char c;
for (; currentPos < maxPos; currentPos++)
if (((c = string.charAt(currentPos)) != ' ') &&
(c != '\t') && (c != '\r') && (c != '\n'))
return currentPos;
return Token.EOF;
}
// Trim SPACE, HT, CR and NL from end of string
private static String trimWhiteSpace(String s) {
char c;
int i;
for (i = s.length() - 1; i >= 0; i--) {
if (((c = s.charAt(i)) != ' ') &&
(c != '\t') && (c != '\r') && (c != '\n'))
break;
}
if (i <= 0)
return "";
else
return s.substring(0, i + 1);
}
/* Process escape sequences and embedded LWSPs from a comment or
* quoted string.
*/
private static String filterToken(String s, int start, int end,
boolean keepEscapes) {
StringBuilder sb = new StringBuilder();
char c;
boolean gotEscape = false;
boolean gotCR = false;
for (int i = start; i < end; i++) {
c = s.charAt(i);
if (c == '\n' && gotCR) {
// This LF is part of an unescaped
// CRLF sequence (i.e, LWSP). Skip it.
gotCR = false;
continue;
}
gotCR = false;
if (!gotEscape) {
// Previous character was NOT '\'
if (c == '\\') // skip this character
gotEscape = true;
else if (c == '\r') // skip this character
gotCR = true;
else // append this character
sb.append(c);
} else {
// Previous character was '\'. So no need to
// bother with any special processing, just
// append this character. If keepEscapes is
// set, keep the backslash. IE6 fails to escape
// backslashes in quoted strings in HTTP headers,
// e.g., in the filename parameter.
if (keepEscapes)
sb.append('\\');
sb.append(c);
gotEscape = false;
}
}
return sb.toString();
}
}