All Downloads are FREE. Search and download functionalities are using the official Maven repository.

javax.mail.internet.MimeUtility Maven / Gradle / Ivy

There is a newer version: 1.6.2
Show newest version
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1997-2013 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * https://glassfish.dev.java.net/public/CDDL+GPL_1_1.html
 * or packager/legal/LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at packager/legal/LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */

package javax.mail.internet;

import javax.mail.MessagingException;
import javax.mail.EncodingAware;
import javax.activation.*;
import java.util.*;
import java.io.*;
import com.sun.mail.util.*;

/**
 * This is a utility class that provides various MIME related
 * functionality. 

 *
 * There is a set of methods to encode and decode MIME headers as
 * per RFC 2047.  Note that, in general, these methods are
 * not needed when using methods such as
 * setSubject and setRecipients; JavaMail
 * will automatically encode and decode data when using these "higher
 * level" methods.  The methods below are only needed when manipulating
 * raw MIME headers using setHeader and getHeader
 * methods.  A brief description on handling such headers is given below:

* * RFC 822 mail headers must contain only US-ASCII * characters. Headers that contain non US-ASCII characters must be * encoded so that they contain only US-ASCII characters. Basically, * this process involves using either BASE64 or QP to encode certain * characters. RFC 2047 describes this in detail.

* * In Java, Strings contain (16 bit) Unicode characters. ASCII is a * subset of Unicode (and occupies the range 0 - 127). A String * that contains only ASCII characters is already mail-safe. If the * String contains non US-ASCII characters, it must be encoded. An * additional complexity in this step is that since Unicode is not * yet a widely used charset, one might want to first charset-encode * the String into another charset and then do the transfer-encoding. *

* Note that to get the actual bytes of a mail-safe String (say, * for sending over SMTP), one must do *

 *
 *	byte[] bytes = string.getBytes("iso-8859-1");	
 *
 * 

 *
 * The setHeader and addHeader methods
 * on MimeMessage and MimeBodyPart assume that the given header values
 * are Unicode strings that contain only US-ASCII characters.  Hence
 * the callers of those methods must ensure that the values they pass
 * do not contain non US-ASCII characters.  The methods in this class
 * help do this.

* * The getHeader family of methods on MimeMessage and * MimeBodyPart return the raw header value. These might be encoded * as per RFC 2047, and if so, must be decoded into Unicode Strings. * The methods in this class help to do this.

* * Several System properties control strict conformance to the MIME * spec. Note that these are not session properties but must be set * globally as System properties.

* * The mail.mime.decodetext.strict property controls * decoding of MIME encoded words. The MIME spec requires that encoded * words start at the beginning of a whitespace separated word. Some * mailers incorrectly include encoded words in the middle of a word. * If the mail.mime.decodetext.strict System property is * set to "false", an attempt will be made to decode these * illegal encoded words. The default is true.

* * The mail.mime.encodeeol.strict property controls the * choice of Content-Transfer-Encoding for MIME parts that are not of * type "text". Often such parts will contain textual data for which * an encoding that allows normal end of line conventions is appropriate. * In rare cases, such a part will appear to contain entirely textual * data, but will require an encoding that preserves CR and LF characters * without change. If the mail.mime.encodeeol.strict * System property is set to "true", such an encoding will * be used when necessary. The default is false.

* * In addition, the mail.mime.charset System property can * be used to specify the default MIME charset to use for encoded words * and text parts that don't otherwise specify a charset. Normally, the * default MIME charset is derived from the default Java charset, as * specified in the file.encoding System property. Most * applications will have no need to explicitly set the default MIME * charset. In cases where the default MIME charset to be used for * mail messages is different than the charset used for files stored on * the system, this property should be set.

* * The current implementation also supports the following System property. *

* The mail.mime.ignoreunknownencoding property controls * whether unknown values in the Content-Transfer-Encoding * header, as passed to the decode method, cause an exception. * If set to "true", unknown values are ignored and 8bit * encoding is assumed. Otherwise, unknown values cause a MessagingException * to be thrown. * * @author John Mani * @author Bill Shannon */ public class MimeUtility { // This class cannot be instantiated private MimeUtility() { } public static final int ALL = -1; // cached map of whether a charset is compatible with ASCII // Map private static final Map nonAsciiCharsetMap = new HashMap(); private static final boolean decodeStrict = PropUtil.getBooleanSystemProperty("mail.mime.decodetext.strict", true); private static final boolean encodeEolStrict = PropUtil.getBooleanSystemProperty("mail.mime.encodeeol.strict", false); private static final boolean ignoreUnknownEncoding = PropUtil.getBooleanSystemProperty( "mail.mime.ignoreunknownencoding", false); /* * The following two properties allow disabling the fold() * and unfold() methods and reverting to the previous behavior. * They should never need to be changed and are here only because * of my paranoid concern with compatibility. */ private static final boolean foldEncodedWords = PropUtil.getBooleanSystemProperty("mail.mime.foldencodedwords", false); private static final boolean foldText = PropUtil.getBooleanSystemProperty("mail.mime.foldtext", true); /** * Get the Content-Transfer-Encoding that should be applied * to the input stream of this DataSource, to make it mail-safe.

* * The algorithm used here is:
*

    *
  • * If the DataSource implements {@link EncodingAware}, ask it * what encoding to use. If it returns non-null, return that value. *
  • * If the primary type of this datasource is "text" and if all * the bytes in its input stream are US-ASCII, then the encoding * is "7bit". If more than half of the bytes are non-US-ASCII, then * the encoding is "base64". If less than half of the bytes are * non-US-ASCII, then the encoding is "quoted-printable". *
  • * If the primary type of this datasource is not "text", then if * all the bytes of its input stream are US-ASCII, the encoding * is "7bit". If there is even one non-US-ASCII character, the * encoding is "base64". *
* * @param ds the DataSource * @return the encoding. This is either "7bit", * "quoted-printable" or "base64" */ public static String getEncoding(DataSource ds) { ContentType cType = null; InputStream is = null; String encoding = null; if (ds instanceof EncodingAware) { encoding = ((EncodingAware)ds).getEncoding(); if (encoding != null) return encoding; } try { cType = new ContentType(ds.getContentType()); is = ds.getInputStream(); boolean isText = cType.match("text/*"); // if not text, stop processing when we see non-ASCII int i = checkAscii(is, ALL, !isText); switch (i) { case ALL_ASCII: encoding = "7bit"; // all ASCII break; case MOSTLY_ASCII: if (isText && nonAsciiCharset(cType)) encoding = "base64"; // charset isn't compatible with ASCII else encoding = "quoted-printable"; // mostly ASCII break; default: encoding = "base64"; // mostly binary break; } } catch (Exception ex) { return "base64"; // what else ?! } finally { // Close the input stream try { if (is != null) is.close(); } catch (IOException ioex) { } } return encoding; } /** * Determine whether the charset in the Content-Type is compatible * with ASCII or not. A charset is compatible with ASCII if the * encoded byte stream representing the Unicode string "\r\n" is * the ASCII characters CR and LF. For example, the utf-16be * charset is not compatible with ASCII. * * For performance, we keep a static map that caches the results. */ private static boolean nonAsciiCharset(ContentType ct) { String charset = ct.getParameter("charset"); if (charset == null) return false; charset = charset.toLowerCase(Locale.ENGLISH); Boolean bool; synchronized (nonAsciiCharsetMap) { bool = (Boolean)nonAsciiCharsetMap.get(charset); } if (bool == null) { try { byte[] b = "\r\n".getBytes(charset); bool = Boolean.valueOf( b.length != 2 || b[0] != 015 || b[1] != 012); } catch (UnsupportedEncodingException uex) { bool = Boolean.FALSE; // a guess } catch (RuntimeException ex) { bool = Boolean.TRUE; // one of the weird ones? 
} synchronized (nonAsciiCharsetMap) { nonAsciiCharsetMap.put(charset, bool); } } return bool.booleanValue(); } /** * Same as getEncoding(DataSource) except that instead * of reading the data from an InputStream it uses the * writeTo method to examine the data. This is more * efficient in the common case of a DataHandler * created with an object and a MIME type (for example, a * "text/plain" String) because all the I/O is done in this * thread. In the case requiring an InputStream the * DataHandler uses a thread, a pair of pipe streams, * and the writeTo method to produce the data.

* * @since JavaMail 1.2 */ public static String getEncoding(DataHandler dh) { ContentType cType = null; String encoding = null; /* * Try to pick the most efficient means of determining the * encoding. If this DataHandler was created using a DataSource, * the getEncoding(DataSource) method is typically faster. If * the DataHandler was created with an object, this method is * much faster. To distinguish the two cases, we use a heuristic. * A DataHandler created with an object will always have a null name. * A DataHandler created with a DataSource will usually have a * non-null name. * * XXX - This is actually quite a disgusting hack, but it makes * a common case run over twice as fast. */ if (dh.getName() != null) return getEncoding(dh.getDataSource()); try { cType = new ContentType(dh.getContentType()); } catch (Exception ex) { return "base64"; // what else ?! } if (cType.match("text/*")) { // Check all of the available bytes AsciiOutputStream aos = new AsciiOutputStream(false, false); try { dh.writeTo(aos); } catch (IOException ex) { // ignore it, can't happen } switch (aos.getAscii()) { case ALL_ASCII: encoding = "7bit"; // all ascii break; case MOSTLY_ASCII: encoding = "quoted-printable"; // mostly ascii break; default: encoding = "base64"; // mostly binary break; } } else { // not "text" // Check all of available bytes, break out if we find // at least one non-US-ASCII character AsciiOutputStream aos = new AsciiOutputStream(true, encodeEolStrict); try { dh.writeTo(aos); } catch (IOException ex) { } // ignore it if (aos.getAscii() == ALL_ASCII) // all ascii encoding = "7bit"; else // found atleast one non-ascii character, use b64 encoding = "base64"; } return encoding; } /** * Decode the given input stream. The Input stream returned is * the decoded input stream. All the encodings defined in RFC 2045 * are supported here. They include "base64", "quoted-printable", * "7bit", "8bit", and "binary". In addition, "uuencode" is also * supported.

* * In the current implementation, if the * mail.mime.ignoreunknownencoding system property is set to * "true", unknown encoding values are ignored and the * original InputStream is returned. * * @param is input stream * @param encoding the encoding of the stream. * @return decoded input stream. * @exception MessagingException if the encoding is unknown */ public static InputStream decode(InputStream is, String encoding) throws MessagingException { if (encoding.equalsIgnoreCase("base64")) return new BASE64DecoderStream(is); else if (encoding.equalsIgnoreCase("quoted-printable")) return new QPDecoderStream(is); else if (encoding.equalsIgnoreCase("uuencode") || encoding.equalsIgnoreCase("x-uuencode") || encoding.equalsIgnoreCase("x-uue")) return new UUDecoderStream(is); else if (encoding.equalsIgnoreCase("binary") || encoding.equalsIgnoreCase("7bit") || encoding.equalsIgnoreCase("8bit")) return is; else { if (!ignoreUnknownEncoding) throw new MessagingException("Unknown encoding: " + encoding); return is; } } /** * Wrap an encoder around the given output stream. * All the encodings defined in RFC 2045 are supported here. * They include "base64", "quoted-printable", "7bit", "8bit" and * "binary". In addition, "uuencode" is also supported. * * @param os output stream * @param encoding the encoding of the stream. * @return output stream that applies the * specified encoding. 
* @exception MessagingException    if the encoding is unknown
     */
    public static OutputStream encode(OutputStream os, String encoding)
            throws MessagingException {
        // No encoding requested: hand the stream back untouched.
        if (encoding == null)
            return os;
        if (encoding.equalsIgnoreCase("base64"))
            return new BASE64EncoderStream(os);
        if (encoding.equalsIgnoreCase("quoted-printable"))
            return new QPEncoderStream(os);
        if (encoding.equalsIgnoreCase("uuencode") ||
                encoding.equalsIgnoreCase("x-uuencode") ||
                encoding.equalsIgnoreCase("x-uue"))
            return new UUEncoderStream(os);

        boolean identity = encoding.equalsIgnoreCase("binary") ||
                encoding.equalsIgnoreCase("7bit") ||
                encoding.equalsIgnoreCase("8bit");
        if (!identity)
            throw new MessagingException("Unknown encoding: " +encoding);
        return os;
    }

    /**
     * Wrap an encoder around the given output stream.
     * All the encodings defined in RFC 2045 are supported here.
     * They include "base64", "quoted-printable", "7bit", "8bit" and
     * "binary".  In addition, "uuencode" is also supported.
     * The filename parameter is used with the "uuencode"
     * encoding and is included in the encoded output.
     *
     * @param   os          output stream
     * @param   encoding    the encoding of the stream.
     * @param   filename    name for the file being encoded (only used
     *                      with uuencode)
     * @return              output stream that applies the
     *                      specified encoding.
* @exception MessagingException    if the encoding is unknown
     * @since       JavaMail 1.2
     */
    public static OutputStream encode(OutputStream os, String encoding,
            String filename) throws MessagingException {
        // No encoding requested: hand the stream back untouched.
        if (encoding == null)
            return os;
        if (encoding.equalsIgnoreCase("base64"))
            return new BASE64EncoderStream(os);
        if (encoding.equalsIgnoreCase("quoted-printable"))
            return new QPEncoderStream(os);
        if (encoding.equalsIgnoreCase("uuencode") ||
                encoding.equalsIgnoreCase("x-uuencode") ||
                encoding.equalsIgnoreCase("x-uue"))
            return new UUEncoderStream(os, filename);  // only user of filename

        boolean identity = encoding.equalsIgnoreCase("binary") ||
                encoding.equalsIgnoreCase("7bit") ||
                encoding.equalsIgnoreCase("8bit");
        if (!identity)
            throw new MessagingException("Unknown encoding: " +encoding);
        return os;
    }

    /**
     * Encode a RFC 822 "text" token into mail-safe form as per
     * RFC 2047.

* * The given Unicode string is examined for non US-ASCII * characters. If the string contains only US-ASCII characters, * it is returned as-is. If the string contains non US-ASCII * characters, it is first character-encoded using the platform's * default charset, then transfer-encoded using either the B or * Q encoding. The resulting bytes are then returned as a Unicode * string containing only ASCII characters.

* * Note that this method should be used to encode only * "unstructured" RFC 822 headers.

* * Example of usage: *

     *
     *  MimePart part = ...
     *  String rawvalue = "FooBar Mailer, Japanese version 1.1"
     *  try {
     *    // If we know for sure that rawvalue contains only US-ASCII 
     *    // characters, we can skip the encoding part
     *    part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
     *  } catch (UnsupportedEncodingException e) {
     *    // encoding failure
     *  } catch (MessagingException me) {
     *   // setHeader() failure
     *  }
     *
     * 

*
     * @param   text    Unicode string
     * @return  Unicode string containing only US-ASCII characters
     * @exception UnsupportedEncodingException if the encoding fails
     */
    public static String encodeText(String text)
            throws UnsupportedEncodingException {
        // Default charset, automatically chosen transfer encoding,
        // and the "text" (not "word") flavor of the Q encoding.
        return encodeWord(text, null, null, false);
    }

    /**
     * Encode a RFC 822 "text" token into mail-safe form as per
     * RFC 2047.

* * The given Unicode string is examined for non US-ASCII * characters. If the string contains only US-ASCII characters, * it is returned as-is. If the string contains non US-ASCII * characters, it is first character-encoded using the specified * charset, then transfer-encoded using either the B or Q encoding. * The resulting bytes are then returned as a Unicode string * containing only ASCII characters.

* * Note that this method should be used to encode only * "unstructured" RFC 822 headers. * * @param text the header value * @param charset the charset. If this parameter is null, the * platform's default chatset is used. * @param encoding the encoding to be used. Currently supported * values are "B" and "Q". If this parameter is null, then * the "Q" encoding is used if most of characters to be * encoded are in the ASCII charset, otherwise "B" encoding * is used. * @return Unicode string containing only US-ASCII characters */ public static String encodeText(String text, String charset, String encoding) throws UnsupportedEncodingException { return encodeWord(text, charset, encoding, false); } /** * Decode "unstructured" headers, that is, headers that are defined * as '*text' as per RFC 822.

     *
     * The string is decoded using the algorithm specified in
     * RFC 2047, Section 6.1.  If the charset-conversion fails
     * for any sequence, an UnsupportedEncodingException is thrown.
     * If the String is not an RFC 2047 style encoded header, it is
     * returned as-is.

* * Example of usage: *

     *
     *  MimePart part = ...
     *  String rawvalue = null;
     *  String  value = null;
     *  try {
     *    if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
     *      value = MimeUtility.decodeText(rawvalue);
     *  } catch (UnsupportedEncodingException e) {
     *      // Don't care
     *      value = rawvalue;
     *  } catch (MessagingException me) { }
     *
     *  return value;
     *
     * 

* * @param etext the possibly encoded value * @exception UnsupportedEncodingException if the charset * conversion failed. */ public static String decodeText(String etext) throws UnsupportedEncodingException { /* * We look for sequences separated by "linear-white-space". * (as per RFC 2047, Section 6.1) * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL. */ String lwsp = " \t\n\r"; StringTokenizer st; /* * First, lets do a quick run thru the string and check * whether the sequence "=?" exists at all. If none exists, * we know there are no encoded-words in here and we can just * return the string as-is, without suffering thru the later * decoding logic. * This handles the most common case of unencoded headers * efficiently. */ if (etext.indexOf("=?") == -1) return etext; // Encoded words found. Start decoding ... st = new StringTokenizer(etext, lwsp, true); StringBuffer sb = new StringBuffer(); // decode buffer StringBuffer wsb = new StringBuffer(); // white space buffer boolean prevWasEncoded = false; while (st.hasMoreTokens()) { char c; String s = st.nextToken(); // If whitespace, append it to the whitespace buffer if (((c = s.charAt(0)) == ' ') || (c == '\t') || (c == '\r') || (c == '\n')) wsb.append(c); else { // Check if token is an 'encoded-word' .. String word; try { word = decodeWord(s); // Yes, this IS an 'encoded-word'. if (!prevWasEncoded && wsb.length() > 0) { // if the previous word was also encoded, we // should ignore the collected whitespace. Else // we include the whitespace as well. sb.append(wsb); } prevWasEncoded = true; } catch (ParseException pex) { // This is NOT an 'encoded-word'. word = s; // possibly decode inner encoded words if (!decodeStrict) { String dword = decodeInnerWords(word); if (dword != word) { // if a different String object was returned, // decoding was done. if (prevWasEncoded && word.startsWith("=?")) { // encoded followed by encoded, // throw away whitespace between } else { // include collected whitespace .. 
if (wsb.length() > 0) sb.append(wsb); } // did original end with encoded? prevWasEncoded = word.endsWith("?="); word = dword; } else { // include collected whitespace .. if (wsb.length() > 0) sb.append(wsb); prevWasEncoded = false; } } else { // include collected whitespace .. if (wsb.length() > 0) sb.append(wsb); prevWasEncoded = false; } } sb.append(word); // append the actual word wsb.setLength(0); // reset wsb for reuse } } sb.append(wsb); // append trailing whitespace return sb.toString(); } /** * Encode a RFC 822 "word" token into mail-safe form as per * RFC 2047.

* * The given Unicode string is examined for non US-ASCII * characters. If the string contains only US-ASCII characters, * it is returned as-is. If the string contains non US-ASCII * characters, it is first character-encoded using the platform's * default charset, then transfer-encoded using either the B or * Q encoding. The resulting bytes are then returned as a Unicode * string containing only ASCII characters.

*
     * This method is meant to be used when creating RFC 822 "phrases".
     * The InternetAddress class, for example, uses this to encode
     * its 'phrase' component.
     *
     * @param   word    Unicode string
     * @return  Unicode string containing only US-ASCII characters
     * @exception UnsupportedEncodingException if the encoding fails
     */
    public static String encodeWord(String word)
            throws UnsupportedEncodingException {
        // Default charset, automatically chosen transfer encoding,
        // and the stricter "word" flavor of the Q encoding.
        return encodeWord(word, null, null, true);
    }

    /**
     * Encode a RFC 822 "word" token into mail-safe form as per
     * RFC 2047.

* * The given Unicode string is examined for non US-ASCII * characters. If the string contains only US-ASCII characters, * it is returned as-is. If the string contains non US-ASCII * characters, it is first character-encoded using the specified * charset, then transfer-encoded using either the B or Q encoding. * The resulting bytes are then returned as a Unicode string * containing only ASCII characters.

* * @param word Unicode string * @param charset the MIME charset * @param encoding the encoding to be used. Currently supported * values are "B" and "Q". If this parameter is null, then * the "Q" encoding is used if most of characters to be * encoded are in the ASCII charset, otherwise "B" encoding * is used. * @return Unicode string containing only US-ASCII characters * @exception UnsupportedEncodingException if the encoding fails */ public static String encodeWord(String word, String charset, String encoding) throws UnsupportedEncodingException { return encodeWord(word, charset, encoding, true); } /* * Encode the given string. The parameter 'encodingWord' should * be true if a RFC 822 "word" token is being encoded and false if a * RFC 822 "text" token is being encoded. This is because the * "Q" encoding defined in RFC 2047 has more restrictions when * encoding "word" tokens. (Sigh) */ private static String encodeWord(String string, String charset, String encoding, boolean encodingWord) throws UnsupportedEncodingException { // If 'string' contains only US-ASCII characters, just // return it. int ascii = checkAscii(string); if (ascii == ALL_ASCII) return string; // Else, apply the specified charset conversion. String jcharset; if (charset == null) { // use default charset jcharset = getDefaultJavaCharset(); // the java charset charset = getDefaultMIMECharset(); // the MIME equivalent } else // MIME charset -> java charset jcharset = javaCharset(charset); // If no transfer-encoding is specified, figure one out. 
if (encoding == null) { if (ascii != MOSTLY_NONASCII) encoding = "Q"; else encoding = "B"; } boolean b64; if (encoding.equalsIgnoreCase("B")) b64 = true; else if (encoding.equalsIgnoreCase("Q")) b64 = false; else throw new UnsupportedEncodingException( "Unknown transfer encoding: " + encoding); StringBuffer outb = new StringBuffer(); // the output buffer doEncode(string, b64, jcharset, // As per RFC 2047, size of an encoded string should not // exceed 75 bytes. // 7 = size of "=?", '?', 'B'/'Q', '?', "?=" 75 - 7 - charset.length(), // the available space "=?" + charset + "?" + encoding + "?", // prefix true, encodingWord, outb); return outb.toString(); } private static void doEncode(String string, boolean b64, String jcharset, int avail, String prefix, boolean first, boolean encodingWord, StringBuffer buf) throws UnsupportedEncodingException { // First find out what the length of the encoded version of // 'string' would be. byte[] bytes = string.getBytes(jcharset); int len; if (b64) // "B" encoding len = BEncoderStream.encodedLength(bytes); else // "Q" len = QEncoderStream.encodedLength(bytes, encodingWord); int size; if ((len > avail) && ((size = string.length()) > 1)) { // If the length is greater than 'avail', split 'string' // into two and recurse. doEncode(string.substring(0, size/2), b64, jcharset, avail, prefix, first, encodingWord, buf); doEncode(string.substring(size/2, size), b64, jcharset, avail, prefix, false, encodingWord, buf); } else { // length <= than 'avail'. 
Encode the given string ByteArrayOutputStream os = new ByteArrayOutputStream(); OutputStream eos; // the encoder if (b64) // "B" encoding eos = new BEncoderStream(os); else // "Q" encoding eos = new QEncoderStream(os, encodingWord); try { // do the encoding eos.write(bytes); eos.close(); } catch (IOException ioex) { } byte[] encodedBytes = os.toByteArray(); // the encoded stuff // Now write out the encoded (all ASCII) bytes into our // StringBuffer if (!first) // not the first line of this sequence if (foldEncodedWords) buf.append("\r\n "); // start a continuation line else buf.append(" "); // line will be folded later buf.append(prefix); for (int i = 0; i < encodedBytes.length; i++) buf.append((char)encodedBytes[i]); buf.append("?="); // terminate the current sequence } } /** * The string is parsed using the rules in RFC 2047 and RFC 2231 for * parsing an "encoded-word". If the parse fails, a ParseException is * thrown. Otherwise, it is transfer-decoded, and then * charset-converted into Unicode. If the charset-conversion * fails, an UnsupportedEncodingException is thrown.

* * @param eword the encoded value * @exception ParseException if the string is not an * encoded-word as per RFC 2047 and RFC 2231. * @exception UnsupportedEncodingException if the charset * conversion failed. */ public static String decodeWord(String eword) throws ParseException, UnsupportedEncodingException { if (!eword.startsWith("=?")) // not an encoded word throw new ParseException( "encoded word does not start with \"=?\": " + eword); // get charset int start = 2; int pos; if ((pos = eword.indexOf('?', start)) == -1) throw new ParseException( "encoded word does not include charset: " + eword); String charset = eword.substring(start, pos); int lpos = charset.indexOf('*'); // RFC 2231 language specified? if (lpos >= 0) // yes, throw it away charset = charset.substring(0, lpos); charset = javaCharset(charset); // get encoding start = pos+1; if ((pos = eword.indexOf('?', start)) == -1) throw new ParseException( "encoded word does not include encoding: " + eword); String encoding = eword.substring(start, pos); // get encoded-sequence start = pos+1; if ((pos = eword.indexOf("?=", start)) == -1) throw new ParseException( "encoded word does not end with \"?=\": " + eword); /* * XXX - should include this, but leaving it out for compatibility... * if (decodeStrict && pos != eword.length() - 2) throw new ParseException( "encoded word does not end with \"?=\": " + eword);); */ String word = eword.substring(start, pos); try { String decodedWord; if (word.length() > 0) { // Extract the bytes from word ByteArrayInputStream bis = new ByteArrayInputStream(ASCIIUtility.getBytes(word)); // Get the appropriate decoder InputStream is; if (encoding.equalsIgnoreCase("B")) is = new BASE64DecoderStream(bis); else if (encoding.equalsIgnoreCase("Q")) is = new QDecoderStream(bis); else throw new UnsupportedEncodingException( "unknown encoding: " + encoding); // For b64 & q, size of decoded word <= size of word. So // the decoded bytes must fit into the 'bytes' array. 
This // is certainly more efficient than writing bytes into a // ByteArrayOutputStream and then pulling out the byte[] // from it. int count = bis.available(); byte[] bytes = new byte[count]; // count is set to the actual number of decoded bytes count = is.read(bytes, 0, count); // Finally, convert the decoded bytes into a String using // the specified charset decodedWord = count <= 0 ? "" : new String(bytes, 0, count, charset); } else { // no characters to decode, return empty string decodedWord = ""; } if (pos + 2 < eword.length()) { // there's still more text in the string String rest = eword.substring(pos + 2); if (!decodeStrict) rest = decodeInnerWords(rest); decodedWord += rest; } return decodedWord; } catch (UnsupportedEncodingException uex) { // explicitly catch and rethrow this exception, otherwise // the below IOException catch will swallow this up! throw uex; } catch (IOException ioex) { // Shouldn't happen. throw new ParseException(ioex.toString()); } catch (IllegalArgumentException iex) { /* An unknown charset of the form ISO-XXX-XXX, will cause * the JDK to throw an IllegalArgumentException ... Since the * JDK will attempt to create a classname using this string, * but valid classnames must not contain the character '-', * and this results in an IllegalArgumentException, rather than * the expected UnsupportedEncodingException. Yikes */ throw new UnsupportedEncodingException(charset); } } /** * Look for encoded words within a word. The MIME spec doesn't * allow this, but many broken mailers, especially Japanese mailers, * produce such incorrect encodings. */ private static String decodeInnerWords(String word) throws UnsupportedEncodingException { int start = 0, i; StringBuffer buf = new StringBuffer(); while ((i = word.indexOf("=?", start)) >= 0) { buf.append(word.substring(start, i)); // find first '?' after opening '=?' - end of charset int end = word.indexOf('?', i + 2); if (end < 0) break; // find next '?' 
after that - end of encoding end = word.indexOf('?', end + 1); if (end < 0) break; // find terminating '?=' end = word.indexOf("?=", end + 1); if (end < 0) break; String s = word.substring(i, end + 2); try { s = decodeWord(s); } catch (ParseException pex) { // ignore it, just use the original string } buf.append(s); start = end + 2; } if (start == 0) return word; if (start < word.length()) buf.append(word.substring(start)); return buf.toString(); } /** * A utility method to quote a word, if the word contains any * characters from the specified 'specials' list.

*
     * The {@code HeaderTokenizer} class defines two special
     * sets of delimiters - MIME and RFC 822.

* * This method is typically used during the generation of * RFC 822 and MIME header fields. * * @param word word to be quoted * @param specials the set of special characters * @return the possibly quoted word * @see javax.mail.internet.HeaderTokenizer#MIME * @see javax.mail.internet.HeaderTokenizer#RFC822 */ public static String quote(String word, String specials) { int len = word == null ? 0 : word.length(); if (len == 0) return "\"\""; // an empty string is handled specially /* * Look for any "bad" characters, Escape and * quote the entire string if necessary. */ boolean needQuoting = false; for (int i = 0; i < len; i++) { char c = word.charAt(i); if (c == '"' || c == '\\' || c == '\r' || c == '\n') { // need to escape them and then quote the whole string StringBuffer sb = new StringBuffer(len + 3); sb.append('"'); sb.append(word.substring(0, i)); int lastc = 0; for (int j = i; j < len; j++) { char cc = word.charAt(j); if ((cc == '"') || (cc == '\\') || (cc == '\r') || (cc == '\n')) if (cc == '\n' && lastc == '\r') ; // do nothing, CR was already escaped else sb.append('\\'); // Escape the character sb.append(cc); lastc = cc; } sb.append('"'); return sb.toString(); } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0) // These characters cause the string to be quoted needQuoting = true; } if (needQuoting) { StringBuffer sb = new StringBuffer(len + 2); sb.append('"').append(word).append('"'); return sb.toString(); } else return word; } /** * Fold a string at linear whitespace so that each line is no longer * than 76 characters, if possible. If there are more than 76 * non-whitespace characters consecutively, the string is folded at * the first whitespace after that sequence. The parameter * used indicates how many characters have been used in * the current line; it is usually the length of the header name.

* * Note that line breaks in the string aren't escaped; they probably * should be. * * @param used characters used in line so far * @param s the string to fold * @return the folded string * @since JavaMail 1.4 */ public static String fold(int used, String s) { if (!foldText) return s; int end; char c; // Strip trailing spaces and newlines for (end = s.length() - 1; end >= 0; end--) { c = s.charAt(end); if (c != ' ' && c != '\t' && c != '\r' && c != '\n') break; } if (end != s.length() - 1) s = s.substring(0, end + 1); // if the string fits now, just return it if (used + s.length() <= 76) return s; // have to actually fold the string StringBuffer sb = new StringBuffer(s.length() + 4); char lastc = 0; while (used + s.length() > 76) { int lastspace = -1; for (int i = 0; i < s.length(); i++) { if (lastspace != -1 && used + i > 76) break; c = s.charAt(i); if (c == ' ' || c == '\t') if (!(lastc == ' ' || lastc == '\t')) lastspace = i; lastc = c; } if (lastspace == -1) { // no space, use the whole thing sb.append(s); s = ""; used = 0; break; } sb.append(s.substring(0, lastspace)); sb.append("\r\n"); lastc = s.charAt(lastspace); sb.append(lastc); s = s.substring(lastspace + 1); used = 1; } sb.append(s); return sb.toString(); } /** * Unfold a folded header. Any line breaks that aren't escaped and * are followed by whitespace are removed. * * @param s the string to unfold * @return the unfolded string * @since JavaMail 1.4 */ public static String unfold(String s) { if (!foldText) return s; StringBuffer sb = null; int i; while ((i = indexOfAny(s, "\r\n")) >= 0) { int start = i; int l = s.length(); i++; // skip CR or NL if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n') i++; // skip LF if (start == 0 || s.charAt(start - 1) != '\\') { char c; // if next line starts with whitespace, skip all of it // XXX - always has to be true? 
if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) { i++; // skip whitespace while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) i++; if (sb == null) sb = new StringBuffer(s.length()); if (start != 0) { sb.append(s.substring(0, start)); sb.append(' '); } s = s.substring(i); continue; } // it's not a continuation line, just leave it in if (sb == null) sb = new StringBuffer(s.length()); sb.append(s.substring(0, i)); s = s.substring(i); } else { // there's a backslash at "start - 1" // strip it out, but leave in the line break if (sb == null) sb = new StringBuffer(s.length()); sb.append(s.substring(0, start - 1)); sb.append(s.substring(start, i)); s = s.substring(i); } } if (sb != null) { sb.append(s); return sb.toString(); } else return s; } /** * Return the first index of any of the characters in "any" in "s", * or -1 if none are found. * * This should be a method on String. */ private static int indexOfAny(String s, String any) { return indexOfAny(s, any, 0); } private static int indexOfAny(String s, String any, int start) { try { int len = s.length(); for (int i = start; i < len; i++) { if (any.indexOf(s.charAt(i)) >= 0) return i; } return -1; } catch (StringIndexOutOfBoundsException e) { return -1; } } /** * Convert a MIME charset name into a valid Java charset name.

* * @param charset the MIME charset name * @return the Java charset equivalent. If a suitable mapping is * not available, the passed in charset is itself returned. */ public static String javaCharset(String charset) { if (mime2java == null || charset == null) // no mapping table, or charset parameter is null return charset; String alias = (String)mime2java.get(charset.toLowerCase(Locale.ENGLISH)); return alias == null ? charset : alias; } /** * Convert a java charset into its MIME charset name.

* * Note that a future version of JDK (post 1.2) might provide * this functionality, in which case, we may deprecate this * method then. * * @param charset the JDK charset * @return the MIME/IANA equivalent. If a mapping * is not possible, the passed in charset itself * is returned. * @since JavaMail 1.1 */ public static String mimeCharset(String charset) { if (java2mime == null || charset == null) // no mapping table or charset param is null return charset; String alias = (String)java2mime.get(charset.toLowerCase(Locale.ENGLISH)); return alias == null ? charset : alias; } private static String defaultJavaCharset; private static String defaultMIMECharset; /** * Get the default charset corresponding to the system's current * default locale. If the System property mail.mime.charset * is set, a system charset corresponding to this MIME charset will be * returned.

*
     * @return  the default charset of the system's default locale,
     *          as a Java charset. (NOT a MIME charset)
     * @since   JavaMail 1.1
     */
    public static String getDefaultJavaCharset() {
        if (defaultJavaCharset == null) {
            /*
             * If mail.mime.charset is set, it controls the default
             * Java charset as well.
             */
            String mimecs = null;
            try {
                mimecs = System.getProperty("mail.mime.charset");
            } catch (SecurityException ex) {
                // not allowed to read the property; treat it as unset
            }
            if (mimecs != null && mimecs.length() > 0) {
                defaultJavaCharset = javaCharset(mimecs);
                return defaultJavaCharset;
            }

            try {
                defaultJavaCharset = System.getProperty("file.encoding",
                                                        "8859_1");
            } catch (SecurityException sex) {
                // Can't read file.encoding either.  Ask an
                // InputStreamReader built without an explicit charset
                // which encoding it picked.
                class NullInputStream extends InputStream {
                    public int read() {
                        return 0;
                    }
                }
                InputStreamReader reader =
                        new InputStreamReader(new NullInputStream());
                defaultJavaCharset = reader.getEncoding();
                if (defaultJavaCharset == null)
                    defaultJavaCharset = "8859_1";
            }
        }

        return defaultJavaCharset;
    }

    /*
     * Get the default MIME charset for this locale.
     *
     * The mail.mime.charset System property wins when it is set to a
     * non-empty value.  An empty value is treated as unset, the same way
     * getDefaultJavaCharset() treats it, so an empty charset name is
     * never cached or returned from the property.  Otherwise the result
     * is the MIME name of the default Java charset.
     */
    static String getDefaultMIMECharset() {
        if (defaultMIMECharset == null) {
            try {
                String mimecs = System.getProperty("mail.mime.charset");
                if (mimecs != null && mimecs.length() > 0)
                    defaultMIMECharset = mimecs;
            } catch (SecurityException ex) {
                // not allowed to read the property; fall through
            }
        }
        if (defaultMIMECharset == null)
            defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
        return defaultMIMECharset;
    }

    // Tables to map MIME charset names to Java names and vice versa.
// XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset private static Hashtable mime2java; private static Hashtable java2mime; static { java2mime = new Hashtable(40); mime2java = new Hashtable(10); try { // Use this class's classloader to load the mapping file // XXX - we should use SecuritySupport, but it's in another package InputStream is = javax.mail.internet.MimeUtility.class.getResourceAsStream( "/META-INF/javamail.charset.map"); if (is != null) { try { is = new LineInputStream(is); // Load the JDK-to-MIME charset mapping table loadMappings((LineInputStream)is, java2mime); // Load the MIME-to-JDK charset mapping table loadMappings((LineInputStream)is, mime2java); } finally { try { is.close(); } catch (Exception cex) { // ignore } } } } catch (Exception ex) { } // If we didn't load the tables, e.g., because we didn't have // permission, load them manually. The entries here should be // the same as the default javamail.charset.map. if (java2mime.isEmpty()) { java2mime.put("8859_1", "ISO-8859-1"); java2mime.put("iso8859_1", "ISO-8859-1"); java2mime.put("iso8859-1", "ISO-8859-1"); java2mime.put("8859_2", "ISO-8859-2"); java2mime.put("iso8859_2", "ISO-8859-2"); java2mime.put("iso8859-2", "ISO-8859-2"); java2mime.put("8859_3", "ISO-8859-3"); java2mime.put("iso8859_3", "ISO-8859-3"); java2mime.put("iso8859-3", "ISO-8859-3"); java2mime.put("8859_4", "ISO-8859-4"); java2mime.put("iso8859_4", "ISO-8859-4"); java2mime.put("iso8859-4", "ISO-8859-4"); java2mime.put("8859_5", "ISO-8859-5"); java2mime.put("iso8859_5", "ISO-8859-5"); java2mime.put("iso8859-5", "ISO-8859-5"); java2mime.put("8859_6", "ISO-8859-6"); java2mime.put("iso8859_6", "ISO-8859-6"); java2mime.put("iso8859-6", "ISO-8859-6"); java2mime.put("8859_7", "ISO-8859-7"); java2mime.put("iso8859_7", "ISO-8859-7"); java2mime.put("iso8859-7", "ISO-8859-7"); java2mime.put("8859_8", "ISO-8859-8"); java2mime.put("iso8859_8", "ISO-8859-8"); java2mime.put("iso8859-8", "ISO-8859-8"); java2mime.put("8859_9", 
"ISO-8859-9"); java2mime.put("iso8859_9", "ISO-8859-9"); java2mime.put("iso8859-9", "ISO-8859-9"); java2mime.put("sjis", "Shift_JIS"); java2mime.put("jis", "ISO-2022-JP"); java2mime.put("iso2022jp", "ISO-2022-JP"); java2mime.put("euc_jp", "euc-jp"); java2mime.put("koi8_r", "koi8-r"); java2mime.put("euc_cn", "euc-cn"); java2mime.put("euc_tw", "euc-tw"); java2mime.put("euc_kr", "euc-kr"); } if (mime2java.isEmpty()) { mime2java.put("iso-2022-cn", "ISO2022CN"); mime2java.put("iso-2022-kr", "ISO2022KR"); mime2java.put("utf-8", "UTF8"); mime2java.put("utf8", "UTF8"); mime2java.put("ja_jp.iso2022-7", "ISO2022JP"); mime2java.put("ja_jp.eucjp", "EUCJIS"); mime2java.put("euc-kr", "KSC5601"); mime2java.put("euckr", "KSC5601"); mime2java.put("us-ascii", "ISO-8859-1"); mime2java.put("x-us-ascii", "ISO-8859-1"); } } private static void loadMappings(LineInputStream is, Hashtable table) { String currLine; while (true) { try { currLine = is.readLine(); } catch (IOException ioex) { break; // error in reading, stop } if (currLine == null) // end of file, stop break; if (currLine.startsWith("--") && currLine.endsWith("--")) // end of this table break; // ignore empty lines and comments if (currLine.trim().length() == 0 || currLine.startsWith("#")) continue; // A valid entry is of the form // where, := SPACE | HT. Parse this StringTokenizer tk = new StringTokenizer(currLine, " \t"); try { String key = tk.nextToken(); String value = tk.nextToken(); table.put(key.toLowerCase(Locale.ENGLISH), value); } catch (NoSuchElementException nex) { } } } static final int ALL_ASCII = 1; static final int MOSTLY_ASCII = 2; static final int MOSTLY_NONASCII = 3; /** * Check if the given string contains non US-ASCII characters. * @param s string * @return ALL_ASCII if all characters in the string * belong to the US-ASCII charset. MOSTLY_ASCII * if more than half of the available characters * are US-ASCII characters. Else MOSTLY_NONASCII. 
*/ static int checkAscii(String s) { int ascii = 0, non_ascii = 0; int l = s.length(); for (int i = 0; i < l; i++) { if (nonascii((int)s.charAt(i))) // non-ascii non_ascii++; else ascii++; } if (non_ascii == 0) return ALL_ASCII; if (ascii > non_ascii) return MOSTLY_ASCII; return MOSTLY_NONASCII; } /** * Check if the given byte array contains non US-ASCII characters. * @param b byte array * @return ALL_ASCII if all characters in the string * belong to the US-ASCII charset. MOSTLY_ASCII * if more than half of the available characters * are US-ASCII characters. Else MOSTLY_NONASCII. * * XXX - this method is no longer used */ static int checkAscii(byte[] b) { int ascii = 0, non_ascii = 0; for (int i=0; i < b.length; i++) { // The '&' operator automatically causes b[i] to be promoted // to an int, and we mask out the higher bytes in the int // so that the resulting value is not a negative integer. if (nonascii(b[i] & 0xff)) // non-ascii non_ascii++; else ascii++; } if (non_ascii == 0) return ALL_ASCII; if (ascii > non_ascii) return MOSTLY_ASCII; return MOSTLY_NONASCII; } /** * Check if the given input stream contains non US-ASCII characters. * Upto max bytes are checked. If max is * set to ALL, then all the bytes available in this * input stream are checked. If breakOnNonAscii is true * the check terminates when the first non-US-ASCII character is * found and MOSTLY_NONASCII is returned. Else, the check continues * till max bytes or till the end of stream. * * @param is the input stream * @param max maximum bytes to check for. The special value * ALL indicates that all the bytes in this input * stream must be checked. * @param breakOnNonAscii if true, then terminate the * the check when the first non-US-ASCII character * is found. * @return ALL_ASCII if all characters in the string * belong to the US-ASCII charset. MOSTLY_ASCII * if more than half of the available characters * are US-ASCII characters. Else MOSTLY_NONASCII. 
*/
    static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
        int ascii = 0, non_ascii = 0;
        int len;
        int block = 4096;
        int linelen = 0;
        boolean longLine = false, badEOL = false;
        // EOL pairing is only checked when both flags are set
        boolean checkEOL = encodeEolStrict && breakOnNonAscii;
        byte buf[] = null;
        if (max != 0) {
            block = (max == ALL) ? 4096 : Math.min(max, 4096);
            buf = new byte[block];
        }

        // Previous byte seen.  This lives outside the read loop so that a
        // CRLF pair split across two reads is still seen as a pair and
        // not reported as a bare CR followed by a bare LF.
        int lastb = 0;

        while (max != 0) {
            // Never ask for more than what is left of the budget;
            // otherwise a short read followed by a full-block read
            // would overshoot "max" and drive it negative.
            int want = (max == ALL) ? block : Math.min(block, max);
            try {
                if ((len = is.read(buf, 0, want)) == -1)
                    break;
                for (int i = 0; i < len; i++) {
                    // The '&' operator automatically causes b[i] to
                    // be promoted to an int, and we mask out the higher
                    // bytes in the int so that the resulting value is
                    // not a negative integer.
                    int b = buf[i] & 0xff;
                    if (checkEOL &&
                            ((lastb == '\r' && b != '\n') ||
                            (lastb != '\r' && b == '\n')))
                        badEOL = true;
                    if (b == '\r' || b == '\n')
                        linelen = 0;
                    else {
                        linelen++;
                        if (linelen > 998)      // 1000 - CRLF
                            longLine = true;
                    }
                    if (nonascii(b)) {  // non-ascii
                        if (breakOnNonAscii) // we are done
                            return MOSTLY_NONASCII;
                        else
                            non_ascii++;
                    } else
                        ascii++;
                    lastb = b;
                }
            } catch (IOException ioex) {
                // treat a read failure like end of stream
                break;
            }
            if (max != ALL)
                max -= len;
        }

        if (max == 0 && breakOnNonAscii)
            // We have been told to break on the first non-ascii character.
            // We haven't got any non-ascii character yet, but then we
            // have not checked all of the available bytes either. So we
            // cannot say for sure that this input stream is ALL_ASCII,
            // and hence we must play safe and return MOSTLY_NONASCII
            return MOSTLY_NONASCII;

        if (non_ascii == 0) { // no non-us-ascii characters so far
            // If we're looking at non-text data, and we saw CR without LF
            // or vice versa, consider this mostly non-ASCII so that it
            // will be base64 encoded (since the quoted-printable encoder
            // doesn't encode this case properly).
            if (badEOL)
                return MOSTLY_NONASCII;
            // if we've seen a long line, we degrade to mostly ascii
            else if (longLine)
                return MOSTLY_ASCII;
            else
                return ALL_ASCII;
        }
        if (ascii > non_ascii) // mostly ascii
            return MOSTLY_ASCII;
        return MOSTLY_NONASCII;
    }

    /*
     * Return true for values of 0177 (DEL) and above, and for values
     * below 040 (space) other than CR, LF and TAB.
     */
    static final boolean nonascii(int b) {
        return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
    }
}

/**
 * An OutputStream that determines whether the data written to
 * it is all ASCII, mostly ASCII, or mostly non-ASCII.
 */
class AsciiOutputStream extends OutputStream {
    private boolean breakOnNonAscii;
    private int ascii = 0, non_ascii = 0;
    private int linelen = 0;            // length of the current line
    private boolean longLine = false;   // saw a line longer than 998
    private boolean badEOL = false;     // saw CR without LF or vice versa
    private boolean checkEOL = false;
    private int lastb = 0;              // previous byte written
    private int ret = 0;                // final answer once decided, else 0

    /**
     * @param   breakOnNonAscii if true, stop at the first non-ASCII byte
     *                          by throwing EOFException from write
     * @param   encodeEolStrict combined with breakOnNonAscii to decide
     *                          whether CR/LF pairing is checked
     */
    public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
        this.breakOnNonAscii = breakOnNonAscii;
        checkEOL = encodeEolStrict && breakOnNonAscii;
    }

    public void write(int b) throws IOException {
        check(b);
    }

    public void write(byte b[]) throws IOException {
        write(b, 0, b.length);
    }

    public void write(byte b[], int off, int len) throws IOException {
        len += off;     // len is now the end index (exclusive)
        for (int i = off; i < len ; i++)
            check(b[i]);
    }

    /*
     * Account for a single byte.  Throws EOFException after recording
     * MOSTLY_NONASCII when breakOnNonAscii is set and the byte is
     * non-ASCII.
     */
    private final void check(int b) throws IOException {
        b &= 0xff;
        if (checkEOL &&
                ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
            badEOL = true;
        if (b == '\r' || b == '\n')
            linelen = 0;
        else {
            linelen++;
            if (linelen > 998)  // 1000 - CRLF
                longLine = true;
        }
        if (MimeUtility.nonascii(b)) { // non-ascii
            non_ascii++;
            if (breakOnNonAscii) {      // we are done
                ret = MimeUtility.MOSTLY_NONASCII;
                throw new EOFException();
            }
        } else
            ascii++;
        lastb = b;
    }

    /**
     * Return ASCII-ness of data stream.
     */
    public int getAscii() {
        if (ret != 0)
            return ret;
        // If we're looking at non-text data, and we saw CR without LF
        // or vice versa, consider this mostly non-ASCII so that it
        // will be base64 encoded (since the quoted-printable encoder
        // doesn't encode this case properly).
        if (badEOL)
            return MimeUtility.MOSTLY_NONASCII;
        else if (non_ascii == 0) { // no non-us-ascii characters so far
            // if we've seen a long line, we degrade to mostly ascii
            if (longLine)
                return MimeUtility.MOSTLY_ASCII;
            else
                return MimeUtility.ALL_ASCII;
        }
        if (ascii > non_ascii) // mostly ascii
            return MimeUtility.MOSTLY_ASCII;
        return MimeUtility.MOSTLY_NONASCII;
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy