001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.codec;
021    
022    import java.nio.ByteBuffer;
023    import java.nio.charset.Charset;
024    import java.util.BitSet;
025    import java.util.Locale;
026    
027    import org.apache.james.mime4j.util.CharsetUtil;
028    
029    /**
030     * Static methods for encoding header field values. This includes encoded-words
031     * as defined in <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a>
032     * or display-names of an e-mail address, for example.
033     */
034    public class EncoderUtil {
035        private static final byte[] BASE64_TABLE = Base64OutputStream.BASE64_TABLE;
036        private static final char BASE64_PAD = '=';
037    
038        private static final BitSet Q_REGULAR_CHARS = initChars("=_?");
039    
040        private static final BitSet Q_RESTRICTED_CHARS = initChars("=_?\"#$%&'(),.:;<>@[\\]^`{|}~");
041    
042        private static final int MAX_USED_CHARACTERS = 50;
043    
044        private static final String ENC_WORD_PREFIX = "=?";
045        private static final String ENC_WORD_SUFFIX = "?=";
046    
047        private static final int ENCODED_WORD_MAX_LENGTH = 75; // RFC 2047
048    
049        private static final BitSet TOKEN_CHARS = initChars("()<>@,;:\\\"/[]?=");
050    
051        private static final BitSet ATEXT_CHARS = initChars("()<>@.,;:\\\"[]");
052    
053        private static BitSet initChars(String specials) {
054            BitSet bs = new BitSet(128);
055            for (char ch = 33; ch < 127; ch++) {
056                if (specials.indexOf(ch) == -1) {
057                    bs.set(ch);
058                }
059            }
060            return bs;
061        }
062    
063        /**
064         * Selects one of the two encodings specified in RFC 2047.
065         */
066        public enum Encoding {
067            /** The B encoding (identical to base64 defined in RFC 2045). */
068            B,
069            /** The Q encoding (similar to quoted-printable defined in RFC 2045). */
070            Q
071        }
072    
073        /**
074         * Indicates the intended usage of an encoded word.
075         */
076        public enum Usage {
077            /**
078             * Encoded word is used to replace a 'text' token in any Subject or
079             * Comments header field.
080             */
081            TEXT_TOKEN,
082            /**
083             * Encoded word is used to replace a 'word' entity within a 'phrase',
084             * for example, one that precedes an address in a From, To, or Cc
085             * header.
086             */
087            WORD_ENTITY
088        }
089    
090        private EncoderUtil() {
091        }
092    
093        /**
094         * Encodes the display-name portion of an address. See <a
095         * href='http://www.faqs.org/rfcs/rfc5322.html'>RFC 5322</a> section 3.4
096         * and <a href='http://www.faqs.org/rfcs/rfc2047.html'>RFC 2047</a> section
097         * 5.3. The specified string should not be folded.
098         *
099         * @param displayName
100         *            display-name to encode.
101         * @return encoded display-name.
102         */
103        public static String encodeAddressDisplayName(String displayName) {
104            // display-name = phrase
105            // phrase = 1*( encoded-word / word )
106            // word = atom / quoted-string
107            // atom = [CFWS] 1*atext [CFWS]
108            // CFWS = comment or folding white space
109    
110            if (isAtomPhrase(displayName)) {
111                return displayName;
112            } else if (hasToBeEncoded(displayName, 0)) {
113                return encodeEncodedWord(displayName, Usage.WORD_ENTITY);
114            } else {
115                return quote(displayName);
116            }
117        }
118    
119        /**
120         * Encodes the local part of an address specification as described in RFC
121         * 5322 section 3.4.1. Leading and trailing CFWS should have been removed
122         * before calling this method. The specified string should not contain any
123         * illegal (control or non-ASCII) characters.
124         *
125         * @param localPart
126         *            the local part to encode
127         * @return the encoded local part.
128         */
129        public static String encodeAddressLocalPart(String localPart) {
130            // local-part = dot-atom / quoted-string
131            // dot-atom = [CFWS] dot-atom-text [CFWS]
132            // CFWS = comment or folding white space
133    
134            if (isDotAtomText(localPart)) {
135                return localPart;
136            } else {
137                return quote(localPart);
138            }
139        }
140    
141        /**
142         * Encodes the specified strings into a header parameter as described in RFC
143         * 2045 section 5.1 and RFC 2183 section 2. The specified strings should not
144         * contain any illegal (control or non-ASCII) characters.
145         *
146         * @param name
147         *            parameter name.
148         * @param value
149         *            parameter value.
150         * @return encoded result.
151         */
152        public static String encodeHeaderParameter(String name, String value) {
153            name = name.toLowerCase(Locale.US);
154    
155            // value := token / quoted-string
156            if (isToken(value)) {
157                return name + "=" + value;
158            } else {
159                return name + "=" + quote(value);
160            }
161        }
162    
163        /**
164         * Shortcut method that encodes the specified text into an encoded-word if
165         * the text has to be encoded.
166         *
167         * @param text
168         *            text to encode.
169         * @param usage
170         *            whether the encoded-word is to be used to replace a text token
171         *            or a word entity (see RFC 822).
172         * @param usedCharacters
173         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
174         * @return the specified text if encoding is not necessary or an encoded
175         *         word or a sequence of encoded words otherwise.
176         */
177        public static String encodeIfNecessary(String text, Usage usage,
178                int usedCharacters) {
179            if (hasToBeEncoded(text, usedCharacters))
180                return encodeEncodedWord(text, usage, usedCharacters);
181            else
182                return text;
183        }
184    
185        /**
186         * Determines if the specified string has to encoded into an encoded-word.
187         * Returns <code>true</code> if the text contains characters that don't
188         * fall into the printable ASCII character set or if the text contains a
189         * 'word' (sequence of non-whitespace characters) longer than 77 characters
190         * (including characters already used up in the line).
191         *
192         * @param text
193         *            text to analyze.
194         * @param usedCharacters
195         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
196         * @return <code>true</code> if the specified text has to be encoded into
197         *         an encoded-word, <code>false</code> otherwise.
198         */
199        public static boolean hasToBeEncoded(String text, int usedCharacters) {
200            if (text == null)
201                throw new IllegalArgumentException();
202            if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
203                throw new IllegalArgumentException();
204    
205            int nonWhiteSpaceCount = usedCharacters;
206    
207            for (int idx = 0; idx < text.length(); idx++) {
208                char ch = text.charAt(idx);
209                if (ch == '\t' || ch == ' ') {
210                    nonWhiteSpaceCount = 0;
211                } else {
212                    nonWhiteSpaceCount++;
213                    if (nonWhiteSpaceCount > 77) {
214                        // Line cannot be folded into multiple lines with no more
215                        // than 78 characters each. Encoding as encoded-words makes
216                        // that possible. One character has to be reserved for
217                        // folding white space; that leaves 77 characters.
218                        return true;
219                    }
220    
221                    if (ch < 32 || ch >= 127) {
222                        // non-printable ascii character has to be encoded
223                        return true;
224                    }
225                }
226            }
227    
228            return false;
229        }
230    
231        /**
232         * Encodes the specified text into an encoded word or a sequence of encoded
233         * words separated by space. The text is separated into a sequence of
234         * encoded words if it does not fit in a single one.
235         * <p>
236         * The charset to encode the specified text into a byte array and the
237         * encoding to use for the encoded-word are detected automatically.
238         * <p>
239         * This method assumes that zero characters have already been used up in the
240         * current line.
241         *
242         * @param text
243         *            text to encode.
244         * @param usage
245         *            whether the encoded-word is to be used to replace a text token
246         *            or a word entity (see RFC 822).
247         * @return the encoded word (or sequence of encoded words if the given text
248         *         does not fit in a single encoded word).
249         * @see #hasToBeEncoded(String, int)
250         */
251        public static String encodeEncodedWord(String text, Usage usage) {
252            return encodeEncodedWord(text, usage, 0, null, null);
253        }
254    
255        /**
256         * Encodes the specified text into an encoded word or a sequence of encoded
257         * words separated by space. The text is separated into a sequence of
258         * encoded words if it does not fit in a single one.
259         * <p>
260         * The charset to encode the specified text into a byte array and the
261         * encoding to use for the encoded-word are detected automatically.
262         *
263         * @param text
264         *            text to encode.
265         * @param usage
266         *            whether the encoded-word is to be used to replace a text token
267         *            or a word entity (see RFC 822).
268         * @param usedCharacters
269         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
270         * @return the encoded word (or sequence of encoded words if the given text
271         *         does not fit in a single encoded word).
272         * @see #hasToBeEncoded(String, int)
273         */
274        public static String encodeEncodedWord(String text, Usage usage,
275                int usedCharacters) {
276            return encodeEncodedWord(text, usage, usedCharacters, null, null);
277        }
278    
279        /**
280         * Encodes the specified text into an encoded word or a sequence of encoded
281         * words separated by space. The text is separated into a sequence of
282         * encoded words if it does not fit in a single one.
283         *
284         * @param text
285         *            text to encode.
286         * @param usage
287         *            whether the encoded-word is to be used to replace a text token
288         *            or a word entity (see RFC 822).
289         * @param usedCharacters
290         *            number of characters already used up (<code>0 <= usedCharacters <= 50</code>).
291         * @param charset
292         *            the Java charset that should be used to encode the specified
293         *            string into a byte array. A suitable charset is detected
294         *            automatically if this parameter is <code>null</code>.
295         * @param encoding
296         *            the encoding to use for the encoded-word (either B or Q). A
297         *            suitable encoding is automatically chosen if this parameter is
298         *            <code>null</code>.
299         * @return the encoded word (or sequence of encoded words if the given text
300         *         does not fit in a single encoded word).
301         * @see #hasToBeEncoded(String, int)
302         */
303        public static String encodeEncodedWord(String text, Usage usage,
304                int usedCharacters, Charset charset, Encoding encoding) {
305            if (text == null)
306                throw new IllegalArgumentException();
307            if (usedCharacters < 0 || usedCharacters > MAX_USED_CHARACTERS)
308                throw new IllegalArgumentException();
309    
310            if (charset == null)
311                charset = determineCharset(text);
312    
313            byte[] bytes = encode(text, charset);
314    
315            if (encoding == null)
316                encoding = determineEncoding(bytes, usage);
317    
318            if (encoding == Encoding.B) {
319                String prefix = ENC_WORD_PREFIX + charset.name() + "?B?";
320                return encodeB(prefix, text, usedCharacters, charset, bytes);
321            } else {
322                String prefix = ENC_WORD_PREFIX + charset.name() + "?Q?";
323                return encodeQ(prefix, text, usage, usedCharacters, charset, bytes);
324            }
325        }
326    
327        /**
328         * Encodes the specified byte array using the B encoding defined in RFC
329         * 2047.
330         *
331         * @param bytes
332         *            byte array to encode.
333         * @return encoded string.
334         */
335        public static String encodeB(byte[] bytes) {
336            StringBuilder sb = new StringBuilder();
337    
338            int idx = 0;
339            final int end = bytes.length;
340            for (; idx < end - 2; idx += 3) {
341                int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8
342                        | bytes[idx + 2] & 0xff;
343                sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
344                sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
345                sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
346                sb.append((char) BASE64_TABLE[data & 0x3f]);
347            }
348    
349            if (idx == end - 2) {
350                int data = (bytes[idx] & 0xff) << 16 | (bytes[idx + 1] & 0xff) << 8;
351                sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
352                sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
353                sb.append((char) BASE64_TABLE[data >> 6 & 0x3f]);
354                sb.append(BASE64_PAD);
355    
356            } else if (idx == end - 1) {
357                int data = (bytes[idx] & 0xff) << 16;
358                sb.append((char) BASE64_TABLE[data >> 18 & 0x3f]);
359                sb.append((char) BASE64_TABLE[data >> 12 & 0x3f]);
360                sb.append(BASE64_PAD);
361                sb.append(BASE64_PAD);
362            }
363    
364            return sb.toString();
365        }
366    
367        /**
368         * Encodes the specified byte array using the Q encoding defined in RFC
369         * 2047.
370         *
371         * @param bytes
372         *            byte array to encode.
373         * @param usage
374         *            whether the encoded-word is to be used to replace a text token
375         *            or a word entity (see RFC 822).
376         * @return encoded string.
377         */
378        public static String encodeQ(byte[] bytes, Usage usage) {
379            BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
380                    : Q_RESTRICTED_CHARS;
381    
382            StringBuilder sb = new StringBuilder();
383    
384            final int end = bytes.length;
385            for (int idx = 0; idx < end; idx++) {
386                int v = bytes[idx] & 0xff;
387                if (v == 32) {
388                    sb.append('_');
389                } else if (!qChars.get(v)) {
390                    sb.append('=');
391                    sb.append(hexDigit(v >>> 4));
392                    sb.append(hexDigit(v & 0xf));
393                } else {
394                    sb.append((char) v);
395                }
396            }
397    
398            return sb.toString();
399        }
400    
401        /**
402         * Tests whether the specified string is a token as defined in RFC 2045
403         * section 5.1.
404         *
405         * @param str
406         *            string to test.
407         * @return <code>true</code> if the specified string is a RFC 2045 token,
408         *         <code>false</code> otherwise.
409         */
410        public static boolean isToken(String str) {
411            // token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
412            // tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" /
413            // <"> / "/" / "[" / "]" / "?" / "="
414            // CTL := 0.- 31., 127.
415    
416            final int length = str.length();
417            if (length == 0)
418                return false;
419    
420            for (int idx = 0; idx < length; idx++) {
421                char ch = str.charAt(idx);
422                if (!TOKEN_CHARS.get(ch))
423                    return false;
424            }
425    
426            return true;
427        }
428    
429        private static boolean isAtomPhrase(String str) {
430            // atom = [CFWS] 1*atext [CFWS]
431    
432            boolean containsAText = false;
433    
434            final int length = str.length();
435            for (int idx = 0; idx < length; idx++) {
436                char ch = str.charAt(idx);
437                if (ATEXT_CHARS.get(ch)) {
438                    containsAText = true;
439                } else if (!CharsetUtil.isWhitespace(ch)) {
440                    return false;
441                }
442            }
443    
444            return containsAText;
445        }
446    
447        // RFC 5322 section 3.2.3
448        private static boolean isDotAtomText(String str) {
449            // dot-atom-text = 1*atext *("." 1*atext)
450            // atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / "&" / "'" / "*" /
451            // "+" / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}" / "~"
452    
453            char prev = '.';
454    
455            final int length = str.length();
456            if (length == 0)
457                return false;
458    
459            for (int idx = 0; idx < length; idx++) {
460                char ch = str.charAt(idx);
461    
462                if (ch == '.') {
463                    if (prev == '.' || idx == length - 1)
464                        return false;
465                } else {
466                    if (!ATEXT_CHARS.get(ch))
467                        return false;
468                }
469    
470                prev = ch;
471            }
472    
473            return true;
474        }
475    
476        // RFC 5322 section 3.2.4
477        private static String quote(String str) {
478            // quoted-string = [CFWS] DQUOTE *([FWS] qcontent) [FWS] DQUOTE [CFWS]
479            // qcontent = qtext / quoted-pair
480            // qtext = %d33 / %d35-91 / %d93-126
481            // quoted-pair = ("\" (VCHAR / WSP))
482            // VCHAR = %x21-7E
483            // DQUOTE = %x22
484    
485            String escaped = str.replaceAll("[\\\\\"]", "\\\\$0");
486            return "\"" + escaped + "\"";
487        }
488    
489        private static String encodeB(String prefix, String text,
490                int usedCharacters, Charset charset, byte[] bytes) {
491            int encodedLength = bEncodedLength(bytes);
492    
493            int totalLength = prefix.length() + encodedLength
494                    + ENC_WORD_SUFFIX.length();
495            if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
496                return prefix + encodeB(bytes) + ENC_WORD_SUFFIX;
497            } else {
498                String part1 = text.substring(0, text.length() / 2);
499                byte[] bytes1 = encode(part1, charset);
500                String word1 = encodeB(prefix, part1, usedCharacters, charset,
501                        bytes1);
502    
503                String part2 = text.substring(text.length() / 2);
504                byte[] bytes2 = encode(part2, charset);
505                String word2 = encodeB(prefix, part2, 0, charset, bytes2);
506    
507                return word1 + " " + word2;
508            }
509        }
510    
511        private static int bEncodedLength(byte[] bytes) {
512            return (bytes.length + 2) / 3 * 4;
513        }
514    
515        private static String encodeQ(String prefix, String text, Usage usage,
516                int usedCharacters, Charset charset, byte[] bytes) {
517            int encodedLength = qEncodedLength(bytes, usage);
518    
519            int totalLength = prefix.length() + encodedLength
520                    + ENC_WORD_SUFFIX.length();
521            if (totalLength <= ENCODED_WORD_MAX_LENGTH - usedCharacters) {
522                return prefix + encodeQ(bytes, usage) + ENC_WORD_SUFFIX;
523            } else {
524                String part1 = text.substring(0, text.length() / 2);
525                byte[] bytes1 = encode(part1, charset);
526                String word1 = encodeQ(prefix, part1, usage, usedCharacters,
527                        charset, bytes1);
528    
529                String part2 = text.substring(text.length() / 2);
530                byte[] bytes2 = encode(part2, charset);
531                String word2 = encodeQ(prefix, part2, usage, 0, charset, bytes2);
532    
533                return word1 + " " + word2;
534            }
535        }
536    
537        private static int qEncodedLength(byte[] bytes, Usage usage) {
538            BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
539                    : Q_RESTRICTED_CHARS;
540    
541            int count = 0;
542    
543            for (int idx = 0; idx < bytes.length; idx++) {
544                int v = bytes[idx] & 0xff;
545                if (v == 32) {
546                    count++;
547                } else if (!qChars.get(v)) {
548                    count += 3;
549                } else {
550                    count++;
551                }
552            }
553    
554            return count;
555        }
556    
557        private static byte[] encode(String text, Charset charset) {
558            ByteBuffer buffer = charset.encode(text);
559            byte[] bytes = new byte[buffer.limit()];
560            buffer.get(bytes);
561            return bytes;
562        }
563    
564        private static Charset determineCharset(String text) {
565            // it is an important property of iso-8859-1 that it directly maps
566            // unicode code points 0000 to 00ff to byte values 00 to ff.
567            boolean ascii = true;
568            final int len = text.length();
569            for (int index = 0; index < len; index++) {
570                char ch = text.charAt(index);
571                if (ch > 0xff) {
572                    return CharsetUtil.UTF_8;
573                }
574                if (ch > 0x7f) {
575                    ascii = false;
576                }
577            }
578            return ascii ? CharsetUtil.US_ASCII : CharsetUtil.ISO_8859_1;
579        }
580    
581        private static Encoding determineEncoding(byte[] bytes, Usage usage) {
582            if (bytes.length == 0)
583                return Encoding.Q;
584    
585            BitSet qChars = usage == Usage.TEXT_TOKEN ? Q_REGULAR_CHARS
586                    : Q_RESTRICTED_CHARS;
587    
588            int qEncoded = 0;
589            for (int i = 0; i < bytes.length; i++) {
590                int v = bytes[i] & 0xff;
591                if (v != 32 && !qChars.get(v)) {
592                    qEncoded++;
593                }
594            }
595    
596            int percentage = qEncoded * 100 / bytes.length;
597            return percentage > 30 ? Encoding.B : Encoding.Q;
598        }
599    
600        private static char hexDigit(int i) {
601            return i < 10 ? (char) (i + '0') : (char) (i - 10 + 'A');
602        }
603    }