001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.stream;
021    
022    import java.util.ArrayList;
023    import java.util.BitSet;
024    import java.util.List;
025    
026    import org.apache.james.mime4j.MimeException;
027    import org.apache.james.mime4j.util.ByteSequence;
028    import org.apache.james.mime4j.util.CharsetUtil;
029    import org.apache.james.mime4j.util.ContentUtil;
030    
031    /**
032     * Low level parser for header field elements. The parsing routines of this class are designed
033     * to produce near zero intermediate garbage and make no intermediate copies of input data.
034     * <p>
035     * This class is immutable and thread safe.
036     */
037    public class RawFieldParser {
038    
039        public static BitSet INIT_BITSET(int ... b) {
040            BitSet bitset = new BitSet(b.length);
041            for (int i = 0; i < b.length; i++) {
042                bitset.set(b[i]);
043            }
044            return bitset;
045        }
046    
047        static final BitSet COLON                   = INIT_BITSET(':');
048        static final BitSet EQUAL_OR_SEMICOLON      = INIT_BITSET('=', ';');
049        static final BitSet SEMICOLON               = INIT_BITSET(';');
050    
051        public static final RawFieldParser DEFAULT = new RawFieldParser();
052    
053        /**
054         * Parses the sequence of bytes into {@link RawField}.
055         *
056         * @throws MimeException if the input data does not contain a valid MIME field.
057         */
058        public RawField parseField(final ByteSequence raw) throws MimeException {
059            if (raw == null) {
060                return null;
061            }
062            ParserCursor cursor = new ParserCursor(0, raw.length());
063            String name = parseToken(raw, cursor, COLON);
064            if (cursor.atEnd()) {
065                throw new MimeException("Invalid MIME field: no name/value separator found: " +
066                        raw.toString());
067            }
068            return new RawField(raw, cursor.getPos(), name, null);
069        }
070    
071        /**
072         * Parses the field body containing a value with parameters into {@link RawBody}.
073         *
074         * @param field unstructured (raw) field
075         */
076        public RawBody parseRawBody(final RawField field) {
077            ByteSequence buf = field.getRaw();
078            int pos = field.getDelimiterIdx() + 1;
079            if (buf == null) {
080                String body = field.getBody();
081                if (body == null) {
082                    return new RawBody("", null);
083                }
084                buf = ContentUtil.encode(body);
085                pos = 0;
086            }
087            ParserCursor cursor = new ParserCursor(pos, buf.length());
088            return parseRawBody(buf, cursor);
089        }
090    
091        /**
092         * Parses the sequence of bytes containing a value with parameters into {@link RawBody}.
093         *
094         * @param buf buffer with the sequence of bytes to be parsed
095         * @param cursor defines the bounds and current position of the buffer
096         */
097        public RawBody parseRawBody(final ByteSequence buf, final ParserCursor cursor) {
098            String value = parseToken(buf, cursor, SEMICOLON);
099            if (cursor.atEnd()) {
100                return new RawBody(value, new ArrayList<NameValuePair>());
101            }
102            cursor.updatePos(cursor.getPos() + 1);
103            List<NameValuePair> params = parseParameters(buf, cursor);
104            return new RawBody(value, params);
105        }
106    
107        /**
108         * Parses the sequence of bytes containing field parameters delimited with semicolon into
109         * a list of {@link NameValuePair}s.
110         *
111         * @param buf buffer with the sequence of bytes to be parsed
112         * @param cursor defines the bounds and current position of the buffer
113         */
114        public List<NameValuePair> parseParameters(final ByteSequence buf, final ParserCursor cursor) {
115            List<NameValuePair> params = new ArrayList<NameValuePair>();
116            skipWhiteSpace(buf, cursor);
117            while (!cursor.atEnd()) {
118                NameValuePair param = parseParameter(buf, cursor);
119                params.add(param);
120            }
121            return params;
122        }
123    
124        /**
125         * Parses the sequence of bytes containing a field parameter delimited with semicolon into
126         * {@link NameValuePair}.
127         *
128         * @param buf buffer with the sequence of bytes to be parsed
129         * @param cursor defines the bounds and current position of the buffer
130         */
131        public NameValuePair parseParameter(final ByteSequence buf, final ParserCursor cursor) {
132            String name = parseToken(buf, cursor, EQUAL_OR_SEMICOLON);
133            if (cursor.atEnd()) {
134                return new NameValuePair(name, null);
135            }
136            int delim = buf.byteAt(cursor.getPos());
137            cursor.updatePos(cursor.getPos() + 1);
138            if (delim == ';') {
139                return new NameValuePair(name, null);
140            }
141            String value = parseValue(buf, cursor, SEMICOLON);
142            if (!cursor.atEnd()) {
143                cursor.updatePos(cursor.getPos() + 1);
144            }
145            return new NameValuePair(name, value);
146        }
147    
148        /**
149         * Extracts from the sequence of bytes a token terminated with any of the given delimiters
150         * discarding semantically insignificant whitespace characters and comments.
151         *
152         * @param buf buffer with the sequence of bytes to be parsed
153         * @param cursor defines the bounds and current position of the buffer
154         * @param delimiters set of delimiting characters. Can be <code>null</code> if the token
155         *  is not delimited by any character.
156         */
157        public String parseToken(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
158            StringBuilder dst = new StringBuilder();
159            boolean whitespace = false;
160            while (!cursor.atEnd()) {
161                char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
162                if (delimiters != null && delimiters.get(current)) {
163                    break;
164                } else if (CharsetUtil.isWhitespace(current)) {
165                    skipWhiteSpace(buf, cursor);
166                    whitespace = true;
167                } else if (current == '(') {
168                    skipComment(buf, cursor);
169                } else {
170                    if (dst.length() > 0 && whitespace) {
171                        dst.append(' ');
172                    }
173                    copyContent(buf, cursor, delimiters, dst);
174                    whitespace = false;
175                }
176            }
177            return dst.toString();
178        }
179    
180        /**
181         * Extracts from the sequence of bytes a value which can be enclosed in quote marks and
182         * terminated with any of the given delimiters discarding semantically insignificant
183         * whitespace characters and comments.
184         *
185         * @param buf buffer with the sequence of bytes to be parsed
186         * @param cursor defines the bounds and current position of the buffer
187         * @param delimiters set of delimiting characters. Can be <code>null</code> if the value
188         *  is not delimited by any character.
189         */
190        public String parseValue(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters) {
191            StringBuilder dst = new StringBuilder();
192            boolean whitespace = false;
193            while (!cursor.atEnd()) {
194                char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
195                if (delimiters != null && delimiters.get(current)) {
196                    break;
197                } else if (CharsetUtil.isWhitespace(current)) {
198                    skipWhiteSpace(buf, cursor);
199                    whitespace = true;
200                } else if (current == '(') {
201                    skipComment(buf, cursor);
202                } else if (current == '\"') {
203                    if (dst.length() > 0 && whitespace) {
204                        dst.append(' ');
205                    }
206                    copyQuotedContent(buf, cursor, dst);
207                    whitespace = false;
208                } else {
209                    if (dst.length() > 0 && whitespace) {
210                        dst.append(' ');
211                    }
212                    copyContent(buf, cursor, delimiters, dst);
213                    whitespace = false;
214                }
215            }
216            return dst.toString();
217        }
218    
219        /**
220         * Skips semantically insignificant whitespace characters and moves the cursor to the closest
221         * non-whitespace character.
222         *
223         * @param buf buffer with the sequence of bytes to be parsed
224         * @param cursor defines the bounds and current position of the buffer
225         */
226        public void skipWhiteSpace(final ByteSequence buf, final ParserCursor cursor) {
227            int pos = cursor.getPos();
228            int indexFrom = cursor.getPos();
229            int indexTo = cursor.getUpperBound();
230            for (int i = indexFrom; i < indexTo; i++) {
231                char current = (char) (buf.byteAt(i) & 0xff);
232                if (!CharsetUtil.isWhitespace(current)) {
233                    break;
234                } else {
235                    pos++;
236                }
237            }
238            cursor.updatePos(pos);
239        }
240    
241        /**
242         * Skips semantically insignificant content if the current position is positioned at the
243         * beginning of a comment and moves the cursor past the end of the comment.
244         * Nested comments and escaped characters are recognized and handled appropriately.
245         *
246         * @param buf buffer with the sequence of bytes to be parsed
247         * @param cursor defines the bounds and current position of the buffer
248         */
249        public void skipComment(final ByteSequence buf, final ParserCursor cursor) {
250            if (cursor.atEnd()) {
251                return;
252            }
253            int pos = cursor.getPos();
254            int indexFrom = cursor.getPos();
255            int indexTo = cursor.getUpperBound();
256            char current = (char) (buf.byteAt(pos) & 0xff);
257            if (current != '(') {
258                return;
259            }
260            pos++;
261            indexFrom++;
262    
263            int level = 1;
264            boolean escaped = false;
265            for (int i = indexFrom; i < indexTo; i++, pos++) {
266                current = (char) (buf.byteAt(i) & 0xff);
267                if (escaped) {
268                    escaped = false;
269                } else {
270                    if (current == '\\') {
271                        escaped = true;
272                    } else if (current == '(') {
273                        level++;
274                    } else if (current == ')') {
275                        level--;
276                    }
277                }
278                if (level <= 0) {
279                    pos++;
280                    break;
281                }
282            }
283            cursor.updatePos(pos);
284        }
285    
286        /**
287         * Skips semantically insignificant whitespace characters and comments and moves the cursor
288         * to the closest semantically significant non-whitespace character.
289         * Nested comments and escaped characters are recognized and handled appropriately.
290         *
291         * @param buf buffer with the sequence of bytes to be parsed
292         * @param cursor defines the bounds and current position of the buffer
293         */
294        public void skipAllWhiteSpace(final ByteSequence buf, final ParserCursor cursor) {
295            while (!cursor.atEnd()) {
296                char current = (char) (buf.byteAt(cursor.getPos()) & 0xff);
297                if (CharsetUtil.isWhitespace(current)) {
298                    skipWhiteSpace(buf, cursor);
299                } else if (current == '(') {
300                    skipComment(buf, cursor);
301                } else {
302                    break;
303                }
304            }
305        }
306    
307        /**
308         * Transfers content into the destination buffer until a whitespace character, a comment,
309         * or any of the given delimiters is encountered.
310         *
311         * @param buf buffer with the sequence of bytes to be parsed
312         * @param cursor defines the bounds and current position of the buffer
313         * @param delimiters set of delimiting characters. Can be <code>null</code> if the value
314         *  is delimited by a whitespace or a comment only.
315         * @param dst destination buffer
316         */
317        public void copyContent(final ByteSequence buf, final ParserCursor cursor, final BitSet delimiters,
318                final StringBuilder dst) {
319            int pos = cursor.getPos();
320            int indexFrom = cursor.getPos();
321            int indexTo = cursor.getUpperBound();
322            for (int i = indexFrom; i < indexTo; i++) {
323                char current = (char) (buf.byteAt(i) & 0xff);
324                if ((delimiters != null && delimiters.get(current))
325                        || CharsetUtil.isWhitespace(current) || current == '(') {
326                    break;
327                } else {
328                    pos++;
329                    dst.append(current);
330                }
331            }
332            cursor.updatePos(pos);
333        }
334    
335        /**
336         * Transfers content enclosed with quote marks into the destination buffer.
337         *
338         * @param buf buffer with the sequence of bytes to be parsed
339         * @param cursor defines the bounds and current position of the buffer
340         * @param dst destination buffer
341         */
342        public void copyQuotedContent(final ByteSequence buf, final ParserCursor cursor,
343                final StringBuilder dst) {
344            if (cursor.atEnd()) {
345                return;
346            }
347            int pos = cursor.getPos();
348            int indexFrom = cursor.getPos();
349            int indexTo = cursor.getUpperBound();
350            char current = (char) (buf.byteAt(pos) & 0xff);
351            if (current != '\"') {
352                return;
353            }
354            pos++;
355            indexFrom++;
356            boolean escaped = false;
357            for (int i = indexFrom; i < indexTo; i++, pos++) {
358                current = (char) (buf.byteAt(i) & 0xff);
359                if (escaped) {
360                    if (current != '\"' && current != '\\') {
361                        dst.append('\\');
362                    }
363                    dst.append(current);
364                    escaped = false;
365                } else {
366                    if (current == '\"') {
367                        pos++;
368                        break;
369                    }
370                    if (current == '\\') {
371                        escaped = true;
372                    } else if (current != '\r' && current != '\n') {
373                        dst.append(current);
374                    }
375                }
376            }
377            cursor.updatePos(pos);
378        }
379    
380    }