001    /****************************************************************
002     * Licensed to the Apache Software Foundation (ASF) under one   *
003     * or more contributor license agreements.  See the NOTICE file *
004     * distributed with this work for additional information        *
005     * regarding copyright ownership.  The ASF licenses this file   *
006     * to you under the Apache License, Version 2.0 (the            *
007     * "License"); you may not use this file except in compliance   *
008     * with the License.  You may obtain a copy of the License at   *
009     *                                                              *
010     *   http://www.apache.org/licenses/LICENSE-2.0                 *
011     *                                                              *
012     * Unless required by applicable law or agreed to in writing,   *
013     * software distributed under the License is distributed on an  *
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
015     * KIND, either express or implied.  See the License for the    *
016     * specific language governing permissions and limitations      *
017     * under the License.                                           *
018     ****************************************************************/
019    
020    package org.apache.james.mime4j.stream;
021    
022    import java.io.IOException;
023    import java.io.InputStream;
024    import java.io.InputStreamReader;
025    import java.io.Reader;
026    import java.nio.charset.Charset;
027    import java.util.LinkedList;
028    
029    import org.apache.james.mime4j.MimeException;
030    import org.apache.james.mime4j.codec.DecodeMonitor;
031    import org.apache.james.mime4j.io.LineNumberInputStream;
032    import org.apache.james.mime4j.io.LineNumberSource;
033    import org.apache.james.mime4j.util.CharsetUtil;
034    
035    /**
036     * <p>
037     * Parses MIME (or RFC822) message streams of bytes or characters.
038     * The stream is converted into an event stream.
039     * <p>
040     * <p>
041     * Typical usage:
042     * </p>
043     * <pre>
044     *      MimeTokenStream stream = new MimeTokenStream();
045     *      InputStream instream = new FileInputStream("mime.msg");
046     *      try {
047     *          stream.parse(instream);
048     *          for (int state = stream.getState();
049     *              state != MimeTokenStream.T_END_OF_STREAM;
050     *              state = stream.next()) {
051     *              switch (state) {
052     *              case MimeTokenStream.T_BODY:
053     *                  System.out.println("Body detected, contents = "
054     *                  + stream.getInputStream() + ", header data = "
055     *                  + stream.getBodyDescriptor());
056     *                  break;
057     *              case MimeTokenStream.T_FIELD:
058     *                  System.out.println("Header field detected: "
059     *                  + stream.getField());
060     *                  break;
061     *              case MimeTokenStream.T_START_MULTIPART:
062     *                  System.out.println("Multipart message detexted,"
063     *                  + " header data = "
064     *                  + stream.getBodyDescriptor());
065     *              ...
066     *              }
067     *          }
068     *      } finally {
069     *          instream.close();
070     *      }
071     * </pre>
072     * <p>Instances of {@link MimeTokenStream} are reusable: Invoking the
073     * method {@link #parse(InputStream)} resets the token streams internal
074     * state. However, they are definitely <em>not</em> thread safe. If you
075     * have a multi threaded application, then the suggested use is to have
076     * one instance per thread.</p>
077     */
078    public class MimeTokenStream {
079    
080        private final MimeConfig config;
081        private final DecodeMonitor monitor;
082        private final FieldBuilder fieldBuilder;
083        private final BodyDescriptorBuilder bodyDescBuilder;
084        private final LinkedList<EntityStateMachine> entities = new LinkedList<EntityStateMachine>();
085    
086        private EntityState state = EntityState.T_END_OF_STREAM;
087        private EntityStateMachine currentStateMachine;
088        private RecursionMode recursionMode = RecursionMode.M_RECURSE;
089        private MimeEntity rootentity;
090    
091        /**
092         * Constructs a standard (lax) stream.
093         * Optional validation events will be logged only.
094         * Use {@link MimeConfig#setStrictParsing(boolean)} to turn on strict
095         * parsing mode and pass the config object to
096         * {@link MimeTokenStream#MimeTokenStream(MimeConfig)} to create
097         * a stream that strictly validates the input.
098         */
099        public MimeTokenStream() {
100            this(null);
101        }
102    
103        public MimeTokenStream(final MimeConfig config) {
104            this(config, null, null, null);
105        }
106    
107        public MimeTokenStream(
108                final MimeConfig config,
109                final BodyDescriptorBuilder bodyDescBuilder) {
110            this(config, null, null, bodyDescBuilder);
111        }
112    
113        public MimeTokenStream(
114                final MimeConfig config,
115                final DecodeMonitor monitor,
116                final BodyDescriptorBuilder bodyDescBuilder) {
117            this(config, monitor, null, bodyDescBuilder);
118        }
119    
120        public MimeTokenStream(
121                final MimeConfig config,
122                final DecodeMonitor monitor,
123                final FieldBuilder fieldBuilder,
124                final BodyDescriptorBuilder bodyDescBuilder) {
125            super();
126            this.config = config != null ? config : new MimeConfig();
127            this.fieldBuilder = fieldBuilder != null ? fieldBuilder :
128                new DefaultFieldBuilder(this.config.getMaxHeaderLen());
129            this.monitor = monitor != null ? monitor :
130                (this.config.isStrictParsing() ? DecodeMonitor.STRICT : DecodeMonitor.SILENT);
131            this.bodyDescBuilder = bodyDescBuilder != null ? bodyDescBuilder :
132                new FallbackBodyDescriptorBuilder();
133        }
134    
135        /** Instructs the {@code MimeTokenStream} to parse the given streams contents.
136         * If the {@code MimeTokenStream} has already been in use, resets the streams
137         * internal state.
138         */
139        public void parse(InputStream stream) {
140            doParse(stream, EntityState.T_START_MESSAGE);
141        }
142    
143        /**
144         * <p>Instructs the {@code MimeTokenStream} to parse the given content with
145         * the content type. The message stream is assumed to have no message header
146         * and is expected to begin with a message body. This can be the case when
147         * the message content is transmitted using a different transport protocol
148         * such as HTTP.</p>
149         * <p>If the {@code MimeTokenStream} has already been in use, resets the
150         * streams internal state.</p>
151         * @return a parsed Field representing the input contentType
152         */
153        public Field parseHeadless(InputStream stream, String contentType) {
154            if (contentType == null) {
155                throw new IllegalArgumentException("Content type may not be null");
156            }
157            Field newContentType;
158            try {
159                RawField rawContentType = new RawField("Content-Type", contentType);
160                newContentType = bodyDescBuilder.addField(rawContentType);
161                if (newContentType == null) newContentType = rawContentType;
162            } catch (MimeException ex) {
163                // should never happen
164                throw new IllegalArgumentException(ex.getMessage());
165            }
166    
167            doParse(stream, EntityState.T_END_HEADER);
168            try {
169                next();
170            } catch (IOException e) {
171                // Should never happend: the first next after END_HEADER does not produce IO
172                throw new IllegalStateException(e);
173            } catch (MimeException e) {
174                // This should never happen
175                throw new IllegalStateException(e);
176            }
177            return newContentType;
178        }
179    
180        private void doParse(InputStream stream, EntityState start) {
181            LineNumberSource lineSource = null;
182            if (config.isCountLineNumbers()) {
183                LineNumberInputStream lineInput = new LineNumberInputStream(stream);
184                lineSource = lineInput;
185                stream = lineInput;
186            }
187    
188            rootentity = new MimeEntity(
189                    lineSource,
190                    stream,
191                    config,
192                    start,
193                    EntityState.T_END_MESSAGE,
194                    monitor,
195                    fieldBuilder,
196                    bodyDescBuilder);
197    
198            rootentity.setRecursionMode(recursionMode);
199            currentStateMachine = rootentity;
200            entities.clear();
201            entities.add(currentStateMachine);
202            state = currentStateMachine.getState();
203        }
204    
205        /**
206         * Determines if this parser is currently in raw mode.
207         *
208         * @return <code>true</code> if in raw mode, <code>false</code>
209         *         otherwise.
210         * @see #setRecursionMode(RecursionMode)
211         */
212        public boolean isRaw() {
213            return recursionMode == RecursionMode.M_RAW;
214        }
215    
216        /**
217         * Gets the current recursion mode.
218         * The recursion mode specifies the approach taken to parsing parts.
219         * {@link RecursionMode#M_RAW}  mode does not parse the part at all.
220         * {@link RecursionMode#M_RECURSE} mode recursively parses each mail
221         * when an <code>message/rfc822</code> part is encountered;
222         * {@link RecursionMode#M_NO_RECURSE} does not.
223         * @return {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or
224         *   {@link RecursionMode#M_NO_RECURSE}
225         */
226        public RecursionMode getRecursionMode() {
227            return recursionMode;
228        }
229    
230        /**
231         * Sets the current recursion.
232         * The recursion mode specifies the approach taken to parsing parts.
233         * {@link RecursionMode#M_RAW}  mode does not parse the part at all.
234         * {@link RecursionMode#M_RECURSE} mode recursively parses each mail
235         * when an <code>message/rfc822</code> part is encountered;
236         * {@link RecursionMode#M_NO_RECURSE} does not.
237         * @param mode {@link RecursionMode#M_RECURSE}, {@link RecursionMode#M_RAW} or
238         *   {@link RecursionMode#M_NO_RECURSE}
239         */
240        public void setRecursionMode(RecursionMode mode) {
241            recursionMode = mode;
242            if (currentStateMachine != null) {
243                currentStateMachine.setRecursionMode(mode);
244            }
245        }
246    
247        /**
248         * Finishes the parsing and stops reading lines.
249         * NOTE: No more lines will be parsed but the parser
250         * will still trigger 'end' events to match previously
251         * triggered 'start' events.
252         */
253        public void stop() {
254            rootentity.stop();
255        }
256    
257        /**
258         * Returns the current state.
259         */
260        public EntityState getState() {
261            return state;
262        }
263    
264        /**
265         * This method returns the raw entity, preamble, or epilogue contents.
266         * <p/>
267         * This method is valid, if {@link #getState()} returns either of
268         * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or
269         * {@link EntityState#T_EPILOGUE}.
270         *
271         * @return Data stream, depending on the current state.
272         * @throws IllegalStateException {@link #getState()} returns an
273         *   invalid value.
274         */
275        public InputStream getInputStream() {
276            return currentStateMachine.getContentStream();
277        }
278    
279        /**
280         * This method returns a transfer decoded stream based on the MIME
281         * fields with the standard defaults.
282         * <p/>
283         * This method is valid, if {@link #getState()} returns either of
284         * {@link EntityState#T_RAW_ENTITY}, {@link EntityState#T_PREAMBLE}, or
285         * {@link EntityState#T_EPILOGUE}.
286         *
287         * @return Data stream, depending on the current state.
288         * @throws IllegalStateException {@link #getState()} returns an
289         *   invalid value.
290         */
291        public InputStream getDecodedInputStream() {
292            return currentStateMachine.getDecodedContentStream();
293        }
294    
295        /**
296         * Gets a reader configured for the current body or body part.
297         * The reader will return a transfer and charset decoded
298         * stream of characters based on the MIME fields with the standard
299         * defaults.
300         * This is a conveniance method and relies on {@link #getInputStream()}.
301         * Consult the javadoc for that method for known limitations.
302         *
303         * @return <code>Reader</code>, not null
304         * @see #getInputStream
305         * @throws IllegalStateException {@link #getState()} returns an
306         *   invalid value
307         * @throws UnsupportedCharsetException if there is no JVM support
308         * for decoding the charset
309         * @throws IllegalCharsetNameException if the charset name specified
310         * in the mime type is illegal
311         */
312        public Reader getReader() {
313            final BodyDescriptor bodyDescriptor = getBodyDescriptor();
314            final String mimeCharset = bodyDescriptor.getCharset();
315            final Charset charset;
316            if (mimeCharset == null || "".equals(mimeCharset)) {
317                charset = CharsetUtil.US_ASCII;
318            } else {
319                charset = Charset.forName(mimeCharset);
320            }
321            final InputStream instream = getDecodedInputStream();
322            return new InputStreamReader(instream, charset);
323        }
324    
325        /**
326         * <p>Gets a descriptor for the current entity.
327         * This method is valid if {@link #getState()} returns:</p>
328         * <ul>
329         * <li>{@link EntityState#T_BODY}</li>
330         * <li>{@link EntityState#T_START_MULTIPART}</li>
331         * <li>{@link EntityState#T_EPILOGUE}</li>
332         * <li>{@link EntityState#T_PREAMBLE}</li>
333         * </ul>
334         * @return <code>BodyDescriptor</code>, not nulls
335         */
336        public BodyDescriptor getBodyDescriptor() {
337            return currentStateMachine.getBodyDescriptor();
338        }
339    
340        /**
341         * This method is valid, if {@link #getState()} returns {@link EntityState#T_FIELD}.
342         * @return String with the fields raw contents.
343         * @throws IllegalStateException {@link #getState()} returns another
344         *   value than {@link EntityState#T_FIELD}.
345         */
346        public Field getField() {
347            return currentStateMachine.getField();
348        }
349    
350        /**
351         * This method advances the token stream to the next token.
352         * @throws IllegalStateException The method has been called, although
353         *   {@link #getState()} was already {@link EntityState#T_END_OF_STREAM}.
354         */
355        public EntityState next() throws IOException, MimeException {
356            if (state == EntityState.T_END_OF_STREAM  ||  currentStateMachine == null) {
357                throw new IllegalStateException("No more tokens are available.");
358            }
359            while (currentStateMachine != null) {
360                EntityStateMachine next = currentStateMachine.advance();
361                if (next != null) {
362                    entities.add(next);
363                    currentStateMachine = next;
364                }
365                state = currentStateMachine.getState();
366                if (state != EntityState.T_END_OF_STREAM) {
367                    return state;
368                }
369                entities.removeLast();
370                if (entities.isEmpty()) {
371                    currentStateMachine = null;
372                } else {
373                    currentStateMachine = entities.getLast();
374                    currentStateMachine.setRecursionMode(recursionMode);
375                }
376            }
377            state = EntityState.T_END_OF_STREAM;
378            return state;
379        }
380    
381        /**
382         * Renders a state as a string suitable for logging.
383         * @param state
384         * @return rendered as string, not null
385         */
386        public static final String stateToString(EntityState state) {
387            return MimeEntity.stateToString(state);
388        }
389    
390    
391        public MimeConfig getConfig() {
392            return config;
393        }
394    }