001    package org.apache.fulcrum.parser;
002    
003    
004    /*
005     * Licensed to the Apache Software Foundation (ASF) under one
006     * or more contributor license agreements.  See the NOTICE file
007     * distributed with this work for additional information
008     * regarding copyright ownership.  The ASF licenses this file
009     * to you under the Apache License, Version 2.0 (the
010     * "License"); you may not use this file except in compliance
011     * with the License.  You may obtain a copy of the License at
012     *
013     *   http://www.apache.org/licenses/LICENSE-2.0
014     *
015     * Unless required by applicable law or agreed to in writing,
016     * software distributed under the License is distributed on an
017     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
018     * KIND, either express or implied.  See the License for the
019     * specific language governing permissions and limitations
020     * under the License.
021     */
022    
023    
024    import java.io.BufferedReader;
025    import java.io.IOException;
026    import java.io.InputStreamReader;
027    import java.io.Reader;
028    import java.io.StreamTokenizer;
029    import java.util.ArrayList;
030    import java.util.Iterator;
031    import java.util.List;
032    import java.util.NoSuchElementException;
033    
034    import org.apache.avalon.framework.logger.LogEnabled;
035    import org.apache.avalon.framework.logger.Logger;
036    
037    /**
038     * DataStreamParser is used to parse a stream with a fixed format and
039     * generate ValueParser objects which can be used to extract the values
040     * in the desired type.
041     *
042     * <p>The class itself is abstract - a concrete subclass which implements
043     * the initTokenizer method such as CSVParser or TSVParser is required
044     * to use the functionality.
045     *
046     * <p>The class implements the java.util.Iterator interface for convenience.
047     * This allows simple use in a Velocity template for example:
048     *
049     * <pre>
050     * #foreach ($row in $datastream)
051     *   Name: $row.Name
052     *   Description: $row.Description
053     * #end
054     * </pre>
055     *
056     * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
057     * @version $Id: DataStreamParser.java 732115 2009-01-06 20:54:04Z tv $
058     */
059    public abstract class DataStreamParser
060        implements Iterator, LogEnabled
061    {
062        /**
063         * The list of column names.
064         */
065        private List            columnNames;
066    
067        /**
068         * The stream tokenizer for reading values from the input reader.
069         */
070        private StreamTokenizer tokenizer;
071    
072        /**
073         * The parameter parser holding the values of columns for the current line.
074         */
075        private ValueParser     lineValues;
076    
077        /**
078         * Indicates whether or not the tokenizer has read anything yet.
079         */
080        private boolean         neverRead = true;
081    
082        /**
083         * The character encoding of the input
084         */
085        private String          characterEncoding;
086    
087        /**
088         * Logger to use
089         */
090        protected Logger log;
091    
092        /**
093         * Create a new DataStreamParser instance. Requires a Reader to read the
094         * comma-separated values from, a list of column names and a
095         * character encoding.
096         *
097         * @param in the input reader.
098         * @param columnNames a list of column names.
099         * @param characterEncoding the character encoding of the input.
100         */
101        public DataStreamParser(Reader in, List columnNames,
102                String characterEncoding)
103        {
104            this.columnNames = columnNames;
105            this.characterEncoding = characterEncoding;
106    
107            if (this.characterEncoding == null)
108            {
109                // try and get the characterEncoding from the reader
110                this.characterEncoding = "US-ASCII";
111                try
112                {
113                    this.characterEncoding = ((InputStreamReader)in).getEncoding();
114                }
115                catch (ClassCastException e)
116                {
117                    // ignore
118                }
119            }
120    
121            tokenizer = new StreamTokenizer(new BufferedReader(in));
122            initTokenizer(tokenizer);
123        }
124    
125        /**
126         * Initialize the StreamTokenizer instance used to read the lines
127         * from the input reader. This must be implemented in subclasses to
128         * set up the tokenizing properties.
129         */
130        protected abstract void initTokenizer(StreamTokenizer tokenizer);
131    
132        /**
133         * Provide a logger
134         * 
135         * @see org.apache.avalon.framework.logger.LogEnabled#enableLogging(org.apache.avalon.framework.logger.Logger)
136         */
137        public void enableLogging(Logger logger)
138        {
139            this.log = logger.getChildLogger("DataStreamParser");
140        }
141    
142        /**
143         * Set the list of column names explicitly.
144         *
145         * @param columnNames A list of column names.
146         */
147        public void setColumnNames(List columnNames)
148        {
149            this.columnNames = columnNames;
150        }
151    
152        /**
153         * Read the list of column names from the input reader using the
154         * tokenizer.
155         *
156         * @exception IOException an IOException occurred.
157         */
158        public void readColumnNames()
159            throws IOException
160        {
161            columnNames = new ArrayList();
162    
163            neverRead = false;
164            tokenizer.nextToken();
165            while (tokenizer.ttype == StreamTokenizer.TT_WORD
166                   || tokenizer.ttype == '"')
167            {
168                columnNames.add(tokenizer.sval);
169                tokenizer.nextToken();
170            }
171        }
172    
173        /**
174         * Determine whether a further row of values exists in the input.
175         *
176         * @return true if the input has more rows.
177         * @exception IOException an IOException occurred.
178         */
179        public boolean hasNextRow()
180            throws IOException
181        {
182            // check for end of line ensures that an empty last line doesn't
183            // give a false positive for hasNextRow
184            if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
185            {
186                tokenizer.nextToken();
187                tokenizer.pushBack();
188                neverRead = false;
189            }
190            return tokenizer.ttype != StreamTokenizer.TT_EOF;
191        }
192    
193        /**
194         * Returns a ValueParser object containing the next row of values.
195         *
196         * @return a ValueParser object.
197         * @exception IOException an IOException occurred.
198         * @exception NoSuchElementException there are no more rows in the input.
199         */
200        public ValueParser nextRow()
201            throws IOException, NoSuchElementException
202        {
203            if (!hasNextRow())
204            {
205                throw new NoSuchElementException();
206            }
207    
208            if (lineValues == null)
209            {
210                lineValues = new BaseValueParser(characterEncoding);
211            }
212            else
213            {
214                lineValues.clear();
215            }
216    
217            Iterator it = columnNames.iterator();
218            tokenizer.nextToken();
219            while (tokenizer.ttype == StreamTokenizer.TT_WORD
220                   || tokenizer.ttype == '"')
221            {
222                // note this means that if there are more values than
223                // column names, the extra values are discarded.
224                if (it.hasNext())
225                {
226                    String colname = it.next().toString();
227                    String colval  = tokenizer.sval;
228                    if (log.isDebugEnabled())
229                    {
230                        log.debug("DataStreamParser.nextRow(): " +
231                                  colname + '=' + colval);
232                    }
233                    lineValues.add(colname, colval);
234                }
235                tokenizer.nextToken();
236            }
237    
238            return lineValues;
239        }
240    
241        /**
242         * Determine whether a further row of values exists in the input.
243         *
244         * @return true if the input has more rows.
245         */
246        public boolean hasNext()
247        {
248            boolean hasNext = false;
249    
250            try
251            {
252                hasNext = hasNextRow();
253            }
254            catch (IOException e)
255            {
256                log.error("IOException in CSVParser.hasNext", e);
257            }
258    
259            return hasNext;
260        }
261    
262        /**
263         * Returns a ValueParser object containing the next row of values.
264         *
265         * @return a ValueParser object as an Object.
266         * @exception NoSuchElementException there are no more rows in the input
267         *                                   or an IOException occurred.
268         */
269        public Object next()
270            throws NoSuchElementException
271        {
272            Object nextRow = null;
273    
274            try
275            {
276                nextRow = nextRow();
277            }
278            catch (IOException e)
279            {
280                log.error("IOException in CSVParser.next", e);
281                throw new NoSuchElementException();
282            }
283    
284            return nextRow;
285        }
286    
287        /**
288         * The optional Iterator.remove method is not supported.
289         *
290         * @exception UnsupportedOperationException the operation is not supported.
291         */
292        public void remove()
293            throws UnsupportedOperationException
294        {
295            throw new UnsupportedOperationException();
296        }
297    }