View Javadoc

1   package org.apache.turbine.util.parser;
2   
3   /*
4    * Copyright 2001-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License")
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  import java.io.BufferedReader;
20  import java.io.IOException;
21  import java.io.InputStreamReader;
22  import java.io.Reader;
23  import java.io.StreamTokenizer;
24  
25  import java.util.ArrayList;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.NoSuchElementException;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  
33  /***
34   * DataStreamParser is used to parse a stream with a fixed format and
35   * generate ValueParser objects which can be used to extract the values
36   * in the desired type.
37   *
38   * <p>The class itself is abstract - a concrete subclass which implements
39   * the initTokenizer method such as CSVParser or TSVParser is required
40   * to use the functionality.
41   *
42   * <p>The class implements the java.util.Iterator interface for convenience.
43   * This allows simple use in a Velocity template for example:
44   *
45   * <pre>
46   * #foreach ($row in $datastream)
47   *   Name: $row.Name
48   *   Description: $row.Description
49   * #end
50   * </pre>
51   *
52   * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
53   * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
54   * @version $Id: DataStreamParser.java,v 1.1.2.3 2004/08/14 20:11:43 henning Exp $
55   */
56  public abstract class DataStreamParser implements Iterator
57  {
58      /*** Logging */
59      private static Log log = LogFactory.getLog(DataStreamParser.class);
60  
61      /***
62       * Conditional compilation flag.
63       */
64      private static final boolean DEBUG = false;
65  
66      /***
67       * The constant for empty fields
68       */
69      protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";
70  
71      /***
72       * The list of column names.
73       */
74      private List columnNames;
75  
76      /***
77       * The stream tokenizer for reading values from the input reader.
78       */
79      private StreamTokenizer tokenizer;
80  
81      /***
82       * The parameter parser holding the values of columns for the current line.
83       */
84      private ValueParser lineValues;
85  
86      /***
87       * Indicates whether or not the tokenizer has read anything yet.
88       */
89      private boolean neverRead = true;
90  
91      /***
92       * The character encoding of the input
93       */
94      private String characterEncoding;
95  
96      /***
97       * The fieldseperator, which can be almost any char
98       */
99      private char fieldSeparator;
100 
101     /***
102      * Create a new DataStreamParser instance. Requires a Reader to read the
103      * comma-separated values from, a list of column names and a
104      * character encoding.
105      *
106      * @param in the input reader.
107      * @param columnNames a list of column names.
108      * @param characterEncoding the character encoding of the input.
109      */
110     public DataStreamParser(Reader in, List columnNames,
111                             String characterEncoding)
112     {
113         this.columnNames = columnNames;
114         this.characterEncoding = characterEncoding;
115 
116         if (this.characterEncoding == null)
117         {
118             // try and get the characterEncoding from the reader
119             this.characterEncoding = "US-ASCII";
120             try
121             {
122                 this.characterEncoding = ((InputStreamReader) in).getEncoding();
123             }
124             catch (ClassCastException e)
125             {
126             }
127         }
128 
129         tokenizer = new StreamTokenizer(new BufferedReader(in));
130         initTokenizer(tokenizer);
131     }
132 
133     /***
134      * Initialize the StreamTokenizer instance used to read the lines
135      * from the input reader. This must be implemented in subclasses to
136      * set up other tokenizing properties.
137      *
138      * @param tokenizer the tokenizer to adjust
139      */
140     protected void initTokenizer(StreamTokenizer tokenizer)
141     {
142         // set all numeric characters as ordinary characters
143         // (switches off number parsing)
144         tokenizer.ordinaryChars('0', '9');
145         tokenizer.ordinaryChars('-', '-');
146         tokenizer.ordinaryChars('.', '.');
147 
148         // leave out the comma sign (,), we need it for empty fields
149 
150         tokenizer.wordChars(' ', Integer.MAX_VALUE);
151 
152         // and  set the quote mark as the quoting character
153         tokenizer.quoteChar('"');
154 
155         // and finally say that end of line is significant
156         tokenizer.eolIsSignificant(true);
157     }
158 
159     /***
160      * This method must be called to setup the field seperator
161      * @param fieldSeparator the char which separates the fields
162      */
163     public void setFieldSeparator(char fieldSeparator)
164     {
165         this.fieldSeparator = fieldSeparator;
166         // make this field also an ordinary char by default.
167         tokenizer.ordinaryChar(fieldSeparator);
168     }
169 
170     /***
171      * Set the list of column names explicitly.
172      *
173      * @param columnNames A list of column names.
174      */
175     public void setColumnNames(List columnNames)
176     {
177         this.columnNames = columnNames;
178     }
179 
180     /***
181      * Read the list of column names from the input reader using the
182      * tokenizer. If fieldNames are empty, we use the current fieldNumber
183      * + the EMPTYFIELDNAME to make one up.
184      *
185      * @exception IOException an IOException occurred.
186      */
187     public void readColumnNames()
188             throws IOException
189     {
190         columnNames = new ArrayList();
191         int lastTtype = 0;
192         int fieldCounter = 1;
193 
194         neverRead = false;
195         tokenizer.nextToken();
196         while (tokenizer.ttype == StreamTokenizer.TT_WORD || tokenizer.ttype == StreamTokenizer.TT_EOL
197                 || tokenizer.ttype == '"' || tokenizer.ttype == fieldSeparator)
198         {
199             if (tokenizer.ttype != fieldSeparator && tokenizer.ttype != StreamTokenizer.TT_EOL)
200             {
201                 columnNames.add(tokenizer.sval);
202                 fieldCounter++;
203             }
204             else if (tokenizer.ttype == fieldSeparator && lastTtype == fieldSeparator)
205             {
206                 // we have an empty field name
207                 columnNames.add(EMPTYFIELDNAME + fieldCounter);
208                 fieldCounter++;
209             }
210             else if (lastTtype == fieldSeparator && tokenizer.ttype == StreamTokenizer.TT_EOL)
211             {
212                 columnNames.add(EMPTYFIELDNAME + fieldCounter);
213                 break;
214             }
215             else if (tokenizer.ttype == StreamTokenizer.TT_EOL)
216             {
217                 break;
218             }
219             lastTtype = tokenizer.ttype;
220             tokenizer.nextToken();
221         }
222     }
223 
224     /***
225      * Determine whether a further row of values exists in the input.
226      *
227      * @return true if the input has more rows.
228      * @exception IOException an IOException occurred.
229      */
230     public boolean hasNextRow()
231             throws IOException
232     {
233         // check for end of line ensures that an empty last line doesn't
234         // give a false positive for hasNextRow
235         if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
236         {
237             tokenizer.nextToken();
238             tokenizer.pushBack();
239             neverRead = false;
240         }
241         return tokenizer.ttype != StreamTokenizer.TT_EOF;
242     }
243 
244     /***
245      * Returns a ValueParser object containing the next row of values.
246      *
247      * @return a ValueParser object.
248      * @exception IOException an IOException occurred.
249      * @exception NoSuchElementException there are no more rows in the input.
250      */
251     public ValueParser nextRow()
252             throws IOException, NoSuchElementException
253     {
254         if (!hasNextRow())
255         {
256             throw new NoSuchElementException();
257         }
258 
259         if (lineValues == null)
260         {
261             lineValues = new BaseValueParser(characterEncoding);
262         }
263         else
264         {
265             lineValues.clear();
266         }
267 
268         Iterator it = columnNames.iterator();
269         tokenizer.nextToken();
270         while (tokenizer.ttype == StreamTokenizer.TT_WORD
271                 || tokenizer.ttype == '"' || tokenizer.ttype == fieldSeparator)
272         {
273             int lastTtype = 0;
274             // note this means that if there are more values than
275             // column names, the extra values are discarded.
276             if (it.hasNext())
277             {
278                 String colname = it.next().toString();
279                 String colval = tokenizer.sval;
280                 if (tokenizer.ttype != fieldSeparator && lastTtype != fieldSeparator)
281                 {
282                     if (DEBUG)
283                     {
284                         log.debug("DataStreamParser.nextRow(): " +
285                                 colname + "=" + colval);
286                     }
287                     lineValues.add(colname, colval);
288                 }
289                 else if (tokenizer.ttype == fieldSeparator && lastTtype != fieldSeparator)
290                 {
291                     lastTtype = tokenizer.ttype;
292                     tokenizer.nextToken();
293                     if (tokenizer.ttype != fieldSeparator && tokenizer.sval != null)
294                     {
295                         lineValues.add(colname, tokenizer.sval);
296                     }
297                     else if (tokenizer.ttype == StreamTokenizer.TT_EOL)
298                     {
299                         tokenizer.pushBack();
300                     }
301                 }
302             }
303             tokenizer.nextToken();
304         }
305 
306         return lineValues;
307     }
308 
309     /***
310      * Determine whether a further row of values exists in the input.
311      *
312      * @return true if the input has more rows.
313      */
314     public boolean hasNext()
315     {
316         boolean hasNext = false;
317 
318         try
319         {
320             hasNext = hasNextRow();
321         }
322         catch (IOException e)
323         {
324             log.error("IOException in CSVParser.hasNext", e);
325         }
326 
327         return hasNext;
328     }
329 
330     /***
331      * Returns a ValueParser object containing the next row of values.
332      *
333      * @return a ValueParser object as an Object.
334      * @exception NoSuchElementException there are no more rows in the input
335      *                                   or an IOException occurred.
336      */
337     public Object next()
338             throws NoSuchElementException
339     {
340         Object nextRow = null;
341 
342         try
343         {
344             nextRow = nextRow();
345         }
346         catch (IOException e)
347         {
348             log.error("IOException in CSVParser.next", e);
349             throw new NoSuchElementException();
350         }
351 
352         return nextRow;
353     }
354 
355     /***
356      * The optional Iterator.remove method is not supported.
357      *
358      * @exception UnsupportedOperationException the operation is not supported.
359      */
360     public void remove()
361             throws UnsupportedOperationException
362     {
363         throw new UnsupportedOperationException();
364     }
365 }