1 package org.apache.turbine.util.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 import java.io.BufferedReader;
20 import java.io.IOException;
21 import java.io.InputStreamReader;
22 import java.io.Reader;
23 import java.io.StreamTokenizer;
24
25 import java.util.ArrayList;
26 import java.util.Iterator;
27 import java.util.List;
28 import java.util.NoSuchElementException;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32
33 /***
34 * DataStreamParser is used to parse a stream with a fixed format and
35 * generate ValueParser objects which can be used to extract the values
36 * in the desired type.
37 *
38 * <p>The class itself is abstract - a concrete subclass which implements
39 * the initTokenizer method such as CSVParser or TSVParser is required
40 * to use the functionality.
41 *
42 * <p>The class implements the java.util.Iterator interface for convenience.
43 * This allows simple use in a Velocity template for example:
44 *
45 * <pre>
46 * #foreach ($row in $datastream)
47 * Name: $row.Name
48 * Description: $row.Description
49 * #end
50 * </pre>
51 *
52 * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
53 * @author <a href="mailto:martin@mvdb.net">Martin van den Bemt</a>
54 * @version $Id: DataStreamParser.java,v 1.1.2.3 2004/08/14 20:11:43 henning Exp $
55 */
56 public abstract class DataStreamParser implements Iterator
57 {
58 /*** Logging */
59 private static Log log = LogFactory.getLog(DataStreamParser.class);
60
61 /***
62 * Conditional compilation flag.
63 */
64 private static final boolean DEBUG = false;
65
66 /***
67 * The constant for empty fields
68 */
69 protected static final String EMPTYFIELDNAME = "UNKNOWNFIELD";
70
71 /***
72 * The list of column names.
73 */
74 private List columnNames;
75
76 /***
77 * The stream tokenizer for reading values from the input reader.
78 */
79 private StreamTokenizer tokenizer;
80
81 /***
82 * The parameter parser holding the values of columns for the current line.
83 */
84 private ValueParser lineValues;
85
86 /***
87 * Indicates whether or not the tokenizer has read anything yet.
88 */
89 private boolean neverRead = true;
90
91 /***
92 * The character encoding of the input
93 */
94 private String characterEncoding;
95
96 /***
97 * The fieldseperator, which can be almost any char
98 */
99 private char fieldSeparator;
100
101 /***
102 * Create a new DataStreamParser instance. Requires a Reader to read the
103 * comma-separated values from, a list of column names and a
104 * character encoding.
105 *
106 * @param in the input reader.
107 * @param columnNames a list of column names.
108 * @param characterEncoding the character encoding of the input.
109 */
110 public DataStreamParser(Reader in, List columnNames,
111 String characterEncoding)
112 {
113 this.columnNames = columnNames;
114 this.characterEncoding = characterEncoding;
115
116 if (this.characterEncoding == null)
117 {
118
119 this.characterEncoding = "US-ASCII";
120 try
121 {
122 this.characterEncoding = ((InputStreamReader) in).getEncoding();
123 }
124 catch (ClassCastException e)
125 {
126 }
127 }
128
129 tokenizer = new StreamTokenizer(new BufferedReader(in));
130 initTokenizer(tokenizer);
131 }
132
133 /***
134 * Initialize the StreamTokenizer instance used to read the lines
135 * from the input reader. This must be implemented in subclasses to
136 * set up other tokenizing properties.
137 *
138 * @param tokenizer the tokenizer to adjust
139 */
140 protected void initTokenizer(StreamTokenizer tokenizer)
141 {
142
143
144 tokenizer.ordinaryChars('0', '9');
145 tokenizer.ordinaryChars('-', '-');
146 tokenizer.ordinaryChars('.', '.');
147
148
149
150 tokenizer.wordChars(' ', Integer.MAX_VALUE);
151
152
153 tokenizer.quoteChar('"');
154
155
156 tokenizer.eolIsSignificant(true);
157 }
158
159 /***
160 * This method must be called to setup the field seperator
161 * @param fieldSeparator the char which separates the fields
162 */
163 public void setFieldSeparator(char fieldSeparator)
164 {
165 this.fieldSeparator = fieldSeparator;
166
167 tokenizer.ordinaryChar(fieldSeparator);
168 }
169
170 /***
171 * Set the list of column names explicitly.
172 *
173 * @param columnNames A list of column names.
174 */
175 public void setColumnNames(List columnNames)
176 {
177 this.columnNames = columnNames;
178 }
179
180 /***
181 * Read the list of column names from the input reader using the
182 * tokenizer. If fieldNames are empty, we use the current fieldNumber
183 * + the EMPTYFIELDNAME to make one up.
184 *
185 * @exception IOException an IOException occurred.
186 */
187 public void readColumnNames()
188 throws IOException
189 {
190 columnNames = new ArrayList();
191 int lastTtype = 0;
192 int fieldCounter = 1;
193
194 neverRead = false;
195 tokenizer.nextToken();
196 while (tokenizer.ttype == StreamTokenizer.TT_WORD || tokenizer.ttype == StreamTokenizer.TT_EOL
197 || tokenizer.ttype == '"' || tokenizer.ttype == fieldSeparator)
198 {
199 if (tokenizer.ttype != fieldSeparator && tokenizer.ttype != StreamTokenizer.TT_EOL)
200 {
201 columnNames.add(tokenizer.sval);
202 fieldCounter++;
203 }
204 else if (tokenizer.ttype == fieldSeparator && lastTtype == fieldSeparator)
205 {
206
207 columnNames.add(EMPTYFIELDNAME + fieldCounter);
208 fieldCounter++;
209 }
210 else if (lastTtype == fieldSeparator && tokenizer.ttype == StreamTokenizer.TT_EOL)
211 {
212 columnNames.add(EMPTYFIELDNAME + fieldCounter);
213 break;
214 }
215 else if (tokenizer.ttype == StreamTokenizer.TT_EOL)
216 {
217 break;
218 }
219 lastTtype = tokenizer.ttype;
220 tokenizer.nextToken();
221 }
222 }
223
224 /***
225 * Determine whether a further row of values exists in the input.
226 *
227 * @return true if the input has more rows.
228 * @exception IOException an IOException occurred.
229 */
230 public boolean hasNextRow()
231 throws IOException
232 {
233
234
235 if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
236 {
237 tokenizer.nextToken();
238 tokenizer.pushBack();
239 neverRead = false;
240 }
241 return tokenizer.ttype != StreamTokenizer.TT_EOF;
242 }
243
244 /***
245 * Returns a ValueParser object containing the next row of values.
246 *
247 * @return a ValueParser object.
248 * @exception IOException an IOException occurred.
249 * @exception NoSuchElementException there are no more rows in the input.
250 */
251 public ValueParser nextRow()
252 throws IOException, NoSuchElementException
253 {
254 if (!hasNextRow())
255 {
256 throw new NoSuchElementException();
257 }
258
259 if (lineValues == null)
260 {
261 lineValues = new BaseValueParser(characterEncoding);
262 }
263 else
264 {
265 lineValues.clear();
266 }
267
268 Iterator it = columnNames.iterator();
269 tokenizer.nextToken();
270 while (tokenizer.ttype == StreamTokenizer.TT_WORD
271 || tokenizer.ttype == '"' || tokenizer.ttype == fieldSeparator)
272 {
273 int lastTtype = 0;
274
275
276 if (it.hasNext())
277 {
278 String colname = it.next().toString();
279 String colval = tokenizer.sval;
280 if (tokenizer.ttype != fieldSeparator && lastTtype != fieldSeparator)
281 {
282 if (DEBUG)
283 {
284 log.debug("DataStreamParser.nextRow(): " +
285 colname + "=" + colval);
286 }
287 lineValues.add(colname, colval);
288 }
289 else if (tokenizer.ttype == fieldSeparator && lastTtype != fieldSeparator)
290 {
291 lastTtype = tokenizer.ttype;
292 tokenizer.nextToken();
293 if (tokenizer.ttype != fieldSeparator && tokenizer.sval != null)
294 {
295 lineValues.add(colname, tokenizer.sval);
296 }
297 else if (tokenizer.ttype == StreamTokenizer.TT_EOL)
298 {
299 tokenizer.pushBack();
300 }
301 }
302 }
303 tokenizer.nextToken();
304 }
305
306 return lineValues;
307 }
308
309 /***
310 * Determine whether a further row of values exists in the input.
311 *
312 * @return true if the input has more rows.
313 */
314 public boolean hasNext()
315 {
316 boolean hasNext = false;
317
318 try
319 {
320 hasNext = hasNextRow();
321 }
322 catch (IOException e)
323 {
324 log.error("IOException in CSVParser.hasNext", e);
325 }
326
327 return hasNext;
328 }
329
330 /***
331 * Returns a ValueParser object containing the next row of values.
332 *
333 * @return a ValueParser object as an Object.
334 * @exception NoSuchElementException there are no more rows in the input
335 * or an IOException occurred.
336 */
337 public Object next()
338 throws NoSuchElementException
339 {
340 Object nextRow = null;
341
342 try
343 {
344 nextRow = nextRow();
345 }
346 catch (IOException e)
347 {
348 log.error("IOException in CSVParser.next", e);
349 throw new NoSuchElementException();
350 }
351
352 return nextRow;
353 }
354
355 /***
356 * The optional Iterator.remove method is not supported.
357 *
358 * @exception UnsupportedOperationException the operation is not supported.
359 */
360 public void remove()
361 throws UnsupportedOperationException
362 {
363 throw new UnsupportedOperationException();
364 }
365 }