package org.apache.fulcrum.parser;


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import org.apache.avalon.framework.logger.LogEnabled;
import org.apache.avalon.framework.logger.Logger;

/**
 * DataStreamParser is used to parse a stream with a fixed format and
 * generate ValueParser objects which can be used to extract the values
 * in the desired type.
 *
 * <p>The class itself is abstract - a concrete subclass which implements
 * the initTokenizer method such as CSVParser or TSVParser is required
 * to use the functionality.
 *
 * <p>The class implements the java.util.Iterator interface for convenience.
 * This allows simple use in a Velocity template for example:
 *
 * <pre>
 * #foreach ($row in $datastream)
 *   Name: $row.Name
 *   Description: $row.Description
 * #end
 * </pre>
 *
 * <p>Column names must be available before rows are read: either pass
 * them to the constructor, set them with {@link #setColumnNames(List)},
 * or read them from the input with {@link #readColumnNames()}.
 *
 * <p>Logging is optional. If {@link #enableLogging(Logger)} has not been
 * called, the parser works without emitting any log output.
 *
 * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
 * @version $Id: DataStreamParser.java 732115 2009-01-06 20:54:04Z tv $
 */
public abstract class DataStreamParser
    implements Iterator, LogEnabled
{
    /**
     * Encoding assumed when none is given and none can be obtained
     * from the input reader.
     */
    private static final String DEFAULT_ENCODING = "US-ASCII";

    /**
     * The list of column names.
     */
    private List columnNames;

    /**
     * The stream tokenizer for reading values from the input reader.
     */
    private final StreamTokenizer tokenizer;

    /**
     * The parameter parser holding the values of columns for the current
     * line. The same instance is cleared and reused for every row.
     */
    private ValueParser lineValues;

    /**
     * Indicates whether or not the tokenizer has read anything yet.
     */
    private boolean neverRead = true;

    /**
     * The character encoding of the input
     */
    private final String characterEncoding;

    /**
     * Logger to use; remains <code>null</code> until
     * {@link #enableLogging(Logger)} is called.
     */
    protected Logger log;

    /**
     * Create a new DataStreamParser instance. Requires a Reader to read the
     * delimited values from, a list of column names and a
     * character encoding.
     *
     * <p>If no encoding is given, the encoding reported by the reader is
     * used when the reader is an {@link InputStreamReader}; otherwise (or
     * if the reader cannot report one) "US-ASCII" is assumed.
     *
     * <p>Note that {@link #initTokenizer(StreamTokenizer)} is invoked from
     * this constructor, i.e. before any subclass field initializers or
     * subclass constructor bodies have run.
     *
     * @param in the input reader.
     * @param columnNames a list of column names; may be <code>null</code>
     *        if the names are supplied later.
     * @param characterEncoding the character encoding of the input, or
     *        <code>null</code> to derive it as described above.
     */
    public DataStreamParser(Reader in, List columnNames,
                            String characterEncoding)
    {
        this.columnNames = columnNames;

        String encoding = characterEncoding;
        if (encoding == null && in instanceof InputStreamReader)
        {
            // getEncoding() returns null once the underlying stream has
            // been closed, so the default below is still needed.
            encoding = ((InputStreamReader) in).getEncoding();
        }
        if (encoding == null)
        {
            encoding = DEFAULT_ENCODING;
        }
        this.characterEncoding = encoding;

        tokenizer = new StreamTokenizer(new BufferedReader(in));
        initTokenizer(tokenizer);
    }

    /**
     * Initialize the StreamTokenizer instance used to read the lines
     * from the input reader. This must be implemented in subclasses to
     * set up the tokenizing properties.
     *
     * <p>This method is called from the constructor of this class, so
     * implementations must not rely on state of the subclass instance.
     *
     * @param tokenizer the tokenizer to configure.
     */
    protected abstract void initTokenizer(StreamTokenizer tokenizer);

    /**
     * Provide a logger. A child logger named "DataStreamParser" of the
     * given logger is used for all output of this class.
     *
     * @param logger the parent logger.
     * @see org.apache.avalon.framework.logger.LogEnabled#enableLogging(org.apache.avalon.framework.logger.Logger)
     */
    public void enableLogging(Logger logger)
    {
        this.log = logger.getChildLogger("DataStreamParser");
    }

    /**
     * Set the list of column names explicitly.
     *
     * @param columnNames A list of column names.
     */
    public void setColumnNames(List columnNames)
    {
        this.columnNames = columnNames;
    }

    /**
     * Read the list of column names from the input reader using the
     * tokenizer. Any previously set column names are replaced.
     *
     * <p>Tokens are consumed for as long as they are words or quoted
     * strings; the first token of any other type ends the list.
     *
     * @exception IOException an IOException occurred.
     */
    public void readColumnNames()
        throws IOException
    {
        columnNames = new ArrayList();

        neverRead = false;
        tokenizer.nextToken();
        while (isValueToken())
        {
            columnNames.add(tokenizer.sval);
            tokenizer.nextToken();
        }
    }

    /**
     * Determine whether a further row of values exists in the input.
     *
     * @return true if the input has more rows.
     * @exception IOException an IOException occurred.
     */
    public boolean hasNextRow()
        throws IOException
    {
        // check for end of line ensures that an empty last line doesn't
        // give a false positive for hasNextRow
        if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
        {
            // Peek at the next token: pushBack() makes the following
            // nextToken() call return it again, while ttype keeps the
            // peeked value for the end-of-file test below.
            tokenizer.nextToken();
            tokenizer.pushBack();
            neverRead = false;
        }
        return tokenizer.ttype != StreamTokenizer.TT_EOF;
    }

    /**
     * Returns a ValueParser object containing the next row of values.
     *
     * <p>The returned object is reused: each call clears and refills the
     * same instance, so callers must not hold on to it across rows.
     * If a row has more values than there are column names, the extra
     * values are discarded.
     *
     * @return a ValueParser object.
     * @exception IOException an IOException occurred.
     * @exception NoSuchElementException there are no more rows in the input.
     * @exception IllegalStateException no column names have been set or read.
     */
    public ValueParser nextRow()
        throws IOException, NoSuchElementException
    {
        if (!hasNextRow())
        {
            throw new NoSuchElementException();
        }

        if (columnNames == null)
        {
            // fail with a meaningful message instead of a bare
            // NullPointerException further down
            throw new IllegalStateException(
                "No column names available; call setColumnNames() or "
                + "readColumnNames() before reading rows");
        }

        if (lineValues == null)
        {
            lineValues = new BaseValueParser(characterEncoding);
        }
        else
        {
            lineValues.clear();
        }

        Iterator it = columnNames.iterator();
        tokenizer.nextToken();
        while (isValueToken())
        {
            // note this means that if there are more values than
            // column names, the extra values are discarded.
            if (it.hasNext())
            {
                String colname = it.next().toString();
                String colval  = tokenizer.sval;
                // the logger is optional, see enableLogging()
                if (log != null && log.isDebugEnabled())
                {
                    log.debug("DataStreamParser.nextRow(): " +
                              colname + '=' + colval);
                }
                lineValues.add(colname, colval);
            }
            tokenizer.nextToken();
        }

        return lineValues;
    }

    /**
     * Determine whether a further row of values exists in the input.
     *
     * <p>Because the Iterator interface does not allow checked
     * exceptions, an IOException is logged (if a logger is available)
     * and reported as "no more rows".
     *
     * @return true if the input has more rows.
     */
    public boolean hasNext()
    {
        boolean hasNext = false;

        try
        {
            hasNext = hasNextRow();
        }
        catch (IOException e)
        {
            if (log != null)
            {
                log.error("IOException in DataStreamParser.hasNext", e);
            }
        }

        return hasNext;
    }

    /**
     * Returns a ValueParser object containing the next row of values.
     *
     * @return a ValueParser object as an Object.
     * @exception NoSuchElementException there are no more rows in the input
     *            or an IOException occurred; in the latter case the
     *            IOException is attached as the cause.
     */
    public Object next()
        throws NoSuchElementException
    {
        try
        {
            return nextRow();
        }
        catch (IOException e)
        {
            if (log != null)
            {
                log.error("IOException in DataStreamParser.next", e);
            }
            // NoSuchElementException has no cause-taking constructor in
            // older Java versions, so attach the cause explicitly.
            NoSuchElementException failure =
                new NoSuchElementException(e.getMessage());
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * The optional Iterator.remove method is not supported.
     *
     * @exception UnsupportedOperationException the operation is not supported.
     */
    public void remove()
        throws UnsupportedOperationException
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Tell whether the tokenizer's current token carries a column value,
     * i.e. is a word or a double-quoted string.
     *
     * @return true if the current token is a value token.
     */
    private boolean isValueToken()
    {
        return tokenizer.ttype == StreamTokenizer.TT_WORD
            || tokenizer.ttype == '"';
    }
}