View Javadoc
1   /*
2    * Copyright 2007 Kasper B. Graversen
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.supercsv.io;
17  
18  import java.io.IOException;
19  import java.io.Reader;
20  import java.util.List;
21  
22  import org.supercsv.comment.CommentMatcher;
23  import org.supercsv.exception.SuperCsvException;
24  import org.supercsv.prefs.CsvPreference;
25  
26  /**
27   * Reads the CSV file, line by line. If you want the line-reading functionality of this class, but want to define your
28   * own implementation of {@link #readColumns(List)}, then consider writing your own Tokenizer by extending
29   * AbstractTokenizer.
30   * 
31   * @author Kasper B. Graversen
32   * @author James Bassett
33   */
34  public class Tokenizer extends AbstractTokenizer {
35  	
36  	private static final char NEWLINE = '\n';
37  	
38  	private static final char SPACE = ' ';
39  	
40  	private final StringBuilder currentColumn = new StringBuilder();
41  	
42  	/* the raw, untokenized CSV row (may span multiple lines) */
43  	private final StringBuilder currentRow = new StringBuilder();
44  	
45  	private final int quoteChar;
46  	
47  	private final int delimeterChar;
48  	
49  	private final boolean surroundingSpacesNeedQuotes;
50  	
51  	private final boolean ignoreEmptyLines;
52  	
53  	private final CommentMatcher commentMatcher;
54  
55  	private final int maxLinesPerRow;
56  	
57  	/**
58  	 * Enumeration of tokenizer states. QUOTE_MODE is activated between quotes.
59  	 */
60  	private enum TokenizerState {
61  		NORMAL, QUOTE_MODE;
62  	}
63  	
64  	/**
65  	 * Constructs a new <tt>Tokenizer</tt>, which reads the CSV file, line by line.
66  	 * 
67  	 * @param reader
68  	 *            the reader
69  	 * @param preferences
70  	 *            the CSV preferences
71  	 * @throws NullPointerException
72  	 *             if reader or preferences is null
73  	 */
74  	public Tokenizer(final Reader reader, final CsvPreference preferences) {
75  		super(reader, preferences);
76  		this.quoteChar = preferences.getQuoteChar();
77  		this.delimeterChar = preferences.getDelimiterChar();
78  		this.surroundingSpacesNeedQuotes = preferences.isSurroundingSpacesNeedQuotes();
79  		this.ignoreEmptyLines = preferences.isIgnoreEmptyLines();
80  		this.commentMatcher = preferences.getCommentMatcher();
81  		this.maxLinesPerRow = preferences.getMaxLinesPerRow();
82  	}
83  	
84  	/**
85  	 * {@inheritDoc}
86  	 */
87  	public boolean readColumns(final List<String> columns) throws IOException {
88  		
89  		if( columns == null ) {
90  			throw new NullPointerException("columns should not be null");
91  		}
92  		
93  		// clear the reusable List and StringBuilders
94  		columns.clear();
95  		currentColumn.setLength(0);
96  		currentRow.setLength(0);
97  		
98  		// read a line (ignoring empty lines/comments if necessary)
99  		String line;
100 		do {
101 			line = readLine();
102 			if( line == null ) {
103 				return false; // EOF
104 			}
105 		}
106 		while( ignoreEmptyLines && line.length() == 0 || (commentMatcher != null && commentMatcher.isComment(line)) );
107 		
108 		// update the untokenized CSV row
109 		currentRow.append(line);
110 		
111 		// process each character in the line, catering for surrounding quotes (QUOTE_MODE)
112 		TokenizerState state = TokenizerState.NORMAL;
113 		int quoteScopeStartingLine = -1; // the line number where a potential multi-line cell starts
114 		int potentialSpaces = 0; // keep track of spaces (so leading/trailing space can be removed if required)
115 		int charIndex = 0;
116 		while( true ) {
117 			boolean endOfLineReached = charIndex == line.length();
118 			
119 			if( endOfLineReached )
120 			{
121 				if( TokenizerState.NORMAL.equals(state) ) {
122 					/*
123 					 * Newline. Add any required spaces (if surrounding spaces don't need quotes) and return (we've read
124 					 * a line!).
125 					 */
126 					if( !surroundingSpacesNeedQuotes ) {
127 						appendSpaces(currentColumn, potentialSpaces);
128 					}
129 					columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null); // "" -> null
130 					return true;
131 				}
132 				else
133 				{
134 					/*
135 					 * Newline. Doesn't count as newline while in QUOTESCOPE. Add the newline char, reset the charIndex
136 					 * (will update to 0 for next iteration), read in the next line, then then continue to next
137 					 * character.
138 					 */
139 					currentColumn.append(NEWLINE);
140 					currentRow.append(NEWLINE); // specific line terminator lost, \n will have to suffice
141 					
142 					charIndex = 0;
143 
144 					if (maxLinesPerRow > 0 && getLineNumber() - quoteScopeStartingLine + 1 >= maxLinesPerRow) {
145 						/*
146 						 * The quoted section that is being parsed spans too many lines, so to avoid excessive memory
147 						 * usage parsing something that is probably human error anyways, throw an exception. If each
148 						 * row is suppose to be a single line and this has been exceeded, throw a more descriptive
149 						 * exception
150 						 */
151 						String msg = maxLinesPerRow == 1 ?
152 								String.format("unexpected end of line while reading quoted column on line %d",
153 											  getLineNumber()) :
154 								String.format("max number of lines to read exceeded while reading quoted column" +
155 											  " beginning on line %d and ending on line %d",
156 											  quoteScopeStartingLine, getLineNumber());
157 						throw new SuperCsvException(msg);
158 					}
159 					else if( (line = readLine()) == null ) {
160 						throw new SuperCsvException(
161 							String
162 								.format(
163 									"unexpected end of file while reading quoted column beginning on line %d and ending on line %d",
164 									quoteScopeStartingLine, getLineNumber()));
165 					}
166 					
167 					currentRow.append(line); // update untokenized CSV row
168 					
169 				    if (line.length() == 0){
170 				    	// consecutive newlines
171                         continue;
172 				    }
173 				}
174 			}
175 			
176 			final char c = line.charAt(charIndex);
177 			
178 			if( TokenizerState.NORMAL.equals(state) ) {
179 				
180 				/*
181 				 * NORMAL mode (not within quotes).
182 				 */
183 				
184 				if( c == delimeterChar ) {
185 					/*
186 					 * Delimiter. Save the column (trim trailing space if required) then continue to next character.
187 					 */
188 					if( !surroundingSpacesNeedQuotes ) {
189 						appendSpaces(currentColumn, potentialSpaces);
190 					}
191 					columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null); // "" -> null
192 					potentialSpaces = 0;
193 					currentColumn.setLength(0);
194 					
195 				} else if( c == SPACE ) {
196 					/*
197 					 * Space. Remember it, then continue to next character.
198 					 */
199 					potentialSpaces++;
200 					
201 				}
202 				else if( c == quoteChar ) {
203 					/*
204 					 * A single quote ("). Update to QUOTESCOPE (but don't save quote), then continue to next character.
205 					 */
206 					state = TokenizerState.QUOTE_MODE;
207 					quoteScopeStartingLine = getLineNumber();
208 					
209 					// cater for spaces before a quoted section (be lenient!)
210 					if( !surroundingSpacesNeedQuotes || currentColumn.length() > 0 ) {
211 						appendSpaces(currentColumn, potentialSpaces);
212 					}
213 					potentialSpaces = 0;
214 					
215 				} else {
216 					/*
217 					 * Just a normal character. Add any required spaces (but trim any leading spaces if surrounding
218 					 * spaces need quotes), add the character, then continue to next character.
219 					 */
220 					if( !surroundingSpacesNeedQuotes || currentColumn.length() > 0 ) {
221 						appendSpaces(currentColumn, potentialSpaces);
222 					}
223 					
224 					potentialSpaces = 0;
225 					currentColumn.append(c);
226 				}
227 				
228 			} else {
229 				
230 				/*
231 				 * QUOTE_MODE (within quotes).
232 				 */
233 				
234 				if( c == quoteChar ) {
235 					int nextCharIndex = charIndex + 1;
236 					boolean availableCharacters = nextCharIndex < line.length();
237 					boolean nextCharIsQuote = availableCharacters && line.charAt(nextCharIndex) == quoteChar;
238 					if( nextCharIsQuote ) {
239 						/*
240 						 * An escaped quote (""). Add a single quote, then move the cursor so the next iteration of the
241 						 * loop will read the character following the escaped quote.
242 						 */
243 						currentColumn.append(c);
244 						charIndex++;
245 						
246 					} else {
247 						/*
248 						 * A single quote ("). Update to NORMAL (but don't save quote), then continue to next character.
249 						 */
250 						state = TokenizerState.NORMAL;
251 						quoteScopeStartingLine = -1; // reset ready for next multi-line cell
252 					}
253 				} else {
254 					/*
255 					 * Just a normal character, delimiter (they don't count in QUOTESCOPE) or space. Add the character,
256 					 * then continue to next character.
257 					 */
258 					currentColumn.append(c);
259 				}
260 			}
261 			
262 			charIndex++; // read next char of the line
263 		}
264 	}
265 	
266 	/**
267 	 * Appends the required number of spaces to the StringBuilder.
268 	 * 
269 	 * @param sb
270 	 *            the StringBuilder
271 	 * @param spaces
272 	 *            the required number of spaces to append
273 	 */
274 	private static void appendSpaces(final StringBuilder sb, final int spaces) {
275 		for( int i = 0; i < spaces; i++ ) {
276 			sb.append(SPACE);
277 		}
278 	}
279 	
280 	/**
281 	 * {@inheritDoc}
282 	 */
283 	public String getUntokenizedRow() {
284 		return currentRow.toString();
285 	}
286 }