1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package org.supercsv.io;
17
18 import java.io.IOException;
19 import java.io.Reader;
20 import java.util.List;
21
22 import org.supercsv.comment.CommentMatcher;
23 import org.supercsv.exception.SuperCsvException;
24 import org.supercsv.prefs.CsvPreference;
25
26
27
28
29
30
31
32
33
34 public class Tokenizer extends AbstractTokenizer {
35
36 private static final char NEWLINE = '\n';
37
38 private static final char SPACE = ' ';
39
40 private final StringBuilder currentColumn = new StringBuilder();
41
42
43 private final StringBuilder currentRow = new StringBuilder();
44
45 private final int quoteChar;
46
47 private final int delimeterChar;
48
49 private final boolean surroundingSpacesNeedQuotes;
50
51 private final boolean ignoreEmptyLines;
52
53 private final CommentMatcher commentMatcher;
54
55 private final int maxLinesPerRow;
56
57
58
59
60 private enum TokenizerState {
61 NORMAL, QUOTE_MODE;
62 }
63
64
65
66
67
68
69
70
71
72
73
74 public Tokenizer(final Reader reader, final CsvPreference preferences) {
75 super(reader, preferences);
76 this.quoteChar = preferences.getQuoteChar();
77 this.delimeterChar = preferences.getDelimiterChar();
78 this.surroundingSpacesNeedQuotes = preferences.isSurroundingSpacesNeedQuotes();
79 this.ignoreEmptyLines = preferences.isIgnoreEmptyLines();
80 this.commentMatcher = preferences.getCommentMatcher();
81 this.maxLinesPerRow = preferences.getMaxLinesPerRow();
82 }
83
84
85
86
87 public boolean readColumns(final List<String> columns) throws IOException {
88
89 if( columns == null ) {
90 throw new NullPointerException("columns should not be null");
91 }
92
93
94 columns.clear();
95 currentColumn.setLength(0);
96 currentRow.setLength(0);
97
98
99 String line;
100 do {
101 line = readLine();
102 if( line == null ) {
103 return false;
104 }
105 }
106 while( ignoreEmptyLines && line.length() == 0 || (commentMatcher != null && commentMatcher.isComment(line)) );
107
108
109 currentRow.append(line);
110
111
112 TokenizerState state = TokenizerState.NORMAL;
113 int quoteScopeStartingLine = -1;
114 int potentialSpaces = 0;
115 int charIndex = 0;
116 while( true ) {
117 boolean endOfLineReached = charIndex == line.length();
118
119 if( endOfLineReached )
120 {
121 if( TokenizerState.NORMAL.equals(state) ) {
122
123
124
125
126 if( !surroundingSpacesNeedQuotes ) {
127 appendSpaces(currentColumn, potentialSpaces);
128 }
129 columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null);
130 return true;
131 }
132 else
133 {
134
135
136
137
138
139 currentColumn.append(NEWLINE);
140 currentRow.append(NEWLINE);
141
142 charIndex = 0;
143
144 if (maxLinesPerRow > 0 && getLineNumber() - quoteScopeStartingLine + 1 >= maxLinesPerRow) {
145
146
147
148
149
150
151 String msg = maxLinesPerRow == 1 ?
152 String.format("unexpected end of line while reading quoted column on line %d",
153 getLineNumber()) :
154 String.format("max number of lines to read exceeded while reading quoted column" +
155 " beginning on line %d and ending on line %d",
156 quoteScopeStartingLine, getLineNumber());
157 throw new SuperCsvException(msg);
158 }
159 else if( (line = readLine()) == null ) {
160 throw new SuperCsvException(
161 String
162 .format(
163 "unexpected end of file while reading quoted column beginning on line %d and ending on line %d",
164 quoteScopeStartingLine, getLineNumber()));
165 }
166
167 currentRow.append(line);
168
169 if (line.length() == 0){
170
171 continue;
172 }
173 }
174 }
175
176 final char c = line.charAt(charIndex);
177
178 if( TokenizerState.NORMAL.equals(state) ) {
179
180
181
182
183
184 if( c == delimeterChar ) {
185
186
187
188 if( !surroundingSpacesNeedQuotes ) {
189 appendSpaces(currentColumn, potentialSpaces);
190 }
191 columns.add(currentColumn.length() > 0 ? currentColumn.toString() : null);
192 potentialSpaces = 0;
193 currentColumn.setLength(0);
194
195 } else if( c == SPACE ) {
196
197
198
199 potentialSpaces++;
200
201 }
202 else if( c == quoteChar ) {
203
204
205
206 state = TokenizerState.QUOTE_MODE;
207 quoteScopeStartingLine = getLineNumber();
208
209
210 if( !surroundingSpacesNeedQuotes || currentColumn.length() > 0 ) {
211 appendSpaces(currentColumn, potentialSpaces);
212 }
213 potentialSpaces = 0;
214
215 } else {
216
217
218
219
220 if( !surroundingSpacesNeedQuotes || currentColumn.length() > 0 ) {
221 appendSpaces(currentColumn, potentialSpaces);
222 }
223
224 potentialSpaces = 0;
225 currentColumn.append(c);
226 }
227
228 } else {
229
230
231
232
233
234 if( c == quoteChar ) {
235 int nextCharIndex = charIndex + 1;
236 boolean availableCharacters = nextCharIndex < line.length();
237 boolean nextCharIsQuote = availableCharacters && line.charAt(nextCharIndex) == quoteChar;
238 if( nextCharIsQuote ) {
239
240
241
242
243 currentColumn.append(c);
244 charIndex++;
245
246 } else {
247
248
249
250 state = TokenizerState.NORMAL;
251 quoteScopeStartingLine = -1;
252 }
253 } else {
254
255
256
257
258 currentColumn.append(c);
259 }
260 }
261
262 charIndex++;
263 }
264 }
265
266
267
268
269
270
271
272
273
274 private static void appendSpaces(final StringBuilder sb, final int spaces) {
275 for( int i = 0; i < spaces; i++ ) {
276 sb.append(SPACE);
277 }
278 }
279
280
281
282
283 public String getUntokenizedRow() {
284 return currentRow.toString();
285 }
286 }