1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package org.supercsv.io;
17
18 import static org.junit.Assert.assertEquals;
19 import static org.junit.Assert.assertFalse;
20 import static org.junit.Assert.assertNull;
21 import static org.junit.Assert.assertTrue;
22 import static org.junit.Assert.fail;
23 import static org.supercsv.prefs.CsvPreference.EXCEL_PREFERENCE;
24
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.io.StringReader;
28 import java.util.ArrayList;
29 import java.util.List;
30
31 import org.junit.After;
32 import org.junit.Before;
33 import org.junit.Test;
34 import org.supercsv.comment.CommentMatches;
35 import org.supercsv.comment.CommentStartsWith;
36 import org.supercsv.exception.SuperCsvException;
37 import org.supercsv.prefs.CsvPreference;
38
39 public class TokenizerTest {
40
41 private static final CsvPreference NORMAL_PREFERENCE = EXCEL_PREFERENCE;
42 private static final CsvPreference SPACES_NEED_QUOTES_PREFERENCE = new CsvPreference.Builder(EXCEL_PREFERENCE)
43 .surroundingSpacesNeedQuotes(true).build();
44 private static final CsvPreference DONT_IGNORE_EMPTY_LINES_PREFERENCE = new CsvPreference.Builder(EXCEL_PREFERENCE)
45 .ignoreEmptyLines(false).build();
46
47 private Tokenizer tokenizer;
48 private List<String> columns;
49
50
51
52
53
54
55 @Before
56 public void setUp() {
57 columns = new ArrayList<String>();
58 }
59
60
61
62
63 @After
64 public void tearDown() throws IOException {
65 if( tokenizer != null ) {
66 tokenizer.close();
67 }
68 }
69
70
71
72
73
74
75
76
77
78
79 private static Tokenizer createTokenizer(String input, CsvPreference preference) {
80 final Reader r = input != null ? new StringReader(input) : null;
81 return new Tokenizer(r, preference);
82 }
83
84
85
86
87 @Test(expected = NullPointerException.class)
88 public void testConstructorWithNullReader() throws Exception {
89 createTokenizer(null, NORMAL_PREFERENCE);
90 }
91
92
93
94
95 @Test(expected = NullPointerException.class)
96 public void testConstructorWithNullPreferences() throws Exception {
97 createTokenizer("", null);
98 }
99
100
101
102
103 @Test(expected = NullPointerException.class)
104 public void testReadColumnsWithNullList() throws Exception {
105 tokenizer = createTokenizer("", NORMAL_PREFERENCE);
106 tokenizer.readColumns(null);
107 }
108
109
110
111
112 @Test()
113 public void testGetPreferences() throws Exception {
114 tokenizer = createTokenizer("", NORMAL_PREFERENCE);
115 CsvPreference prefs = tokenizer.getPreferences();
116 assertEquals(NORMAL_PREFERENCE.getDelimiterChar(), prefs.getDelimiterChar());
117 assertEquals(NORMAL_PREFERENCE.getEndOfLineSymbols(), prefs.getEndOfLineSymbols());
118 assertEquals(NORMAL_PREFERENCE.getQuoteChar(), prefs.getQuoteChar());
119 assertEquals(NORMAL_PREFERENCE.isSurroundingSpacesNeedQuotes(), prefs.isSurroundingSpacesNeedQuotes());
120 }
121
122
123
124
125 @Test
126 public void testReadColumnsWithNoData() throws Exception {
127 final String input = "";
128 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
129 tokenizer.readColumns(columns);
130 assertTrue(columns.isEmpty());
131 assertEquals(input, tokenizer.getUntokenizedRow());
132 }
133
134
135
136
137 @Test
138 public void testEmptyLines() throws Exception {
139
140 final String input = "\n\nthis is the third line\n";
141 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
142 tokenizer.readColumns(columns);
143 assertTrue(columns.size() == 1);
144 assertEquals("this is the third line", columns.get(0));
145 assertEquals(3, tokenizer.getLineNumber());
146 assertEquals("this is the third line", tokenizer.getUntokenizedRow());
147 }
148
149
150
151
152
153 @Test
154 public void testEmptyLinesWithIgnoreEmptyLines() throws Exception {
155
156 final String input = "\nthis is the second line\n\n";
157 tokenizer = createTokenizer(input, DONT_IGNORE_EMPTY_LINES_PREFERENCE);
158 tokenizer.readColumns(columns);
159 assertTrue(columns.size() == 1);
160 assertNull(columns.get(0));
161 assertEquals(1, tokenizer.getLineNumber());
162 assertEquals("", tokenizer.getUntokenizedRow());
163
164 tokenizer.readColumns(columns);
165 assertTrue(columns.size() == 1);
166 assertEquals("this is the second line", columns.get(0));
167 assertEquals(2, tokenizer.getLineNumber());
168 assertEquals("this is the second line", tokenizer.getUntokenizedRow());
169
170 tokenizer.readColumns(columns);
171 assertTrue(columns.size() == 1);
172 assertNull(columns.get(0));
173 assertEquals(3, tokenizer.getLineNumber());
174 assertEquals("", tokenizer.getUntokenizedRow());
175 }
176
177
178
179
180
181 @Test
182 public void testQuotedFieldWithSurroundingText() throws Exception {
183
184 final String input = "surrounding \"quoted\" text";
185 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
186 tokenizer.readColumns(columns);
187 assertTrue(columns.size() == 1);
188 assertEquals("surrounding quoted text", columns.get(0));
189 assertEquals(1, tokenizer.getLineNumber());
190 assertEquals(input, tokenizer.getUntokenizedRow());
191
192
193 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
194 tokenizer.readColumns(columns);
195 assertTrue(columns.size() == 1);
196 assertEquals("surrounding quoted text", columns.get(0));
197 assertEquals(1, tokenizer.getLineNumber());
198 assertEquals(input, tokenizer.getUntokenizedRow());
199 }
200
201
202
203
204
205 @Test
206 public void testQuotedFieldWithTextAfter() throws Exception {
207
208
209 final String input = "\"quoted on 2 lines\nand afterward some\" text";
210 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
211 tokenizer.readColumns(columns);
212 assertEquals(1, columns.size());
213 assertEquals("quoted on 2 lines\nand afterward some text", columns.get(0));
214 assertEquals(2, tokenizer.getLineNumber());
215 assertEquals(input, tokenizer.getUntokenizedRow());
216
217
218 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
219 tokenizer.readColumns(columns);
220 assertEquals(1, columns.size());
221 assertEquals("quoted on 2 lines\nand afterward some text", columns.get(0));
222 assertEquals(2, tokenizer.getLineNumber());
223 assertEquals(input, tokenizer.getUntokenizedRow());
224 }
225
226
227
228
229 @Test
230 public void testQuotedNewline() throws Exception {
231
232 final String input = "\"\n\"";
233 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
234 tokenizer.readColumns(columns);
235 assertTrue(columns.size() == 1);
236 assertEquals("\n", columns.get(0));
237 assertEquals(input, tokenizer.getUntokenizedRow());
238
239
240 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
241 tokenizer.readColumns(columns);
242 assertTrue(columns.size() == 1);
243 assertEquals("\n", columns.get(0));
244 assertEquals(input, tokenizer.getUntokenizedRow());
245 }
246
247
248
249
250 @Test
251 public void testQuotedNewlines() throws Exception {
252
253 final String input = "\"one line\",\"two\nlines\",\"three\nlines\n!\"";
254 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
255 tokenizer.readColumns(columns);
256 assertTrue(columns.size() == 3);
257 assertEquals("one line", columns.get(0));
258 assertEquals("two\nlines", columns.get(1));
259 assertEquals("three\nlines\n!", columns.get(2));
260 assertEquals(input, tokenizer.getUntokenizedRow());
261
262
263 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
264 tokenizer.readColumns(columns);
265 assertTrue(columns.size() == 3);
266 assertEquals("one line", columns.get(0));
267 assertEquals("two\nlines", columns.get(1));
268 assertEquals("three\nlines\n!", columns.get(2));
269 assertEquals(input, tokenizer.getUntokenizedRow());
270 }
271
272
273
274
275
276 @Test
277 public void testQuotedTextWithConsecutiveNewLines() throws Exception {
278
279
280 final String input = "one, \"multiline\n\n\ntext\"";
281 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
282 tokenizer.readColumns(columns);
283 assertEquals(2, columns.size());
284 assertEquals("one", columns.get(0));
285 assertEquals(" multiline\n\n\ntext", columns.get(1));
286 assertEquals(4, tokenizer.getLineNumber());
287 assertEquals(input, tokenizer.getUntokenizedRow());
288
289
290 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
291 tokenizer.readColumns(columns);
292 assertTrue(columns.size() == 2);
293 assertEquals("one", columns.get(0));
294 assertEquals("multiline\n\n\ntext", columns.get(1));
295 assertEquals(input, tokenizer.getUntokenizedRow());
296 }
297
298
299
300
301 @Test
302 public void testQuotedFieldWithUnexpectedEOF() throws Exception {
303
304
305 final String input = "\"quoted spanning\ntwo lines with EOF reached before another quote";
306 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
307 try {
308 tokenizer.readColumns(columns);
309 fail("should have thrown SuperCsvException");
310 }
311 catch(SuperCsvException e) {
312 assertEquals("unexpected end of file while reading quoted column beginning on line 1 and ending on line 2",
313 e.getMessage());
314 }
315 }
316
317
318
319
320
321 @Test
322 public void testQuotedFieldWithUnexpectedNewline() throws Exception {
323
324
325 final String input = "col1,col2\n" +
326 "\"foo\",\"bar\n" +
327 "\"baz\",\"zoo\"\n" +
328 "\"aaa\",\"bbb\"";
329 CsvPreference pref = new CsvPreference.Builder(NORMAL_PREFERENCE)
330 .maxLinesPerRow(1).build();
331
332 tokenizer = createTokenizer(input, pref);
333 try {
334 boolean first = tokenizer.readColumns(columns);
335 assertEquals(true , first);
336
337 tokenizer.readColumns(columns);
338 fail("should have thrown SuperCsvException");
339 }
340 catch(SuperCsvException e) {
341 assertEquals("unexpected end of line while reading quoted column on line 2",
342 e.getMessage());
343 }
344 }
345
346
347
348
349
350 @Test
351 public void testQuotedFieldWithTwoMaxLines() throws Exception {
352
353
354 final String input = "col1,col2\n" +
355 "\"foo\",\"bar\n" +
356 "baz,zoo\n" +
357 "aaa,bbb";
358 CsvPreference pref = new CsvPreference.Builder(NORMAL_PREFERENCE)
359 .maxLinesPerRow(2).build();
360
361 tokenizer = createTokenizer(input, pref);
362 try {
363 boolean first = tokenizer.readColumns(columns);
364 assertEquals(true , first);
365
366 boolean second = tokenizer.readColumns(columns);
367 assertEquals(true , second);
368
369 tokenizer.readColumns(columns);
370 fail("should have thrown SuperCsvException");
371 }
372 catch(SuperCsvException e) {
373 assertEquals("max number of lines to read exceeded while reading quoted column beginning on line 2 and ending on line 3",
374 e.getMessage());
375 }
376 }
377
378 @Test
379 public void testQuotedFieldWithUnexpectedNewlineNoNextLineRead() throws Exception {
380
381
382 final String input = "col1,col2\n" +
383 "\"foo\",\"bar\n" +
384 "\"baz\",\"zoo\"\n" +
385 "\"aaa\",\"bbb\"";
386 CsvPreference pref = new CsvPreference.Builder(NORMAL_PREFERENCE)
387 .maxLinesPerRow(1).build();
388
389 tokenizer = createTokenizer(input, pref);
390 try {
391 final boolean first = tokenizer.readColumns(columns);
392 assertEquals(true , first);
393 assertEquals("[col1, col2]" , columns.toString());
394
395 tokenizer.readColumns(columns);
396 fail("should have thrown SuperCsvException");
397 }
398 catch(SuperCsvException e) {
399 assertEquals("unexpected end of line while reading quoted column on line 2",
400 e.getMessage());
401 }
402 final boolean third = tokenizer.readColumns(columns);
403 assertEquals(true , third);
404 assertEquals("[baz, zoo]" , columns.toString());
405
406 final boolean fourth = tokenizer.readColumns(columns);
407 assertEquals(true , fourth);
408 assertEquals("[aaa, bbb]" , columns.toString());
409
410
411 final boolean fifth = tokenizer.readColumns(columns);
412 assertEquals(false , fifth);
413 }
414
415
416
417
418
419 @Test
420 public void testQuotedFieldWithTwoMaxLinesNoMoreLinesRead() throws Exception {
421
422
423 final String input = "col1,col2\n" +
424 "\"foo,bar\n" +
425 "baz,zoo\n" +
426 "aaa,bbb";
427 CsvPreference pref = new CsvPreference.Builder(NORMAL_PREFERENCE)
428 .maxLinesPerRow(2).build();
429
430 tokenizer = createTokenizer(input, pref);
431 try {
432 boolean first = tokenizer.readColumns(columns);
433 assertEquals(true , first);
434 assertEquals("[col1, col2]" , columns.toString());
435
436
437 boolean second = tokenizer.readColumns(columns);
438 assertEquals(true , second);
439 assertEquals("[\"foo,bar]" , columns.toString());
440
441
442 tokenizer.readColumns(columns);
443 fail("should have thrown SuperCsvException");
444 }
445 catch(SuperCsvException e) {
446 assertEquals("max number of lines to read exceeded while reading quoted column beginning on line 2 and ending on line 3",
447 e.getMessage());
448 }
449 boolean fourth = tokenizer.readColumns(columns);
450 assertEquals(true , fourth);
451 assertEquals("[aaa, bbb]" , columns.toString());
452
453 }
454
455
456
457
458
459
460 @Test
461 public void testQuotedFirstFieldWithLeadingSpace() throws Exception {
462
463
464 final String input = " \"quoted with leading spaces\",two";
465 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
466 tokenizer.readColumns(columns);
467 assertTrue(columns.size() == 2);
468 assertEquals(" quoted with leading spaces", columns.get(0));
469 assertEquals("two", columns.get(1));
470 assertEquals(input, tokenizer.getUntokenizedRow());
471
472
473 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
474 tokenizer.readColumns(columns);
475 assertTrue(columns.size() == 2);
476 assertEquals("quoted with leading spaces", columns.get(0));
477 assertEquals("two", columns.get(1));
478 assertEquals(input, tokenizer.getUntokenizedRow());
479 }
480
481
482
483
484
485
486 @Test
487 public void testQuotedLastFieldWithLeadingSpace() throws Exception {
488
489
490 final String input = "one,two, \"quoted with leading spaces\"";
491 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
492 tokenizer.readColumns(columns);
493 assertTrue(columns.size() == 3);
494 assertEquals("one", columns.get(0));
495 assertEquals("two", columns.get(1));
496 assertEquals(" quoted with leading spaces", columns.get(2));
497 assertEquals(input, tokenizer.getUntokenizedRow());
498
499
500 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
501 tokenizer.readColumns(columns);
502 assertTrue(columns.size() == 3);
503 assertEquals("one", columns.get(0));
504 assertEquals("two", columns.get(1));
505 assertEquals("quoted with leading spaces", columns.get(2));
506 assertEquals(input, tokenizer.getUntokenizedRow());
507 }
508
509
510
511
512
513
514 @Test
515 public void testQuotedFirstFieldWithTrailingSpace() throws Exception {
516
517
518 final String input = "\"quoted with trailing spaces\" ,two";
519 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
520 tokenizer.readColumns(columns);
521 assertTrue(columns.size() == 2);
522 assertEquals("quoted with trailing spaces ", columns.get(0));
523 assertEquals("two", columns.get(1));
524 assertEquals(input, tokenizer.getUntokenizedRow());
525
526
527 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
528 tokenizer.readColumns(columns);
529 assertTrue(columns.size() == 2);
530 assertEquals("quoted with trailing spaces", columns.get(0));
531 assertEquals("two", columns.get(1));
532 assertEquals(input, tokenizer.getUntokenizedRow());
533 }
534
535
536
537
538
539
540 @Test
541 public void testQuotedLastFieldWithTrailingSpace() throws Exception {
542
543
544 final String input = "one,two,\"quoted with trailing spaces\" ";
545 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
546 tokenizer.readColumns(columns);
547 assertTrue(columns.size() == 3);
548 assertEquals("one", columns.get(0));
549 assertEquals("two", columns.get(1));
550 assertEquals("quoted with trailing spaces ", columns.get(2));
551 assertEquals(input, tokenizer.getUntokenizedRow());
552
553
554 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
555 tokenizer.readColumns(columns);
556 assertTrue(columns.size() == 3);
557 assertEquals("one", columns.get(0));
558 assertEquals("two", columns.get(1));
559 assertEquals("quoted with trailing spaces", columns.get(2));
560 assertEquals(input, tokenizer.getUntokenizedRow());
561 }
562
563
564
565
566 @Test
567 public void testQuotedSpaces() throws Exception {
568
569 final String input = "\" one \",\" two \",\" three \"";
570 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
571 tokenizer.readColumns(columns);
572 assertTrue(columns.size() == 3);
573 assertEquals(" one ", columns.get(0));
574 assertEquals(" two ", columns.get(1));
575 assertEquals(" three ", columns.get(2));
576 assertEquals(input, tokenizer.getUntokenizedRow());
577
578
579 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
580 tokenizer.readColumns(columns);
581 assertTrue(columns.size() == 3);
582 assertEquals(" one ", columns.get(0));
583 assertEquals(" two ", columns.get(1));
584 assertEquals(" three ", columns.get(2));
585 assertEquals(input, tokenizer.getUntokenizedRow());
586 }
587
588
589
590
591 @Test
592 public void testSpaces() throws Exception {
593
594 final String input = " one , two , three ";
595 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
596 tokenizer.readColumns(columns);
597 assertTrue(columns.size() == 3);
598 assertEquals(" one ", columns.get(0));
599 assertEquals(" two ", columns.get(1));
600 assertEquals(" three ", columns.get(2));
601 assertEquals(input, tokenizer.getUntokenizedRow());
602
603
604 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
605 tokenizer.readColumns(columns);
606 assertTrue(columns.size() == 3);
607 assertEquals("one", columns.get(0));
608 assertEquals("two", columns.get(1));
609 assertEquals("three", columns.get(2));
610 assertEquals(input, tokenizer.getUntokenizedRow());
611 }
612
613
614
615
616 @Test
617 public void testSpacesAndTabs() throws Exception {
618
619
620 final String input = "\t, \tone\t , \ttwo\t , \tthree\t ";
621 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
622 tokenizer.readColumns(columns);
623 assertTrue(columns.size() == 4);
624 assertEquals("\t", columns.get(0));
625 assertEquals(" \tone\t ", columns.get(1));
626 assertEquals(" \ttwo\t ", columns.get(2));
627 assertEquals(" \tthree\t ", columns.get(3));
628 assertEquals(input, tokenizer.getUntokenizedRow());
629
630
631 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
632 tokenizer.readColumns(columns);
633 assertTrue(columns.size() == 4);
634 assertEquals("\t", columns.get(0));
635 assertEquals("\tone\t", columns.get(1));
636 assertEquals("\ttwo\t", columns.get(2));
637 assertEquals("\tthree\t", columns.get(3));
638 assertEquals(input, tokenizer.getUntokenizedRow());
639 }
640
641
642
643
644 @Test
645 public void testSpacesBetweenWords() throws Exception {
646
647 final String input = " one partridge , two turtle doves , three french hens ";
648 tokenizer = createTokenizer(input, NORMAL_PREFERENCE);
649 tokenizer.readColumns(columns);
650 assertTrue(columns.size() == 3);
651 assertEquals(" one partridge ", columns.get(0));
652 assertEquals(" two turtle doves ", columns.get(1));
653 assertEquals(" three french hens ", columns.get(2));
654 assertEquals(input, tokenizer.getUntokenizedRow());
655
656
657 tokenizer = createTokenizer(input, SPACES_NEED_QUOTES_PREFERENCE);
658 tokenizer.readColumns(columns);
659 assertTrue(columns.size() == 3);
660 assertEquals("one partridge", columns.get(0));
661 assertEquals("two turtle doves", columns.get(1));
662 assertEquals("three french hens", columns.get(2));
663 assertEquals(input, tokenizer.getUntokenizedRow());
664 }
665
666
667
668
669 @Test
670 public void testSkipCommentsStartsWith() throws IOException {
671
672 final CsvPreference commentsStartWithPrefs = new CsvPreference.Builder(EXCEL_PREFERENCE).skipComments(
673 new CommentStartsWith("#")).build();
674
675 final String input = "#comment\nnot,a,comment\n# another comment\nalso,not,comment";
676 final Tokenizer tokenizer = createTokenizer(input, commentsStartWithPrefs);
677 tokenizer.readColumns(columns);
678 assertTrue(columns.size() == 3);
679 assertEquals("not", columns.get(0));
680 assertEquals("a", columns.get(1));
681 assertEquals("comment", columns.get(2));
682
683 tokenizer.readColumns(columns);
684 assertTrue(columns.size() == 3);
685 assertEquals("also", columns.get(0));
686 assertEquals("not", columns.get(1));
687 assertEquals("comment", columns.get(2));
688
689 assertFalse(tokenizer.readColumns(columns));
690 }
691
692
693
694
695 @Test
696 public void testSkipCommentsMatches() throws IOException {
697
698 final CsvPreference commentsMatchesPrefs = new CsvPreference.Builder(EXCEL_PREFERENCE).skipComments(
699 new CommentMatches("<!--.*-->")).build();
700
701 final String input = "<!--comment-->\nnot,a,comment\n<!-- another comment-->\nalso,not,comment";
702 final Tokenizer tokenizer = createTokenizer(input, commentsMatchesPrefs);
703 tokenizer.readColumns(columns);
704 assertTrue(columns.size() == 3);
705 assertEquals("not", columns.get(0));
706 assertEquals("a", columns.get(1));
707 assertEquals("comment", columns.get(2));
708
709 tokenizer.readColumns(columns);
710 assertTrue(columns.size() == 3);
711 assertEquals("also", columns.get(0));
712 assertEquals("not", columns.get(1));
713 assertEquals("comment", columns.get(2));
714
715 assertFalse(tokenizer.readColumns(columns));
716
717 }
718
719 }