You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

csv_test.go 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. // Copyright 2021 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package csv
  5. import (
  6. "bytes"
  7. "encoding/csv"
  8. "io"
  9. "strconv"
  10. "strings"
  11. "testing"
  12. "code.gitea.io/gitea/modules/markup"
  13. "github.com/stretchr/testify/assert"
  14. )
  15. func TestCreateReader(t *testing.T) {
  16. rd := CreateReader(bytes.NewReader([]byte{}), ',')
  17. assert.Equal(t, ',', rd.Comma)
  18. }
  19. func decodeSlashes(t *testing.T, s string) string {
  20. s = strings.ReplaceAll(s, "\n", "\\n")
  21. s = strings.ReplaceAll(s, "\"", "\\\"")
  22. decoded, err := strconv.Unquote(`"` + s + `"`)
  23. assert.NoError(t, err, "unable to decode string")
  24. return decoded
  25. }
// TestCreateReaderAndDetermineDelimiter runs each CSV fixture (after
// decodeSlashes has expanded its backslash escapes) through
// CreateReaderAndDetermineDelimiter and checks both the delimiter set on the
// returned reader and the rows that reader yields.
//
// NOTE(review): case 1 is described as tab delimited and expects '\t', yet
// its fixture shows single spaces where tabs would be expected; case 2 is
// described as having leading spaces that are mostly not visible. It looks
// like whitespace inside the raw strings was collapsed in this copy — verify
// the fixtures against the upstream file.
func TestCreateReaderAndDetermineDelimiter(t *testing.T) {
	cases := []struct {
		csv               string     // raw fixture; backslash escapes are expanded by decodeSlashes
		expectedRows      [][]string // rows the returned reader must produce
		expectedDelimiter rune       // delimiter expected in rd.Comma
	}{
		// case 0 - semicolon delimited
		{
			csv: `a;b;c
1;2;3
4;5;6`,
			expectedRows: [][]string{
				{"a", "b", "c"},
				{"1", "2", "3"},
				{"4", "5", "6"},
			},
			expectedDelimiter: ';',
		},
		// case 1 - tab delimited with empty fields
		{
			csv: `col1 col2 col3
a, b c
e f
g h i
j l
m n,\t
p q r
u
v w x
y\t\t
`,
			expectedRows: [][]string{
				{"col1", "col2", "col3"},
				{"a,", "b", "c"},
				{"", "e", "f"},
				{"g", "h", "i"},
				{"j", "", "l"},
				{"m", "n,", ""},
				{"p", "q", "r"},
				{"", "", "u"},
				{"v", "w", "x"},
				{"y", "", ""},
				{"", "", ""},
			},
			expectedDelimiter: '\t',
		},
		// case 2 - comma delimited with leading spaces
		{
			csv: ` col1,col2,col3
a, b, c
d,e,f
,h, i
j, ,\x20
, , `,
			expectedRows: [][]string{
				{"col1", "col2", "col3"},
				{"a", "b", "c"},
				{"d", "e", "f"},
				{"", "h", "i"},
				{"j", "", ""},
				{"", "", ""},
			},
			expectedDelimiter: ',',
		},
	}

	for n, c := range cases {
		// A nil render context is passed, so no filename is supplied alongside the content.
		rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(decodeSlashes(t, c.csv)))
		assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
		assert.EqualValues(t, c.expectedDelimiter, rd.Comma, "case %d: delimiter should be '%c', got '%c'", n, c.expectedDelimiter, rd.Comma)
		rows, err := rd.ReadAll()
		assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
		assert.EqualValues(t, c.expectedRows, rows, "case %d: rows should be equal", n)
	}
}
  100. type mockReader struct{}
  101. func (r *mockReader) Read(buf []byte) (int, error) {
  102. return 0, io.ErrShortBuffer
  103. }
  104. func TestDetermineDelimiterShortBufferError(t *testing.T) {
  105. rd, err := CreateReaderAndDetermineDelimiter(nil, &mockReader{})
  106. assert.Error(t, err, "CreateReaderAndDetermineDelimiter() should throw an error")
  107. assert.ErrorIs(t, err, io.ErrShortBuffer)
  108. assert.Nil(t, rd, "CSV reader should be mnil")
  109. }
  110. func TestDetermineDelimiterReadAllError(t *testing.T) {
  111. rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(`col1,col2
  112. a;b
  113. c@e
  114. f g
  115. h|i
  116. jkl`))
  117. assert.NoError(t, err, "CreateReaderAndDetermineDelimiter() shouldn't throw error")
  118. assert.NotNil(t, rd, "CSV reader should not be mnil")
  119. rows, err := rd.ReadAll()
  120. assert.Error(t, err, "RaadAll() should throw error")
  121. assert.ErrorIs(t, err, csv.ErrFieldCount)
  122. assert.Empty(t, rows, "rows should be empty")
  123. }
// TestDetermineDelimiter checks determineDelimiter against CSV content paired
// with a filename (possibly empty) carried in a markup.RenderContext. The
// cases cover content-only input as well as .csv, .tsv and .psv filenames.
//
// NOTE(review): case 13 is described as tab delimited and expects '\t', yet
// its fixture shows single spaces where tabs would be expected — it looks
// like whitespace inside the raw string was collapsed in this copy; verify
// against the upstream file.
func TestDetermineDelimiter(t *testing.T) {
	cases := []struct {
		csv               string // raw fixture; backslash escapes are expanded by decodeSlashes
		filename          string // filename placed in the render context, "" for none
		expectedDelimiter rune   // delimiter determineDelimiter must return
	}{
		// case 0 - single column/row CSV with csv extension, comma expected
		{
			csv:               "a",
			filename:          "test.csv",
			expectedDelimiter: ',',
		},
		// case 1 - single column/row CSV with no filename, comma expected
		{
			csv:               "a",
			filename:          "",
			expectedDelimiter: ',',
		},
		// case 2 - content contains a comma, but the tsv file extension means tab is expected
		{
			csv:               "1,2",
			filename:          "test.tsv",
			expectedDelimiter: '\t',
		},
		// case 3 - two column, single row CSV w/ no filename, so will guess comma as delimiter
		{
			csv:               "1,2",
			filename:          "",
			expectedDelimiter: ',',
		},
		// case 4 - semicolon delimited with csv extension
		{
			csv:               "1;2",
			filename:          "test.csv",
			expectedDelimiter: ';',
		},
		// case 5 - tab delimited with tsv extension
		{
			csv:               "1\t2",
			filename:          "test.tsv",
			expectedDelimiter: '\t',
		},
		// case 6 - tab delimited without any filename
		{
			csv:               "1\t2",
			filename:          "",
			expectedDelimiter: '\t',
		},
		// case 7 - tabs won't work, only commas, as every row has the same number of commas
		{
			csv:               "col1,col2\nfirst\tval,seconed\tval",
			filename:          "",
			expectedDelimiter: ',',
		},
		// case 8 - while it looks comma delimited, it has a psv extension
		{
			csv:               "1,2",
			filename:          "test.psv",
			expectedDelimiter: '|',
		},
		// case 9 - pipe delimited with no extension
		{
			csv:               "1|2",
			filename:          "",
			expectedDelimiter: '|',
		},
		// case 10 - semicolon delimited with commas in values
		{
			csv:               "1,2,3;4,5,6;7,8,9\na;b;c",
			filename:          "",
			expectedDelimiter: ';',
		},
		// case 11 - semicolon delimited with newline in content
		{
			csv: `"1,2,3,4";"a
b";%
c;d;#`,
			filename:          "",
			expectedDelimiter: ';',
		},
		// case 12 - HTML as single value
		{
			csv:               "<br/>",
			filename:          "",
			expectedDelimiter: ',',
		},
		// case 13 - tab delimited with commas in values
		{
			csv: `name email note
John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`,
			filename:          "",
			expectedDelimiter: '\t',
		},
	}

	for n, c := range cases {
		// Only the Filename field of the render context is populated for these cases.
		delimiter := determineDelimiter(&markup.RenderContext{Filename: c.filename}, []byte(decodeSlashes(t, c.csv)))
		assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
	}
}
// TestRemoveQuotedString checks removeQuotedString: each fixture contains
// double-quoted fields (some with escaped quotes and embedded newlines) and
// the expected text is the same CSV with those quoted fields removed, leaving
// the surrounding delimiters in place.
//
// NOTE(review): the fixtures are whitespace-sensitive and this copy appears
// to have lost leading whitespace on some continuation lines — verify the
// raw strings against the upstream file.
func TestRemoveQuotedString(t *testing.T) {
	cases := []struct {
		text         string // input; backslash escapes are expanded by decodeSlashes
		expectedText string // compared as-is against the result (not passed through decodeSlashes)
	}{
		// case 0 - quoted text with escaped quotes in 1st column
		{
			text: `col1,col2,col3
"quoted ""text"" with
new lines
in first column",b,c`,
			expectedText: `col1,col2,col3
,b,c`,
		},
		// case 1 - quoted text with escaped quotes in 2nd column
		{
			text: `col1,col2,col3
a,"quoted ""text"" with
new lines
in second column",c`,
			expectedText: `col1,col2,col3
a,,c`,
		},
		// case 2 - quoted text with escaped quotes in last column
		{
			text: `col1,col2,col3
a,b,"quoted ""text"" with
new lines
in last column"`,
			expectedText: `col1,col2,col3
a,b,`,
		},
		// case 3 - csv with lots of quotes
		{
			text: `a,"b",c,d,"e
e
e",f
a,bb,c,d,ee ,"f
f"
a,b,"c ""
c",d,e,f`,
			expectedText: `a,,c,d,,f
a,bb,c,d,ee ,
a,b,,d,e,f`,
		},
		// case 4 - csv with pipes and quotes
		{
			text: `Col1 | Col2 | Col3
abc | "Hello
World"|123
"de
f" | 4.56 | 789`,
			expectedText: `Col1 | Col2 | Col3
abc | |123
| 4.56 | 789`,
		},
	}

	for n, c := range cases {
		modifiedText := removeQuotedString(decodeSlashes(t, c.text))
		assert.EqualValues(t, c.expectedText, modifiedText, "case %d: modified text should be equal", n)
	}
}
// TestGuessDelimiter checks guessDelimiter, which is given only the CSV
// content (no filename). The cases cover each delimiter the expectations
// mention (comma, semicolon, tab, pipe, '@'), quoted values, long input and
// content for which comma is the expected answer.
//
// NOTE(review): several cases (3, 8, 9, 12 and the last lines of 13) are
// described as tab delimited, yet their fixtures show single spaces where
// tabs would be expected — it looks like whitespace inside the raw strings
// was collapsed in this copy; verify against the upstream file.
func TestGuessDelimiter(t *testing.T) {
	cases := []struct {
		csv               string // raw fixture; backslash escapes are expanded by decodeSlashes
		expectedDelimiter rune   // delimiter guessDelimiter must return
	}{
		// case 0 - single cell, comma delimited
		{
			csv:               "a",
			expectedDelimiter: ',',
		},
		// case 1 - two cells, comma delimited
		{
			csv:               "1,2",
			expectedDelimiter: ',',
		},
		// case 2 - semicolon delimited
		{
			csv:               "1;2",
			expectedDelimiter: ';',
		},
		// case 3 - tab delimited
		{
			csv:               "1 2",
			expectedDelimiter: '\t',
		},
		// case 4 - pipe delimited
		{
			csv:               "1|2",
			expectedDelimiter: '|',
		},
		// case 5 - semicolon delimited with commas in text
		{
			csv: `1,2,3;4,5,6;7,8,9
a;b;c`,
			expectedDelimiter: ';',
		},
		// case 6 - semicolon delimited with commas in quoted text
		{
			csv: `"1,2,3,4";"a
b"
c;d`,
			expectedDelimiter: ';',
		},
		// case 7 - HTML
		{
			csv:               "<br/>",
			expectedDelimiter: ',',
		},
		// case 8 - tab delimited with commas in value
		{
			csv: `name email note
John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`,
			expectedDelimiter: '\t',
		},
		// case 9 - tab delimited with new lines in values, commas in values
		{
			csv: `1 "some,""more
""
quoted,
text," a
2 "some,
quoted,\t
text," b
3 "some,
quoted,
text" c
4 "some,
quoted,
text," d`,
			expectedDelimiter: '\t',
		},
		// case 10 - semicolon delimited with quotes and semicolon in value
		{
			csv: `col1;col2
"this has a literal "" in the text";"and an ; in the text"`,
			expectedDelimiter: ';',
		},
		// case 11 - pipe delimited with quotes
		{
			csv: `Col1 | Col2 | Col3
abc | "Hello
World"|123
"de
|
f" | 4.56 | 789`,
			expectedDelimiter: '|',
		},
		// case 12 - a tab delimited 6 column CSV, but the values are not quoted and have lots of commas.
		// In the previous bestScore algorithm, this would have picked comma as the delimiter, but now it should guess tab
		{
			csv: `c1 c2 c3 c4 c5 c6
v,k,x,v ym,f,oa,qn,uqijh,n,s,wvygpo uj,kt,j,w,i,fvv,tm,f,ddt,b,mwt,e,t,teq,rd,p,a e,wfuae,t,h,q,im,ix,y h,mrlu,l,dz,ff,zi,af,emh ,gov,bmfelvb,axp,f,u,i,cni,x,z,v,sh,w,jo,,m,h
k,ohf,pgr,tde,m,s te,ek,,v,,ic,kqc,dv,w,oi,j,w,gojjr,ug,,l,j,zl g,qziq,bcajx,zfow,ka,j,re,ohbc k,nzm,qm,ts,auf th,elb,lx,l,q,e,qf asbr,z,k,y,tltobga
g,m,bu,el h,l,jwi,o,wge,fy,rure,c,g,lcxu,fxte,uns,cl,s,o,t,h,rsoy,f bq,s,uov,z,ikkhgyg,,sabs,c,hzue mc,b,,j,t,n sp,mn,,m,t,dysi,eq,pigb,rfa,z w,rfli,sg,,o,wjjjf,f,wxdzfk,x,t,p,zy,p,mg,r,l,h
e,ewbkc,nugd,jj,sf,ih,i,n,jo,b,poem,kw,q,i,x,t,e,uug,k j,xm,sch,ux,h,,fb,f,pq,,mh,,f,v,,oba,w,h,v,eiz,yzd,o,a,c,e,dhp,q a,pbef,epc,k,rdpuw,cw k,j,e,d xf,dz,sviv,w,sqnzew,t,b v,yg,f,cq,ti,g,m,ta,hm,ym,ii,hxy,p,z,r,e,ga,sfs,r,p,l,aar,w,kox,j
l,d,v,pp,q,j,bxip,w,i,im,qa,o e,o h,w,a,a,qzj,nt,qfn,ut,fvhu,ts hu,q,g,p,q,ofpje,fsqa,frp,p,vih,j,w,k,jx, ln,th,ka,l,b,vgk,rv,hkx rj,v,y,cwm,rao,e,l,wvr,ptc,lm,yg,u,k,i,b,zk,b,gv,fls
velxtnhlyuysbnlchosqlhkozkdapjaueexjwrndwb nglvnv kqiv pbshwlmcexdzipopxjyrxhvjalwp pydvipwlkkpdvbtepahskwuornbsb qwbacgq
l,y,u,bf,y,m,eals,n,cop,h,g,vs,jga,opt x,b,zwmn,hh,b,n,pdj,t,d px yn,vtd,u,y,b,ps,yo,qqnem,mxg,m,al,rd,c,k,d,q,f ilxdxa,m,y,,p,p,y,prgmg,q,n,etj,k,ns b,pl,z,jq,hk
p,gc jn,mzr,bw sb,e,r,dy,ur,wzy,r,c,n,yglr,jbdu,r,pqk,k q,d,,,p,l,euhl,dc,rwh,t,tq,z,h,p,s,t,x,fugr,h wi,zxb,jcig,o,t,k mfh,ym,h,e,p,cnvx,uv,zx,x,pq,blt,v,r,u,tr,g,g,xt
nri,p,,t,if,,y,ptlqq a,i w,ovli,um,w,f,re,k,sb,w,jy,zf i,g,p,q,mii,nr,jm,cc i,szl,k,eg,l,d ,ah,w,b,vh
,,sh,wx,mn,xm,u,d,yy,u,t,m,j,s,b ogadq,g,y,y,i,h,ln,jda,g,cz,s,rv,r,s,s,le,r, y,nu,f,nagj o,h,,adfy,o,nf,ns,gvsvnub,k,b,xyz v,h,g,ef,y,gb c,x,cw,x,go,h,t,x,cu,u,qgrqzrcmn,kq,cd,g,rejp,zcq
skxg,t,vay,d,wug,d,xg,sexc rt g,ag,mjq,fjnyji,iwa,m,ml,b,ua,b,qjxeoc be,s,sh,n,jbzxs,g,n,i,h,y,r,be,mfo,u,p cw,r,,u,zn,eg,r,yac,m,l,edkr,ha,x,g,b,c,tg,c j,ye,u,ejd,maj,ea,bm,u,iy`,
			expectedDelimiter: '\t',
		},
		// case 13 - a CSV with more than 10 lines and since we only use the first 10 lines, it should still get the delimiter as semicolon
		{
			csv: `col1;col2;col3
1;1;1
2;2;2
3;3;3
4;4;4
5;5;5
6;6;6
7;7;7
8;8;8
9;9;9
10;10;10
11 11 11
12|12|12`,
			expectedDelimiter: ';',
		},
		// case 14 - a really long single line (over 10k) that will get truncated, but since it has commas and semicolons (but more semicolons) it will pick semicolon
		{
			csv:               strings.Repeat("a;b,c;", 1700),
			expectedDelimiter: ';',
		},
		// case 15 - lines that together are well over 10k; the last line (comma delimited) is where this CSV will be truncated (10k sample), so only the '@' delimited lines before it are used and '@' will be picked
		{
			csv:               "col1@col2@col3\na@b@" + strings.Repeat("c", 6000) + "\nd,e," + strings.Repeat("f", 4000),
			expectedDelimiter: '@',
		},
		// case 16 - has all delimiters so should return comma
		{
			csv: `col1,col2;col3@col4|col5 col6
a b|c@d;e,f`,
			expectedDelimiter: ',',
		},
		// case 17 - nothing works (bad csv) so returns comma by default
		{
			csv: `col1,col2
a;b
c@e
f g
h|i
jkl`,
			expectedDelimiter: ',',
		},
	}

	for n, c := range cases {
		delimiter := guessDelimiter([]byte(decodeSlashes(t, c.csv)))
		assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
	}
}
// TestGuessFromBeforeAfterQuotes checks guessFromBeforeAfterQuotes on CSV
// content with double-quoted values. Cases 4 and 5 expect the zero rune,
// i.e. no delimiter is reported for them.
//
// NOTE(review): case 0 is described as tab delimited and expects '\t', yet
// its fixture shows single spaces where tabs would be expected — it looks
// like whitespace inside the raw string was collapsed in this copy; verify
// against the upstream file.
func TestGuessFromBeforeAfterQuotes(t *testing.T) {
	cases := []struct {
		csv               string // raw fixture; backslash escapes are expanded by decodeSlashes
		expectedDelimiter rune   // expected result; 0 means no delimiter found
	}{
		// case 0 - tab delimited with new lines in values, commas in values
		{
			csv: `1 "some,""more
""
quoted,
text," a
2 "some,
quoted,\t
text," b
3 "some,
quoted,
text" c
4 "some,
quoted,
text," d`,
			expectedDelimiter: '\t',
		},
		// case 1 - semicolon delimited with quotes and semicolon in value
		{
			csv: `col1;col2
"this has a literal "" in the text";"and an ; in the text"`,
			expectedDelimiter: ';',
		},
		// case 2 - pipe delimited with quotes
		{
			csv: `Col1 | Col2 | Col3
abc | "Hello
World"|123
"de
|
f" | 4.56 | 789`,
			expectedDelimiter: '|',
		},
		// case 3 - a complicated quoted CSV that is semicolon delimited
		{
			csv: `he; she
"he said, ""hey!"""; "she said, ""hey back!"""
but; "be"`,
			expectedDelimiter: ';',
		},
		// case 4 - no delimiter should be found (the content has no quoted value)
		{
			csv:               `a,b`,
			expectedDelimiter: 0,
		},
		// case 5 - no delimiter should be found (single column with one quoted value)
		{
			csv: `col1
"he said, ""here I am"""`,
			expectedDelimiter: 0,
		},
		// case 6 - delimiter before double quoted string with space
		{
			csv: `col1|col2
a| "he said, ""here I am"""`,
			expectedDelimiter: '|',
		},
		// case 7 - delimiter before double quoted string without space
		{
			csv: `col1|col2
a|"he said, ""here I am"""`,
			expectedDelimiter: '|',
		},
		// case 8 - delimiter after double quoted string with space
		{
			csv: `col1, col2
"abc\n
", def`,
			expectedDelimiter: ',',
		},
		// case 9 - delimiter after double quoted string without space
		{
			csv: `col1,col2
"abc\n
",def`,
			expectedDelimiter: ',',
		},
	}

	for n, c := range cases {
		delimiter := guessFromBeforeAfterQuotes([]byte(decodeSlashes(t, c.csv)))
		assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
	}
}
  526. type mockLocale struct{}
  527. func (l mockLocale) Language() string {
  528. return "en"
  529. }
  530. func (l mockLocale) Tr(s string, _ ...interface{}) string {
  531. return s
  532. }
  533. func (l mockLocale) TrN(_cnt interface{}, key1, _keyN string, _args ...interface{}) string {
  534. return key1
  535. }
  536. func TestFormatError(t *testing.T) {
  537. cases := []struct {
  538. err error
  539. expectedMessage string
  540. expectsError bool
  541. }{
  542. {
  543. err: &csv.ParseError{
  544. Err: csv.ErrFieldCount,
  545. },
  546. expectedMessage: "repo.error.csv.invalid_field_count",
  547. expectsError: false,
  548. },
  549. {
  550. err: &csv.ParseError{
  551. Err: csv.ErrBareQuote,
  552. },
  553. expectedMessage: "repo.error.csv.unexpected",
  554. expectsError: false,
  555. },
  556. {
  557. err: bytes.ErrTooLarge,
  558. expectsError: true,
  559. },
  560. }
  561. for n, c := range cases {
  562. message, err := FormatError(c.err, mockLocale{})
  563. if c.expectsError {
  564. assert.Error(t, err, "case %d: expected an error to be returned", n)
  565. } else {
  566. assert.NoError(t, err, "case %d: no error was expected, got error: %v", n, err)
  567. assert.EqualValues(t, c.expectedMessage, message, "case %d: messages should be equal, expected '%s' got '%s'", n, c.expectedMessage, message)
  568. }
  569. }
  570. }