Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

csv_test.go 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. // Copyright 2021 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package csv
  4. import (
  5. "bytes"
  6. "encoding/csv"
  7. "io"
  8. "strconv"
  9. "strings"
  10. "testing"
  11. "code.gitea.io/gitea/modules/git"
  12. "code.gitea.io/gitea/modules/markup"
  13. "code.gitea.io/gitea/modules/translation"
  14. "github.com/stretchr/testify/assert"
  15. )
  16. func TestCreateReader(t *testing.T) {
  17. rd := CreateReader(bytes.NewReader([]byte{}), ',')
  18. assert.Equal(t, ',', rd.Comma)
  19. }
  20. func decodeSlashes(t *testing.T, s string) string {
  21. s = strings.ReplaceAll(s, "\n", "\\n")
  22. s = strings.ReplaceAll(s, "\"", "\\\"")
  23. decoded, err := strconv.Unquote(`"` + s + `"`)
  24. assert.NoError(t, err, "unable to decode string")
  25. return decoded
  26. }
  // TestCreateReaderAndDetermineDelimiter runs sample CSV documents through
  // CreateReaderAndDetermineDelimiter and checks both the delimiter stored in
  // rd.Comma and the rows returned by rd.ReadAll.
  27. func TestCreateReaderAndDetermineDelimiter(t *testing.T) {
  28. cases := []struct {
  29. csv string
  30. expectedRows [][]string
  31. expectedDelimiter rune
  32. }{
  33. // case 0 - semicolon delimited
  34. {
  35. csv: `a;b;c
  36. 1;2;3
  37. 4;5;6`,
  38. expectedRows: [][]string{
  39. {"a", "b", "c"},
  40. {"1", "2", "3"},
  41. {"4", "5", "6"},
  42. },
  43. expectedDelimiter: ';',
  44. },
  45. // case 1 - tab delimited with empty fields
  // NOTE(review): expectedDelimiter is '\t', so the column gaps in this sample
  // are presumably tab characters - confirm they are not stored as spaces.
  46. {
  47. csv: `col1 col2 col3
  48. a, b c
  49. e f
  50. g h i
  51. j l
  52. m n,\t
  53. p q r
  54. u
  55. v w x
  56. y\t\t
  57. `,
  58. expectedRows: [][]string{
  59. {"col1", "col2", "col3"},
  60. {"a,", "b", "c"},
  61. {"", "e", "f"},
  62. {"g", "h", "i"},
  63. {"j", "", "l"},
  64. {"m", "n,", ""},
  65. {"p", "q", "r"},
  66. {"", "", "u"},
  67. {"v", "w", "x"},
  68. {"y", "", ""},
  69. {"", "", ""},
  70. },
  71. expectedDelimiter: '\t',
  72. },
  73. // case 2 - comma delimited with leading spaces
  74. {
  75. csv: ` col1,col2,col3
  76. a, b, c
  77. d,e,f
  78. ,h, i
  79. j, ,\x20
  80. , , `,
  81. expectedRows: [][]string{
  82. {"col1", "col2", "col3"},
  83. {"a", "b", "c"},
  84. {"d", "e", "f"},
  85. {"", "h", "i"},
  86. {"j", "", ""},
  87. {"", "", ""},
  88. },
  89. expectedDelimiter: ',',
  90. },
  91. }
  92. for n, c := range cases {
  // decodeSlashes expands escapes such as \t and \x20 that are spelled out in
  // the raw strings above before the sample is handed to the reader.
  93. rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(decodeSlashes(t, c.csv)))
  94. assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
  95. assert.EqualValues(t, c.expectedDelimiter, rd.Comma, "case %d: delimiter should be '%c', got '%c'", n, c.expectedDelimiter, rd.Comma)
  96. rows, err := rd.ReadAll()
  97. assert.NoError(t, err, "case %d: should not throw error: %v\n", n, err)
  98. assert.EqualValues(t, c.expectedRows, rows, "case %d: rows should be equal", n)
  99. }
  100. }
  // mockReader is a stub reader whose Read always fails with io.ErrShortBuffer.
  type mockReader struct{}

  // Compile-time check that *mockReader can be used wherever an io.Reader is expected.
  var _ io.Reader = (*mockReader)(nil)

  // Read ignores the supplied buffer and reports zero bytes read together with
  // io.ErrShortBuffer.
  func (*mockReader) Read(_ []byte) (int, error) {
  	return 0, io.ErrShortBuffer
  }
  105. func TestDetermineDelimiterShortBufferError(t *testing.T) {
  106. rd, err := CreateReaderAndDetermineDelimiter(nil, &mockReader{})
  107. assert.Error(t, err, "CreateReaderAndDetermineDelimiter() should throw an error")
  108. assert.ErrorIs(t, err, io.ErrShortBuffer)
  109. assert.Nil(t, rd, "CSV reader should be mnil")
  110. }
  111. func TestDetermineDelimiterReadAllError(t *testing.T) {
  112. rd, err := CreateReaderAndDetermineDelimiter(nil, strings.NewReader(`col1,col2
  113. a;b
  114. c@e
  115. f g
  116. h|i
  117. jkl`))
  118. assert.NoError(t, err, "CreateReaderAndDetermineDelimiter() shouldn't throw error")
  119. assert.NotNil(t, rd, "CSV reader should not be mnil")
  120. rows, err := rd.ReadAll()
  121. assert.Error(t, err, "RaadAll() should throw error")
  122. assert.ErrorIs(t, err, csv.ErrFieldCount)
  123. assert.Empty(t, rows, "rows should be empty")
  124. }
  // TestDetermineDelimiter checks the delimiter returned by determineDelimiter
  // for combinations of CSV content and file name; the file name is passed to
  // determineDelimiter as RenderContext.RelativePath.
  125. func TestDetermineDelimiter(t *testing.T) {
  126. cases := []struct {
  127. csv string
  128. filename string
  129. expectedDelimiter rune
  130. }{
  131. // case 0 - single value with csv extension, so comma is expected
  132. {
  133. csv: "a",
  134. filename: "test.csv",
  135. expectedDelimiter: ',',
  136. },
  137. // case 1 - single column/row CSV
  138. {
  139. csv: "a",
  140. filename: "",
  141. expectedDelimiter: ',',
  142. },
  143. // case 2 - single column, single row CSV w/ tsv file extension (so is tabbed delimited)
  144. {
  145. csv: "1,2",
  146. filename: "test.tsv",
  147. expectedDelimiter: '\t',
  148. },
  149. // case 3 - two column, single row CSV w/ no filename, so will guess comma as delimiter
  150. {
  151. csv: "1,2",
  152. filename: "",
  153. expectedDelimiter: ',',
  154. },
  155. // case 4 - semi-colon delimited with csv extension
  156. {
  157. csv: "1;2",
  158. filename: "test.csv",
  159. expectedDelimiter: ';',
  160. },
  161. // case 5 - tabbed delimited with tsv extension
  162. {
  163. csv: "1\t2",
  164. filename: "test.tsv",
  165. expectedDelimiter: '\t',
  166. },
  167. // case 6 - tabbed delimited without any filename
  168. {
  169. csv: "1\t2",
  170. filename: "",
  171. expectedDelimiter: '\t',
  172. },
  173. // case 7 - tabs won't work, only commas as every row has same amount of commas
  174. {
  175. csv: "col1,col2\nfirst\tval,seconed\tval",
  176. filename: "",
  177. expectedDelimiter: ',',
  178. },
  179. // case 8 - While looks like comma delimited, has psv extension
  180. {
  181. csv: "1,2",
  182. filename: "test.psv",
  183. expectedDelimiter: '|',
  184. },
  185. // case 9 - pipe delimited with no extension
  186. {
  187. csv: "1|2",
  188. filename: "",
  189. expectedDelimiter: '|',
  190. },
  191. // case 10 - semi-colon delimited with commas in values
  192. {
  193. csv: "1,2,3;4,5,6;7,8,9\na;b;c",
  194. filename: "",
  195. expectedDelimiter: ';',
  196. },
  197. // case 11 - semi-colon delimited with newline in content
  198. {
  199. csv: `"1,2,3,4";"a
  200. b";%
  201. c;d;#`,
  202. filename: "",
  203. expectedDelimiter: ';',
  204. },
  205. // case 12 - HTML as single value
  206. {
  207. csv: "<br/>",
  208. filename: "",
  209. expectedDelimiter: ',',
  210. },
  211. // case 13 - tab delimited with commas in values
  // NOTE(review): expectedDelimiter is '\t', so the column gaps in this sample
  // are presumably tab characters - confirm they are not stored as spaces.
  212. {
  213. csv: `name email note
  214. John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`,
  215. filename: "",
  216. expectedDelimiter: '\t',
  217. },
  218. }
  219. for n, c := range cases {
  // Escapes written inside the samples are expanded by decodeSlashes before
  // the bytes are handed to determineDelimiter.
  220. delimiter := determineDelimiter(&markup.RenderContext{
  221. Ctx: git.DefaultContext,
  222. RelativePath: c.filename,
  223. }, []byte(decodeSlashes(t, c.csv)))
  224. assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
  225. }
  226. }
  // TestRemoveQuotedString checks that removeQuotedString drops double-quoted
  // values (including ones containing escaped quotes and line breaks) from CSV
  // text while leaving the text around them in place.
  227. func TestRemoveQuotedString(t *testing.T) {
  228. cases := []struct {
  229. text string
  230. expectedText string
  231. }{
  232. // case 0 - quoted text with escaped quotes in 1st column
  233. {
  234. text: `col1,col2,col3
  235. "quoted ""text"" with
  236. new lines
  237. in first column",b,c`,
  238. expectedText: `col1,col2,col3
  239. ,b,c`,
  240. },
  241. // case 1 - quoted text with escaped quotes in 2nd column
  242. {
  243. text: `col1,col2,col3
  244. a,"quoted ""text"" with
  245. new lines
  246. in second column",c`,
  247. expectedText: `col1,col2,col3
  248. a,,c`,
  249. },
  250. // case 2 - quoted text with escaped quotes in last column
  251. {
  252. text: `col1,col2,col3
  253. a,b,"quoted ""text"" with
  254. new lines
  255. in last column"`,
  256. expectedText: `col1,col2,col3
  257. a,b,`,
  258. },
  259. // case 3 - csv with lots of quotes
  260. {
  261. text: `a,"b",c,d,"e
  262. e
  263. e",f
  264. a,bb,c,d,ee ,"f
  265. f"
  266. a,b,"c ""
  267. c",d,e,f`,
  268. expectedText: `a,,c,d,,f
  269. a,bb,c,d,ee ,
  270. a,b,,d,e,f`,
  271. },
  272. // case 4 - csv with pipes and quotes
  273. {
  274. text: `Col1 | Col2 | Col3
  275. abc | "Hello
  276. World"|123
  277. "de
  278. f" | 4.56 | 789`,
  279. expectedText: `Col1 | Col2 | Col3
  280. abc | |123
  281. | 4.56 | 789`,
  282. },
  283. }
  284. for n, c := range cases {
  // Only the input goes through decodeSlashes; expectedText is compared as written.
  285. modifiedText := removeQuotedString(decodeSlashes(t, c.text))
  286. assert.EqualValues(t, c.expectedText, modifiedText, "case %d: modified text should be equal", n)
  287. }
  288. }
  // TestGuessDelimiter checks the delimiter that guessDelimiter returns for CSV
  // content alone (no file name is involved). The cases expect comma when the
  // content gives no usable hint.
  // NOTE(review): several samples below are labelled tab delimited and expect
  // '\t'; their column gaps are presumably tab characters - confirm they are not
  // stored as spaces.
  289. func TestGuessDelimiter(t *testing.T) {
  290. cases := []struct {
  291. csv string
  292. expectedDelimiter rune
  293. }{
  294. // case 0 - single cell, comma delimited
  295. {
  296. csv: "a",
  297. expectedDelimiter: ',',
  298. },
  299. // case 1 - two cells, comma delimited
  300. {
  301. csv: "1,2",
  302. expectedDelimiter: ',',
  303. },
  304. // case 2 - semicolon delimited
  305. {
  306. csv: "1;2",
  307. expectedDelimiter: ';',
  308. },
  309. // case 3 - tab delimited
  310. {
  311. csv: "1\t2",
  312. expectedDelimiter: '\t',
  313. },
  314. // case 4 - pipe delimited
  315. {
  316. csv: "1|2",
  317. expectedDelimiter: '|',
  318. },
  319. // case 5 - semicolon delimited with commas in text
  320. {
  321. csv: `1,2,3;4,5,6;7,8,9
  322. a;b;c`,
  323. expectedDelimiter: ';',
  324. },
  325. // case 6 - semicolon delimited with commas in quoted text
  326. {
  327. csv: `"1,2,3,4";"a
  328. b"
  329. c;d`,
  330. expectedDelimiter: ';',
  331. },
  332. // case 7 - HTML
  333. {
  334. csv: "<br/>",
  335. expectedDelimiter: ',',
  336. },
  337. // case 8 - tab delimited with commas in value
  338. {
  339. csv: `name email note
  340. John Doe john@doe.com This,note,had,a,lot,of,commas,to,test,delimiters`,
  341. expectedDelimiter: '\t',
  342. },
  343. // case 9 - tab delimited with new lines in values, commas in values
  344. {
  345. csv: `1 "some,""more
  346. ""
  347. quoted,
  348. text," a
  349. 2 "some,
  350. quoted,\t
  351. text," b
  352. 3 "some,
  353. quoted,
  354. text" c
  355. 4 "some,
  356. quoted,
  357. text," d`,
  358. expectedDelimiter: '\t',
  359. },
  360. // case 10 - semicolon delimited with quotes and semicolon in value
  361. {
  362. csv: `col1;col2
  363. "this has a literal "" in the text";"and an ; in the text"`,
  364. expectedDelimiter: ';',
  365. },
  366. // case 11 - pipe delimited with quotes
  367. {
  368. csv: `Col1 | Col2 | Col3
  369. abc | "Hello
  370. World"|123
  371. "de
  372. |
  373. f" | 4.56 | 789`,
  374. expectedDelimiter: '|',
  375. },
  376. // case 12 - a tab delimited 6 column CSV, but the values are not quoted and have lots of commas.
  377. // In the previous bestScore algorithm, this would have picked comma as the delimiter, but now it should guess tab
  378. {
  379. csv: `c1 c2 c3 c4 c5 c6
  380. v,k,x,v ym,f,oa,qn,uqijh,n,s,wvygpo uj,kt,j,w,i,fvv,tm,f,ddt,b,mwt,e,t,teq,rd,p,a e,wfuae,t,h,q,im,ix,y h,mrlu,l,dz,ff,zi,af,emh ,gov,bmfelvb,axp,f,u,i,cni,x,z,v,sh,w,jo,,m,h
  381. k,ohf,pgr,tde,m,s te,ek,,v,,ic,kqc,dv,w,oi,j,w,gojjr,ug,,l,j,zl g,qziq,bcajx,zfow,ka,j,re,ohbc k,nzm,qm,ts,auf th,elb,lx,l,q,e,qf asbr,z,k,y,tltobga
  382. g,m,bu,el h,l,jwi,o,wge,fy,rure,c,g,lcxu,fxte,uns,cl,s,o,t,h,rsoy,f bq,s,uov,z,ikkhgyg,,sabs,c,hzue mc,b,,j,t,n sp,mn,,m,t,dysi,eq,pigb,rfa,z w,rfli,sg,,o,wjjjf,f,wxdzfk,x,t,p,zy,p,mg,r,l,h
  383. e,ewbkc,nugd,jj,sf,ih,i,n,jo,b,poem,kw,q,i,x,t,e,uug,k j,xm,sch,ux,h,,fb,f,pq,,mh,,f,v,,oba,w,h,v,eiz,yzd,o,a,c,e,dhp,q a,pbef,epc,k,rdpuw,cw k,j,e,d xf,dz,sviv,w,sqnzew,t,b v,yg,f,cq,ti,g,m,ta,hm,ym,ii,hxy,p,z,r,e,ga,sfs,r,p,l,aar,w,kox,j
  384. l,d,v,pp,q,j,bxip,w,i,im,qa,o e,o h,w,a,a,qzj,nt,qfn,ut,fvhu,ts hu,q,g,p,q,ofpje,fsqa,frp,p,vih,j,w,k,jx, ln,th,ka,l,b,vgk,rv,hkx rj,v,y,cwm,rao,e,l,wvr,ptc,lm,yg,u,k,i,b,zk,b,gv,fls
  385. velxtnhlyuysbnlchosqlhkozkdapjaueexjwrndwb nglvnv kqiv pbshwlmcexdzipopxjyrxhvjalwp pydvipwlkkpdvbtepahskwuornbsb qwbacgq
  386. l,y,u,bf,y,m,eals,n,cop,h,g,vs,jga,opt x,b,zwmn,hh,b,n,pdj,t,d px yn,vtd,u,y,b,ps,yo,qqnem,mxg,m,al,rd,c,k,d,q,f ilxdxa,m,y,,p,p,y,prgmg,q,n,etj,k,ns b,pl,z,jq,hk
  387. p,gc jn,mzr,bw sb,e,r,dy,ur,wzy,r,c,n,yglr,jbdu,r,pqk,k q,d,,,p,l,euhl,dc,rwh,t,tq,z,h,p,s,t,x,fugr,h wi,zxb,jcig,o,t,k mfh,ym,h,e,p,cnvx,uv,zx,x,pq,blt,v,r,u,tr,g,g,xt
  388. nri,p,,t,if,,y,ptlqq a,i w,ovli,um,w,f,re,k,sb,w,jy,zf i,g,p,q,mii,nr,jm,cc i,szl,k,eg,l,d ,ah,w,b,vh
  389. ,,sh,wx,mn,xm,u,d,yy,u,t,m,j,s,b ogadq,g,y,y,i,h,ln,jda,g,cz,s,rv,r,s,s,le,r, y,nu,f,nagj o,h,,adfy,o,nf,ns,gvsvnub,k,b,xyz v,h,g,ef,y,gb c,x,cw,x,go,h,t,x,cu,u,qgrqzrcmn,kq,cd,g,rejp,zcq
  390. skxg,t,vay,d,wug,d,xg,sexc rt g,ag,mjq,fjnyji,iwa,m,ml,b,ua,b,qjxeoc be,s,sh,n,jbzxs,g,n,i,h,y,r,be,mfo,u,p cw,r,,u,zn,eg,r,yac,m,l,edkr,ha,x,g,b,c,tg,c j,ye,u,ejd,maj,ea,bm,u,iy`,
  391. expectedDelimiter: '\t',
  392. },
  393. // case 13 - a CSV with more than 10 lines and since we only use the first 10 lines, it should still get the delimiter as semicolon
  394. {
  395. csv: `col1;col2;col3
  396. 1;1;1
  397. 2;2;2
  398. 3;3;3
  399. 4;4;4
  400. 5;5;5
  401. 6;6;6
  402. 7;7;7
  403. 8;8;8
  404. 9;9;9
  405. 10;10;10
  406. 11 11 11
  407. 12|12|12`,
  408. expectedDelimiter: ';',
  409. },
  410. // case 14 - a really long single line (over 10k) that will get truncated, but since it has commas and semicolons (but more semicolons) it will pick semicolon
  411. {
  412. csv: strings.Repeat("a;b,c;", 1700),
  413. expectedDelimiter: ';',
  414. },
  415. // case 15 - 2 lines that are well over 10k, but since the 2nd line is where this CSV will be truncated (10k sample), it will only use the first line, so '@' will be picked
  416. {
  417. csv: "col1@col2@col3\na@b@" + strings.Repeat("c", 6000) + "\nd,e," + strings.Repeat("f", 4000),
  418. expectedDelimiter: '@',
  419. },
  420. // case 16 - has all delimiters so should return comma
  421. {
  422. csv: `col1,col2;col3@col4|col5 col6
  423. a b|c@d;e,f`,
  424. expectedDelimiter: ',',
  425. },
  426. // case 17 - nothing works (bad csv) so returns comma by default
  427. {
  428. csv: `col1,col2
  429. a;b
  430. c@e
  431. f g
  432. h|i
  433. jkl`,
  434. expectedDelimiter: ',',
  435. },
  436. }
  437. for n, c := range cases {
  // Escapes such as \t written inside the raw strings are expanded by
  // decodeSlashes before the sample reaches guessDelimiter.
  438. delimiter := guessDelimiter([]byte(decodeSlashes(t, c.csv)))
  439. assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
  440. }
  441. }
  // TestGuessFromBeforeAfterQuotes checks the delimiter that
  // guessFromBeforeAfterQuotes returns for samples containing double-quoted
  // values. The cases expect 0 when no delimiter can be found.
  // NOTE(review): case 0 expects '\t'; the column gaps in that sample are
  // presumably tab characters - confirm they are not stored as spaces.
  442. func TestGuessFromBeforeAfterQuotes(t *testing.T) {
  443. cases := []struct {
  444. csv string
  445. expectedDelimiter rune
  446. }{
  447. // case 0 - tab delimited with new lines in values, commas in values
  448. {
  449. csv: `1 "some,""more
  450. ""
  451. quoted,
  452. text," a
  453. 2 "some,
  454. quoted,\t
  455. text," b
  456. 3 "some,
  457. quoted,
  458. text" c
  459. 4 "some,
  460. quoted,
  461. text," d`,
  462. expectedDelimiter: '\t',
  463. },
  464. // case 1 - semicolon delimited with quotes and semicolon in value
  465. {
  466. csv: `col1;col2
  467. "this has a literal "" in the text";"and an ; in the text"`,
  468. expectedDelimiter: ';',
  469. },
  470. // case 2 - pipe delimited with quotes
  471. {
  472. csv: `Col1 | Col2 | Col3
  473. abc | "Hello
  474. World"|123
  475. "de
  476. |
  477. f" | 4.56 | 789`,
  478. expectedDelimiter: '|',
  479. },
  480. // case 3 - a complicated quoted CSV that is semicolon delimited
  481. {
  482. csv: `he; she
  483. "he said, ""hey!"""; "she said, ""hey back!"""
  484. but; "be"`,
  485. expectedDelimiter: ';',
  486. },
  487. // case 4 - no delimiter should be found
  488. {
  489. csv: `a,b`,
  490. expectedDelimiter: 0,
  491. },
  492. // case 5 - no delimiter should be found
  493. {
  494. csv: `col1
  495. "he said, ""here I am"""`,
  496. expectedDelimiter: 0,
  497. },
  498. // case 6 - delimiter before double quoted string with space
  499. {
  500. csv: `col1|col2
  501. a| "he said, ""here I am"""`,
  502. expectedDelimiter: '|',
  503. },
  504. // case 7 - delimiter before double quoted string without space
  505. {
  506. csv: `col1|col2
  507. a|"he said, ""here I am"""`,
  508. expectedDelimiter: '|',
  509. },
  510. // case 8 - delimiter after double quoted string with space
  511. {
  512. csv: `col1, col2
  513. "abc\n
  514. ", def`,
  515. expectedDelimiter: ',',
  516. },
  517. // case 9 - delimiter after double quoted string without space
  518. {
  519. csv: `col1,col2
  520. "abc\n
  521. ",def`,
  522. expectedDelimiter: ',',
  523. },
  524. }
  525. for n, c := range cases {
  // Escapes such as \t and \n written inside the raw strings are expanded by
  // decodeSlashes before the sample reaches guessFromBeforeAfterQuotes.
  526. delimiter := guessFromBeforeAfterQuotes([]byte(decodeSlashes(t, c.csv)))
  527. assert.EqualValues(t, c.expectedDelimiter, delimiter, "case %d: delimiter should be equal, expected '%c' got '%c'", n, c.expectedDelimiter, delimiter)
  528. }
  529. }
  530. func TestFormatError(t *testing.T) {
  531. cases := []struct {
  532. err error
  533. expectedMessage string
  534. expectsError bool
  535. }{
  536. {
  537. err: &csv.ParseError{
  538. Err: csv.ErrFieldCount,
  539. },
  540. expectedMessage: "repo.error.csv.invalid_field_count",
  541. expectsError: false,
  542. },
  543. {
  544. err: &csv.ParseError{
  545. Err: csv.ErrBareQuote,
  546. },
  547. expectedMessage: "repo.error.csv.unexpected",
  548. expectsError: false,
  549. },
  550. {
  551. err: bytes.ErrTooLarge,
  552. expectsError: true,
  553. },
  554. }
  555. for n, c := range cases {
  556. message, err := FormatError(c.err, &translation.MockLocale{})
  557. if c.expectsError {
  558. assert.Error(t, err, "case %d: expected an error to be returned", n)
  559. } else {
  560. assert.NoError(t, err, "case %d: no error was expected, got error: %v", n, err)
  561. assert.EqualValues(t, c.expectedMessage, message, "case %d: messages should be equal, expected '%s' got '%s'", n, c.expectedMessage, message)
  562. }
  563. }
  564. }