aboutsummaryrefslogtreecommitdiffstats
path: root/services/gitdiff/csv.go
blob: 5781d7e9094aece34a221aca277d28e4d5953c23 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package gitdiff

import (
	"encoding/csv"
	"errors"
	"io"

	"code.gitea.io/gitea/modules/util"
)

const (
	unmappedColumn           = -1
	maxRowsToInspect int     = 10
	minRatioToMatch  float32 = 0.8
)

// TableDiffCellType represents the type of a TableDiffCell.
type TableDiffCellType uint8

// TableDiffCellType possible values.
const (
	TableDiffCellUnchanged TableDiffCellType = iota + 1
	TableDiffCellChanged
	TableDiffCellAdd
	TableDiffCellDel
	TableDiffCellMovedUnchanged
	TableDiffCellMovedChanged
)

// TableDiffCell represents a cell of a TableDiffRow
type TableDiffCell struct {
	LeftCell  string
	RightCell string
	Type      TableDiffCellType
}

// TableDiffRow represents a row of a TableDiffSection.
type TableDiffRow struct {
	RowIdx int
	Cells  []*TableDiffCell
}

// TableDiffSection represents a section of a DiffFile.
type TableDiffSection struct {
	Rows []*TableDiffRow
}

// csvReader wraps a csv.Reader which buffers the first rows.
type csvReader struct {
	reader *csv.Reader
	buffer [][]string
	line   int
	eof    bool
}

// ErrorUndefinedCell is for when a row, column coordinates do not exist in the CSV
var ErrorUndefinedCell = errors.New("undefined cell")

// createCsvReader creates a csvReader and fills the buffer
func createCsvReader(reader *csv.Reader, bufferRowCount int) (*csvReader, error) {
	csv := &csvReader{reader: reader}
	csv.buffer = make([][]string, bufferRowCount)
	for i := 0; i < bufferRowCount && !csv.eof; i++ {
		row, err := csv.readNextRow()
		if err != nil {
			return nil, err
		}
		csv.buffer[i] = row
	}
	csv.line = bufferRowCount
	return csv, nil
}

// GetRow gets a row from the buffer if present or advances the reader to the requested row. On the end of the file only nil gets returned.
func (csv *csvReader) GetRow(row int) ([]string, error) {
	if row < len(csv.buffer) && row >= 0 {
		return csv.buffer[row], nil
	}
	if csv.eof {
		return nil, nil
	}
	for {
		fields, err := csv.readNextRow()
		if err != nil {
			return nil, err
		}
		if csv.eof {
			return nil, nil
		}
		csv.line++
		if csv.line-1 == row {
			return fields, nil
		}
	}
}

func (csv *csvReader) readNextRow() ([]string, error) {
	if csv.eof {
		return nil, nil
	}
	row, err := csv.reader.Read()
	if err != nil {
		if err != io.EOF {
			return nil, err
		}
		csv.eof = true
	}
	return row, nil
}

// CreateCsvDiff creates a tabular diff based on two CSV readers.
func CreateCsvDiff(diffFile *DiffFile, baseReader, headReader *csv.Reader) ([]*TableDiffSection, error) {
	if baseReader != nil && headReader != nil {
		return createCsvDiff(diffFile, baseReader, headReader)
	}

	if baseReader != nil {
		return createCsvDiffSingle(baseReader, TableDiffCellDel)
	}
	return createCsvDiffSingle(headReader, TableDiffCellAdd)
}

// createCsvDiffSingle creates a tabular diff based on a single CSV reader. All cells are added or deleted.
func createCsvDiffSingle(reader *csv.Reader, celltype TableDiffCellType) ([]*TableDiffSection, error) {
	var rows []*TableDiffRow
	i := 1
	for {
		row, err := reader.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}
		cells := make([]*TableDiffCell, len(row))
		for j := 0; j < len(row); j++ {
			if celltype == TableDiffCellDel {
				cells[j] = &TableDiffCell{LeftCell: row[j], Type: celltype}
			} else {
				cells[j] = &TableDiffCell{RightCell: row[j], Type: celltype}
			}
		}
		rows = append(rows, &TableDiffRow{RowIdx: i, Cells: cells})
		i++
	}

	return []*TableDiffSection{{Rows: rows}}, nil
}

func createCsvDiff(diffFile *DiffFile, baseReader, headReader *csv.Reader) ([]*TableDiffSection, error) {
	// Given the baseReader and headReader, we are going to create CSV Reader for each, baseCSVReader and b respectively
	baseCSVReader, err := createCsvReader(baseReader, maxRowsToInspect)
	if err != nil {
		return nil, err
	}
	headCSVReader, err := createCsvReader(headReader, maxRowsToInspect)
	if err != nil {
		return nil, err
	}

	// Initializing the mappings of base to head (a2bColMap) and head to base (b2aColMap) columns
	a2bColMap, b2aColMap := getColumnMapping(baseCSVReader, headCSVReader)

	// Determines how many cols there will be in the diff table, which includes deleted columns from base and added columns to base
	numDiffTableCols := len(a2bColMap) + countUnmappedColumns(b2aColMap)
	if len(a2bColMap) < len(b2aColMap) {
		numDiffTableCols = len(b2aColMap) + countUnmappedColumns(a2bColMap)
	}

	// createDiffTableRow takes the row # of the `a` line and `b` line of a diff (starting from 1), 0 if the line doesn't exist (undefined)
	// in the base or head respectively.
	// Returns a TableDiffRow which has the row index
	createDiffTableRow := func(aLineNum, bLineNum int) (*TableDiffRow, error) {
		// diffTableCells is a row of the diff table. It will have a cells for added, deleted, changed, and unchanged content, thus either
		// the same size as the head table or bigger
		diffTableCells := make([]*TableDiffCell, numDiffTableCols)
		var bRow *[]string
		if bLineNum > 0 {
			row, err := headCSVReader.GetRow(bLineNum - 1)
			if err != nil {
				return nil, err
			}
			bRow = &row
		}
		var aRow *[]string
		if aLineNum > 0 {
			row, err := baseCSVReader.GetRow(aLineNum - 1)
			if err != nil {
				return nil, err
			}
			aRow = &row
		}
		if aRow == nil && bRow == nil {
			// No content
			return nil, nil
		}

		aIndex := 0      // tracks where we are in the a2bColMap
		bIndex := 0      // tracks where we are in the b2aColMap
		colsAdded := 0   // incremented whenever we found a column was added
		colsDeleted := 0 // incrememted whenever a column was deleted

		// We loop until both the aIndex and bIndex are greater than their col map, which then we are done
		for aIndex < len(a2bColMap) || bIndex < len(b2aColMap) {
			// Starting from where aIndex is currently pointing, we see if the map is -1 (dleeted) and if is, create column to note that, increment, and look at the next aIndex
			for aIndex < len(a2bColMap) && a2bColMap[aIndex] == -1 && (bIndex >= len(b2aColMap) || aIndex <= bIndex) {
				var aCell string
				if aRow != nil {
					if cell, err := getCell(*aRow, aIndex); err != nil {
						if err != ErrorUndefinedCell {
							return nil, err
						}
					} else {
						aCell = cell
					}
				}
				diffTableCells[bIndex+colsDeleted] = &TableDiffCell{LeftCell: aCell, Type: TableDiffCellDel}
				aIndex++
				colsDeleted++
			}

			// aIndex is now pointing to a column that also exists in b, or is at the end of a2bColMap. If the former,
			// we can just increment aIndex until it points to a -1 column or one greater than the current bIndex
			for aIndex < len(a2bColMap) && a2bColMap[aIndex] != -1 {
				aIndex++
			}

			// Starting from where bIndex is currently pointing, we see if the map is -1 (added) and if is, create column to note that, increment, and look at the next aIndex
			for bIndex < len(b2aColMap) && b2aColMap[bIndex] == -1 && (aIndex >= len(a2bColMap) || bIndex < aIndex) {
				var bCell string
				cellType := TableDiffCellAdd
				if bRow != nil {
					if cell, err := getCell(*bRow, bIndex); err != nil {
						if err != ErrorUndefinedCell {
							return nil, err
						}
					} else {
						bCell = cell
					}
				} else {
					cellType = TableDiffCellDel
				}
				diffTableCells[bIndex+colsDeleted] = &TableDiffCell{RightCell: bCell, Type: cellType}
				bIndex++
				colsAdded++
			}

			// aIndex is now pointing to a column that also exists in a, or is at the end of b2aColMap. If the former,
			// we get the a col and b col values (if they exist), figure out if they are the same or not, and if the column moved, and add it to the diff table
			for bIndex < len(b2aColMap) && b2aColMap[bIndex] != -1 && (aIndex >= len(a2bColMap) || bIndex < aIndex) {
				var diffTableCell TableDiffCell

				var aCell *string
				// get the aCell value if the aRow exists
				if aRow != nil {
					if cell, err := getCell(*aRow, b2aColMap[bIndex]); err != nil {
						if err != ErrorUndefinedCell {
							return nil, err
						}
					} else {
						aCell = &cell
						diffTableCell.LeftCell = cell
					}
				} else {
					diffTableCell.Type = TableDiffCellAdd
				}

				var bCell *string
				// get the bCell value if the bRow exists
				if bRow != nil {
					if cell, err := getCell(*bRow, bIndex); err != nil {
						if err != ErrorUndefinedCell {
							return nil, err
						}
					} else {
						bCell = &cell
						diffTableCell.RightCell = cell
					}
				} else {
					diffTableCell.Type = TableDiffCellDel
				}

				// if both a and b have a row that exists, compare the value and determine if the row has moved
				if aCell != nil && bCell != nil {
					moved := ((bIndex + colsDeleted) != (b2aColMap[bIndex] + colsAdded))
					if *aCell != *bCell {
						if moved {
							diffTableCell.Type = TableDiffCellMovedChanged
						} else {
							diffTableCell.Type = TableDiffCellChanged
						}
					} else {
						if moved {
							diffTableCell.Type = TableDiffCellMovedUnchanged
						} else {
							diffTableCell.Type = TableDiffCellUnchanged
						}
						diffTableCell.LeftCell = ""
					}
				}

				// Add the diff column to the diff row
				diffTableCells[bIndex+colsDeleted] = &diffTableCell
				bIndex++
			}
		}

		return &TableDiffRow{RowIdx: bLineNum, Cells: diffTableCells}, nil
	}

	// diffTableSections are TableDiffSections which represent the diffTableSections we get when doing a diff, each will be its own table in the view
	var diffTableSections []*TableDiffSection

	for i, section := range diffFile.Sections {
		// Each section has multiple diffTableRows
		var diffTableRows []*TableDiffRow
		lines := tryMergeLines(section.Lines)
		// Loop through the merged lines to get each row of the CSV diff table for this section
		for j, line := range lines {
			if i == 0 && j == 0 && (line[0] != 1 || line[1] != 1) {
				diffTableRow, err := createDiffTableRow(1, 1)
				if err != nil {
					return nil, err
				}
				if diffTableRow != nil {
					diffTableRows = append(diffTableRows, diffTableRow)
				}
			}
			diffTableRow, err := createDiffTableRow(line[0], line[1])
			if err != nil {
				return nil, err
			}
			if diffTableRow != nil {
				diffTableRows = append(diffTableRows, diffTableRow)
			}
		}

		if len(diffTableRows) > 0 {
			diffTableSections = append(diffTableSections, &TableDiffSection{Rows: diffTableRows})
		}
	}

	return diffTableSections, nil
}

// getColumnMapping creates a mapping of columns between a and b
func getColumnMapping(baseCSVReader, headCSVReader *csvReader) ([]int, []int) {
	baseRow, _ := baseCSVReader.GetRow(0)
	headRow, _ := headCSVReader.GetRow(0)

	base2HeadColMap := []int{}
	head2BaseColMap := []int{}

	if baseRow != nil {
		base2HeadColMap = make([]int, len(baseRow))
	}
	if headRow != nil {
		head2BaseColMap = make([]int, len(headRow))
	}

	// Initializes all head2base mappings to be unmappedColumn (-1)
	for i := 0; i < len(head2BaseColMap); i++ {
		head2BaseColMap[i] = unmappedColumn
	}

	// Loops through the baseRow and see if there is a match in the head row
	for i := 0; i < len(baseRow); i++ {
		base2HeadColMap[i] = unmappedColumn
		baseCell, err := getCell(baseRow, i)
		if err == nil {
			for j := 0; j < len(headRow); j++ {
				if head2BaseColMap[j] == -1 {
					headCell, err := getCell(headRow, j)
					if err == nil && baseCell == headCell {
						base2HeadColMap[i] = j
						head2BaseColMap[j] = i
						break
					}
				}
			}
		}
	}

	tryMapColumnsByContent(baseCSVReader, base2HeadColMap, headCSVReader, head2BaseColMap)
	tryMapColumnsByContent(headCSVReader, head2BaseColMap, baseCSVReader, base2HeadColMap)

	return base2HeadColMap, head2BaseColMap
}

// tryMapColumnsByContent tries to map missing columns by the content of the first lines.
func tryMapColumnsByContent(baseCSVReader *csvReader, base2HeadColMap []int, headCSVReader *csvReader, head2BaseColMap []int) {
	for i := 0; i < len(base2HeadColMap); i++ {
		headStart := 0
		for base2HeadColMap[i] == unmappedColumn && headStart < len(head2BaseColMap) {
			if head2BaseColMap[headStart] == unmappedColumn {
				rows := util.Min(maxRowsToInspect, util.Max(0, util.Min(len(baseCSVReader.buffer), len(headCSVReader.buffer))-1))
				same := 0
				for j := 1; j <= rows; j++ {
					baseCell, baseErr := getCell(baseCSVReader.buffer[j], i)
					headCell, headErr := getCell(headCSVReader.buffer[j], headStart)
					if baseErr == nil && headErr == nil && baseCell == headCell {
						same++
					}
				}
				if (float32(same) / float32(rows)) > minRatioToMatch {
					base2HeadColMap[i] = headStart
					head2BaseColMap[headStart] = i
				}
			}
			headStart++
		}
	}
}

// getCell returns the specific cell or nil if not present.
func getCell(row []string, column int) (string, error) {
	if column < len(row) {
		return row[column], nil
	}
	return "", ErrorUndefinedCell
}

// countUnmappedColumns returns the count of unmapped columns.
func countUnmappedColumns(mapping []int) int {
	count := 0
	for i := 0; i < len(mapping); i++ {
		if mapping[i] == unmappedColumn {
			count++
		}
	}
	return count
}

// tryMergeLines maps the separated line numbers of a git diff. The result is assumed to be ordered.
func tryMergeLines(lines []*DiffLine) [][2]int {
	ids := make([][2]int, len(lines))

	i := 0
	for _, line := range lines {
		if line.Type != DiffLineSection {
			ids[i][0] = line.LeftIdx
			ids[i][1] = line.RightIdx
			i++
		}
	}

	ids = ids[:i]

	result := make([][2]int, len(ids))

	j := 0
	for i = 0; i < len(ids); i++ {
		if ids[i][0] == 0 {
			if j > 0 && result[j-1][1] == 0 {
				temp := j
				for temp > 0 && result[temp-1][1] == 0 {
					temp--
				}
				result[temp][1] = ids[i][1]
				continue
			}
		}
		result[j] = ids[i]
		j++
	}

	return result[:j]
}