diff options
author | Filip Navara <filip.navara@gmail.com> | 2019-04-19 14:17:27 +0200 |
---|---|---|
committer | Lunny Xiao <xiaolunwen@gmail.com> | 2019-04-19 20:17:27 +0800 |
commit | 2af67f6044af1cad7136ce8c123e37ab090ca9bc (patch) | |
tree | 6eaa623db6a0665498d7f05c8bb1a4b4d7b141c7 /modules/git/commit_info.go | |
parent | 19ec2606e91610421a3e9cd87c94748ef07ca468 (diff) | |
download | gitea-2af67f6044af1cad7136ce8c123e37ab090ca9bc.tar.gz gitea-2af67f6044af1cad7136ce8c123e37ab090ca9bc.zip |
Improve listing performance by using go-git (#6478)
* Use go-git for tree reading and commit info lookup.
Signed-off-by: Filip Navara <navara@emclient.com>
* Use TreeEntry.IsRegular() instead of ObjectType that was removed.
Signed-off-by: Filip Navara <navara@emclient.com>
* Use the treePath to optimize commit info search.
Signed-off-by: Filip Navara <navara@emclient.com>
* Extract the latest commit at treePath along with the other commits.
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix listing commit info for a directory that was created in one commit and never modified after.
Signed-off-by: Filip Navara <navara@emclient.com>
* Avoid nearly all external 'git' invocations when doing directory listing (.editorconfig code path is still hit).
Signed-off-by: Filip Navara <navara@emclient.com>
* Use go-git for reading blobs.
Signed-off-by: Filip Navara <navara@emclient.com>
* Make SHA1 type alias for plumbing.Hash in go-git.
Signed-off-by: Filip Navara <navara@emclient.com>
* Make Signature type alias for object.Signature in go-git.
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix GetCommitsInfo for repository with only one commit.
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix PGP signature verification.
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix issues with walking commit graph across merges.
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix typo in condition.
Signed-off-by: Filip Navara <navara@emclient.com>
* Speed up loading branch list by keeping the repository reference (and thus all the loaded packfile indexes).
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix lising submodules.
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix build
Signed-off-by: Filip Navara <navara@emclient.com>
* Add back commit cache because of name-rev
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix tests
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix code style
* Fix spelling
* Address PR feedback
Signed-off-by: Filip Navara <navara@emclient.com>
* Update vendor module list
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix getting trees by commit id
Signed-off-by: Filip Navara <navara@emclient.com>
* Fix remaining unit test failures
* Fix GetTreeBySHA
* Avoid running `git name-rev` if not necessary
Signed-off-by: Filip Navara <navara@emclient.com>
* Move Branch code to git module
* Clean up GPG signature verification and fix it for tagged commits
* Address PR feedback (import formatting, copyright headers)
* Make blob lookup by SHA working
* Update tests to use public API
* Allow getting content from any type of object through the blob interface
* Change test to actually expect the object content that is in the GIT repository
* Change one more test to actually expect the object content that is in the GIT repository
* Add comments
Diffstat (limited to 'modules/git/commit_info.go')
-rw-r--r-- | modules/git/commit_info.go | 454 |
1 files changed, 183 insertions, 271 deletions
diff --git a/modules/git/commit_info.go b/modules/git/commit_info.go index 971082be1f..02c6f710ad 100644 --- a/modules/git/commit_info.go +++ b/modules/git/commit_info.go @@ -5,325 +5,237 @@ package git import ( - "bufio" - "context" - "fmt" - "os/exec" - "path" - "runtime" - "strconv" - "strings" - "sync" - "time" + "github.com/emirpasic/gods/trees/binaryheap" + "gopkg.in/src-d/go-git.v4/plumbing" + "gopkg.in/src-d/go-git.v4/plumbing/object" ) -const ( - // parameters for searching for commit infos. If the untargeted search has - // not found any entries in the past 5 commits, and 12 or fewer entries - // remain, then we'll just let the targeted-searching threads finish off, - // and stop the untargeted search to not interfere. - deferToTargetedSearchColdStreak = 5 - deferToTargetedSearchNumRemainingEntries = 12 -) +// GetCommitsInfo gets information of all commits that are corresponding to these entries +func (tes Entries) GetCommitsInfo(commit *Commit, treePath string, cache LastCommitCache) ([][]interface{}, *Commit, error) { + entryPaths := make([]string, len(tes)+1) + // Get the commit for the treePath itself + entryPaths[0] = "" + for i, entry := range tes { + entryPaths[i+1] = entry.Name() + } -// getCommitsInfoState shared state while getting commit info for entries -type getCommitsInfoState struct { - lock sync.Mutex - /* read-only fields, can be read without the mutex */ - // entries and entryPaths are read-only after initialization, so they can - // safely be read without the mutex - entries []*TreeEntry - // set of filepaths to get info for - entryPaths map[string]struct{} - treePath string - headCommit *Commit + c, err := commit.repo.gogitRepo.CommitObject(plumbing.Hash(commit.ID)) + if err != nil { + return nil, nil, err + } - /* mutable fields, must hold mutex to read or write */ - // map from filepath to commit - commits map[string]*Commit - // set of filepaths that have been or are being searched for in a target search - targetedPaths map[string]struct{} -} + revs, err := getLastCommitForPaths(c, treePath, entryPaths) + if err != nil { + return nil, nil, err + } -func (state *getCommitsInfoState) numRemainingEntries() int { - state.lock.Lock() - defer state.lock.Unlock() - return len(state.entries) - len(state.commits) -} + commit.repo.gogitStorage.Close() -// getTargetEntryPath Returns the next path for a targeted-searching thread to -// search for, or returns the empty string if nothing left to search for -func (state *getCommitsInfoState) getTargetedEntryPath() string { - var targetedEntryPath string - state.lock.Lock() - defer state.lock.Unlock() - for _, entry := range state.entries { - entryPath := path.Join(state.treePath, entry.Name()) - if _, ok := state.commits[entryPath]; ok { - continue - } else if _, ok = state.targetedPaths[entryPath]; ok { - continue + commitsInfo := make([][]interface{}, len(tes)) + for i, entry := range tes { + if rev, ok := revs[entry.Name()]; ok { + entryCommit := convertCommit(rev) + if entry.IsSubModule() { + subModuleURL := "" + if subModule, err := commit.GetSubModule(entry.Name()); err != nil { + return nil, nil, err + } else if subModule != nil { + subModuleURL = subModule.URL + } + subModuleFile := NewSubModuleFile(entryCommit, subModuleURL, entry.ID.String()) + commitsInfo[i] = []interface{}{entry, subModuleFile} + } else { + commitsInfo[i] = []interface{}{entry, entryCommit} + } + } else { + commitsInfo[i] = []interface{}{entry, nil} } - targetedEntryPath = entryPath - state.targetedPaths[entryPath] = struct{}{} - break } - return targetedEntryPath -} -// repeatedly perform targeted searches for unpopulated entries -func targetedSearch(state *getCommitsInfoState, done chan error, cache LastCommitCache) { - for { - entryPath := state.getTargetedEntryPath() - if len(entryPath) == 0 { - done <- nil - return - } - if cache != nil { - commit, err := cache.Get(state.headCommit.repo.Path, state.headCommit.ID.String(), entryPath) - if err == nil && commit != nil { - state.update(entryPath, commit) - continue - } - } - command := NewCommand("rev-list", "-1", state.headCommit.ID.String(), "--", entryPath) - output, err := command.RunInDir(state.headCommit.repo.Path) - if err != nil { - done <- err - return - } - id, err := NewIDFromString(strings.TrimSpace(output)) - if err != nil { - done <- err - return - } - commit, err := state.headCommit.repo.getCommit(id) - if err != nil { - done <- err - return - } - state.update(entryPath, commit) - if cache != nil { - cache.Put(state.headCommit.repo.Path, state.headCommit.ID.String(), entryPath, commit) - } + // Retrieve the commit for the treePath itself (see above). We basically + // get it for free during the tree traversal and it's used for listing + // pages to display information about newest commit for a given path. + var treeCommit *Commit + if rev, ok := revs[""]; ok { + treeCommit = convertCommit(rev) } + return commitsInfo, treeCommit, nil } -func initGetCommitInfoState(entries Entries, headCommit *Commit, treePath string) *getCommitsInfoState { - entryPaths := make(map[string]struct{}, len(entries)) - for _, entry := range entries { - entryPaths[path.Join(treePath, entry.Name())] = struct{}{} - } - if treePath = path.Clean(treePath); treePath == "." { - treePath = "" - } - return &getCommitsInfoState{ - entries: entries, - entryPaths: entryPaths, - commits: make(map[string]*Commit, len(entries)), - targetedPaths: make(map[string]struct{}, len(entries)), - treePath: treePath, - headCommit: headCommit, - } +type commitAndPaths struct { + commit *object.Commit + // Paths that are still on the branch represented by commit + paths []string + // Set of hashes for the paths + hashes map[string]plumbing.Hash } -// GetCommitsInfo gets information of all commits that are corresponding to these entries -func (tes Entries) GetCommitsInfo(commit *Commit, treePath string, cache LastCommitCache) ([][]interface{}, error) { - state := initGetCommitInfoState(tes, commit, treePath) - if err := getCommitsInfo(state, cache); err != nil { +func getCommitTree(c *object.Commit, treePath string) (*object.Tree, error) { + tree, err := c.Tree() + if err != nil { return nil, err } - if len(state.commits) < len(state.entryPaths) { - return nil, fmt.Errorf("could not find commits for all entries") - } - - commitsInfo := make([][]interface{}, len(tes)) - for i, entry := range tes { - commit, ok := state.commits[path.Join(treePath, entry.Name())] - if !ok { - return nil, fmt.Errorf("could not find commit for %s", entry.Name()) - } - switch entry.Type { - case ObjectCommit: - subModuleURL := "" - if subModule, err := state.headCommit.GetSubModule(entry.Name()); err != nil { - return nil, err - } else if subModule != nil { - subModuleURL = subModule.URL - } - subModuleFile := NewSubModuleFile(commit, subModuleURL, entry.ID.String()) - commitsInfo[i] = []interface{}{entry, subModuleFile} - default: - commitsInfo[i] = []interface{}{entry, commit} - } - } - return commitsInfo, nil -} -func (state *getCommitsInfoState) cleanEntryPath(rawEntryPath string) (string, error) { - if rawEntryPath[0] == '"' { - var err error - rawEntryPath, err = strconv.Unquote(rawEntryPath) + // Optimize deep traversals by focusing only on the specific tree + if treePath != "" { + tree, err = tree.Tree(treePath) if err != nil { - return rawEntryPath, err + return nil, err } } - var entryNameStartIndex int - if len(state.treePath) > 0 { - entryNameStartIndex = len(state.treePath) + 1 - } - if index := strings.IndexByte(rawEntryPath[entryNameStartIndex:], '/'); index >= 0 { - return rawEntryPath[:entryNameStartIndex+index], nil - } - return rawEntryPath, nil + return tree, nil } -// update report that the given path was last modified by the given commit. -// Returns whether state.commits was updated -func (state *getCommitsInfoState) update(entryPath string, commit *Commit) bool { - if _, ok := state.entryPaths[entryPath]; !ok { - return false - } - - var updated bool - state.lock.Lock() - defer state.lock.Unlock() - if _, ok := state.commits[entryPath]; !ok { - state.commits[entryPath] = commit - updated = true +func getFullPath(treePath, path string) string { + if treePath != "" { + if path != "" { + return treePath + "/" + path + } + return treePath } - return updated + return path } -const getCommitsInfoPretty = "--pretty=format:%H %ct %s" - -func getCommitsInfo(state *getCommitsInfoState, cache LastCommitCache) error { - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - - args := []string{"log", state.headCommit.ID.String(), getCommitsInfoPretty, "--name-status", "-c"} - if len(state.treePath) > 0 { - args = append(args, "--", state.treePath) +func getFileHashes(c *object.Commit, treePath string, paths []string) (map[string]plumbing.Hash, error) { + tree, err := getCommitTree(c, treePath) + if err == object.ErrDirectoryNotFound { + // The whole tree didn't exist, so return empty map + return make(map[string]plumbing.Hash), nil } - cmd := exec.CommandContext(ctx, "git", args...) - cmd.Dir = state.headCommit.repo.Path - - readCloser, err := cmd.StdoutPipe() if err != nil { - return err + return nil, err } - if err := cmd.Start(); err != nil { - return err + hashes := make(map[string]plumbing.Hash) + for _, path := range paths { + if path != "" { + entry, err := tree.FindEntry(path) + if err == nil { + hashes[path] = entry.Hash + } + } else { + hashes[path] = tree.Hash + } } - // it's okay to ignore the error returned by cmd.Wait(); we expect the - // subprocess to sometimes have a non-zero exit status, since we may - // prematurely close stdout, resulting in a broken pipe. - defer cmd.Wait() - numThreads := runtime.NumCPU() - done := make(chan error, numThreads) - for i := 0; i < numThreads; i++ { - go targetedSearch(state, done, cache) - } + return hashes, nil +} - scanner := bufio.NewScanner(readCloser) - err = state.processGitLogOutput(scanner) +func getLastCommitForPaths(c *object.Commit, treePath string, paths []string) (map[string]*object.Commit, error) { + // We do a tree traversal with nodes sorted by commit time + seen := make(map[plumbing.Hash]bool) + heap := binaryheap.NewWith(func(a, b interface{}) int { + if a.(*commitAndPaths).commit.Committer.When.Before(b.(*commitAndPaths).commit.Committer.When) { + return 1 + } + return -1 + }) - // it is important that we close stdout here; if we do not close - // stdout, the subprocess will keep running, and the deffered call - // cmd.Wait() may block for a long time. - if closeErr := readCloser.Close(); closeErr != nil && err == nil { - err = closeErr + result := make(map[string]*object.Commit) + initialHashes, err := getFileHashes(c, treePath, paths) + if err != nil { + return nil, err } - for i := 0; i < numThreads; i++ { - doneErr := <-done - if doneErr != nil && err == nil { - err = doneErr + // Start search from the root commit and with full set of paths + heap.Push(&commitAndPaths{c, paths, initialHashes}) + + for { + cIn, ok := heap.Pop() + if !ok { + break } - } - return err -} + current := cIn.(*commitAndPaths) + currentID := current.commit.ID() -func (state *getCommitsInfoState) processGitLogOutput(scanner *bufio.Scanner) error { - // keep a local cache of seen paths to avoid acquiring a lock for paths - // we've already seen - seenPaths := make(map[string]struct{}, len(state.entryPaths)) - // number of consecutive commits without any finds - coldStreak := 0 - var commit *Commit - var err error - for scanner.Scan() { - line := scanner.Text() - if len(line) == 0 { // in-between commits - numRemainingEntries := state.numRemainingEntries() - if numRemainingEntries == 0 { - break - } - if coldStreak >= deferToTargetedSearchColdStreak && - numRemainingEntries <= deferToTargetedSearchNumRemainingEntries { - // stop this untargeted search, and let the targeted-search threads - // finish the work - break - } + if seen[currentID] { continue } - if line[0] >= 'A' && line[0] <= 'X' { // a file was changed by the current commit - // look for the last tab, since for copies (C) and renames (R) two - // filenames are printed: src, then dest - tabIndex := strings.LastIndexByte(line, '\t') - if tabIndex < 1 { - return fmt.Errorf("misformatted line: %s", line) + seen[currentID] = true + + // Load the parent commits for the one we are currently examining + numParents := current.commit.NumParents() + var parents []*object.Commit + for i := 0; i < numParents; i++ { + parent, err := current.commit.Parent(i) + if err != nil { + break } - entryPath, err := state.cleanEntryPath(line[tabIndex+1:]) + parents = append(parents, parent) + } + + // Examine the current commit and set of interesting paths + numOfParentsWithPath := make([]int, len(current.paths)) + pathChanged := make([]bool, len(current.paths)) + parentHashes := make([]map[string]plumbing.Hash, len(parents)) + for j, parent := range parents { + parentHashes[j], err = getFileHashes(parent, treePath, current.paths) if err != nil { - return err + break } - if _, ok := seenPaths[entryPath]; !ok { - if state.update(entryPath, commit) { - coldStreak = 0 + + for i, path := range current.paths { + if parentHashes[j][path] != plumbing.ZeroHash { + numOfParentsWithPath[i]++ + if parentHashes[j][path] != current.hashes[path] { + pathChanged[i] = true + } } - seenPaths[entryPath] = struct{}{} } - continue } - // a new commit - commit, err = parseCommitInfo(line) - if err != nil { - return err + var remainingPaths []string + for i, path := range current.paths { + switch numOfParentsWithPath[i] { + case 0: + // The path didn't exist in any parent, so it must have been created by + // this commit. The results could already contain some newer change from + // different path, so don't override that. + if result[path] == nil { + result[path] = current.commit + } + case 1: + // The file is present on exactly one parent, so check if it was changed + // and save the revision if it did. + if pathChanged[i] { + if result[path] == nil { + result[path] = current.commit + } + } else { + remainingPaths = append(remainingPaths, path) + } + default: + // The file is present on more than one of the parent paths, so this is + // a merge. We have to examine all the parent trees to find out where + // the change occurred. pathChanged[i] would tell us that the file was + // changed during the merge, but it wouldn't tell us the relevant commit + // that introduced it. + remainingPaths = append(remainingPaths, path) + } } - coldStreak++ - } - return scanner.Err() -} -// parseCommitInfo parse a commit from a line of `git log` output. Expects the -// line to be formatted according to getCommitsInfoPretty. -func parseCommitInfo(line string) (*Commit, error) { - if len(line) < 43 { - return nil, fmt.Errorf("invalid git output: %s", line) - } - ref, err := NewIDFromString(line[:40]) - if err != nil { - return nil, err - } - spaceIndex := strings.IndexByte(line[41:], ' ') - if spaceIndex < 0 { - return nil, fmt.Errorf("invalid git output: %s", line) - } - unixSeconds, err := strconv.Atoi(line[41 : 41+spaceIndex]) - if err != nil { - return nil, err + if len(remainingPaths) > 0 { + // Add the parent nodes along with remaining paths to the heap for further + // processing. + for j, parent := range parents { + if seen[parent.ID()] { + continue + } + + // Combine remainingPath with paths available on the parent branch + // and make union of them + var remainingPathsForParent []string + for _, path := range remainingPaths { + if parentHashes[j][path] != plumbing.ZeroHash { + remainingPathsForParent = append(remainingPathsForParent, path) + } + } + + heap.Push(&commitAndPaths{parent, remainingPathsForParent, parentHashes[j]}) + } + } } - message := line[spaceIndex+42:] - return &Commit{ - ID: ref, - CommitMessage: message, - Committer: &Signature{ - When: time.Unix(int64(unixSeconds), 0), - }, - }, nil + + return result, nil } |