Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

repo_language_stats_nogogit.go 6.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. // Copyright 2020 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. //go:build !gogit
  4. package git
  5. import (
  6. "bufio"
  7. "bytes"
  8. "io"
  9. "math"
  10. "strings"
  11. "code.gitea.io/gitea/modules/analyze"
  12. "code.gitea.io/gitea/modules/log"
  13. "github.com/go-enry/go-enry/v2"
  14. )
  15. // GetLanguageStats calculates language stats for git repository at specified commit
  16. func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
  17. // We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
  18. // so let's create a batch stdin and stdout
  19. batchStdinWriter, batchReader, cancel := repo.CatFileBatch(repo.Ctx)
  20. defer cancel()
  21. writeID := func(id string) error {
  22. _, err := batchStdinWriter.Write([]byte(id + "\n"))
  23. return err
  24. }
  25. if err := writeID(commitID); err != nil {
  26. return nil, err
  27. }
  28. shaBytes, typ, size, err := ReadBatchLine(batchReader)
  29. if typ != "commit" {
  30. log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
  31. return nil, ErrNotExist{commitID, ""}
  32. }
  33. sha, err := NewIDFromString(string(shaBytes))
  34. if err != nil {
  35. log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
  36. return nil, ErrNotExist{commitID, ""}
  37. }
  38. commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
  39. if err != nil {
  40. log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
  41. return nil, err
  42. }
  43. if _, err = batchReader.Discard(1); err != nil {
  44. return nil, err
  45. }
  46. tree := commit.Tree
  47. entries, err := tree.ListEntriesRecursiveWithSize()
  48. if err != nil {
  49. return nil, err
  50. }
  51. checker, deferable := repo.CheckAttributeReader(commitID)
  52. defer deferable()
  53. contentBuf := bytes.Buffer{}
  54. var content []byte
  55. // sizes contains the current calculated size of all files by language
  56. sizes := make(map[string]int64)
  57. // by default we will only count the sizes of programming languages or markup languages
  58. // unless they are explicitly set using linguist-language
  59. includedLanguage := map[string]bool{}
  60. // or if there's only one language in the repository
  61. firstExcludedLanguage := ""
  62. firstExcludedLanguageSize := int64(0)
  63. for _, f := range entries {
  64. select {
  65. case <-repo.Ctx.Done():
  66. return sizes, repo.Ctx.Err()
  67. default:
  68. }
  69. contentBuf.Reset()
  70. content = contentBuf.Bytes()
  71. if f.Size() == 0 {
  72. continue
  73. }
  74. notVendored := false
  75. notGenerated := false
  76. if checker != nil {
  77. attrs, err := checker.CheckPath(f.Name())
  78. if err == nil {
  79. if vendored, has := attrs["linguist-vendored"]; has {
  80. if vendored == "set" || vendored == "true" {
  81. continue
  82. }
  83. notVendored = vendored == "false"
  84. }
  85. if generated, has := attrs["linguist-generated"]; has {
  86. if generated == "set" || generated == "true" {
  87. continue
  88. }
  89. notGenerated = generated == "false"
  90. }
  91. if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
  92. // group languages, such as Pug -> HTML; SCSS -> CSS
  93. group := enry.GetLanguageGroup(language)
  94. if len(group) != 0 {
  95. language = group
  96. }
  97. // this language will always be added to the size
  98. sizes[language] += f.Size()
  99. continue
  100. } else if language, has := attrs["gitlab-language"]; has && language != "unspecified" && language != "" {
  101. // strip off a ? if present
  102. if idx := strings.IndexByte(language, '?'); idx >= 0 {
  103. language = language[:idx]
  104. }
  105. if len(language) != 0 {
  106. // group languages, such as Pug -> HTML; SCSS -> CSS
  107. group := enry.GetLanguageGroup(language)
  108. if len(group) != 0 {
  109. language = group
  110. }
  111. // this language will always be added to the size
  112. sizes[language] += f.Size()
  113. continue
  114. }
  115. }
  116. }
  117. }
  118. if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) ||
  119. enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
  120. continue
  121. }
  122. // If content can not be read or file is too big just do detection by filename
  123. if f.Size() <= bigFileSize {
  124. if err := writeID(f.ID.String()); err != nil {
  125. return nil, err
  126. }
  127. _, _, size, err := ReadBatchLine(batchReader)
  128. if err != nil {
  129. log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
  130. return nil, err
  131. }
  132. sizeToRead := size
  133. discard := int64(1)
  134. if size > fileSizeLimit {
  135. sizeToRead = fileSizeLimit
  136. discard = size - fileSizeLimit + 1
  137. }
  138. _, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
  139. if err != nil {
  140. return nil, err
  141. }
  142. content = contentBuf.Bytes()
  143. err = discardFull(batchReader, discard)
  144. if err != nil {
  145. return nil, err
  146. }
  147. }
  148. if !notGenerated && enry.IsGenerated(f.Name(), content) {
  149. continue
  150. }
  151. // FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
  152. // - eg. do the all the detection tests using filename first before reading content.
  153. language := analyze.GetCodeLanguage(f.Name(), content)
  154. if language == "" {
  155. continue
  156. }
  157. // group languages, such as Pug -> HTML; SCSS -> CSS
  158. group := enry.GetLanguageGroup(language)
  159. if group != "" {
  160. language = group
  161. }
  162. included, checked := includedLanguage[language]
  163. if !checked {
  164. langType := enry.GetLanguageType(language)
  165. included = langType == enry.Programming || langType == enry.Markup
  166. includedLanguage[language] = included
  167. }
  168. if included {
  169. sizes[language] += f.Size()
  170. } else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) {
  171. firstExcludedLanguage = language
  172. firstExcludedLanguageSize += f.Size()
  173. }
  174. continue
  175. }
  176. // If there are no included languages add the first excluded language
  177. if len(sizes) == 0 && firstExcludedLanguage != "" {
  178. sizes[firstExcludedLanguage] = firstExcludedLanguageSize
  179. }
  180. return mergeLanguageStats(sizes), nil
  181. }
  // discardFull skips exactly discard bytes from rd, calling rd.Discard as many
  // times as needed.
  //
  // bufio.Reader.Discard takes an int, so the request is issued in chunks of at
  // most math.MaxInt32 bytes. Capping every chunk (not just the first) keeps the
  // int conversion from truncating where int is 32 bits wide; a truncated count
  // could be negative or zero, and a zero count would never make progress.
  //
  // A discard that is zero or negative is a no-op. The first error reported by
  // rd.Discard (for example io.EOF when the stream is shorter than requested) is
  // returned as-is.
  func discardFull(rd *bufio.Reader, discard int64) error {
  	for discard > 0 {
  		chunk := discard
  		if chunk > math.MaxInt32 {
  			chunk = math.MaxInt32
  		}
  		n, err := rd.Discard(int(chunk))
  		discard -= int64(n)
  		if err != nil {
  			return err
  		}
  	}
  	return nil
  }