You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

bleve.go 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package bleve
  4. import (
  5. "bufio"
  6. "context"
  7. "fmt"
  8. "io"
  9. "strconv"
  10. "strings"
  11. "time"
  12. repo_model "code.gitea.io/gitea/models/repo"
  13. "code.gitea.io/gitea/modules/analyze"
  14. "code.gitea.io/gitea/modules/charset"
  15. "code.gitea.io/gitea/modules/git"
  16. "code.gitea.io/gitea/modules/indexer/code/internal"
  17. indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
  18. inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/setting"
  21. "code.gitea.io/gitea/modules/timeutil"
  22. "code.gitea.io/gitea/modules/typesniffer"
  23. "github.com/blevesearch/bleve/v2"
  24. analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
  25. analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
  26. "github.com/blevesearch/bleve/v2/analysis/token/camelcase"
  27. "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
  28. "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
  29. "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
  30. "github.com/blevesearch/bleve/v2/mapping"
  31. "github.com/blevesearch/bleve/v2/search/query"
  32. "github.com/go-enry/go-enry/v2"
  33. )
  // unicodeNormalizeName is the name under which the custom Unicode
  // normalization token filter is registered on the index mapping and
  // referenced from the repo analyzer's token filter list.
  const unicodeNormalizeName = "unicodeNormalize"

  // maxBatchSize is the batch size passed to every FlushingBatch created by
  // this indexer.
  const maxBatchSize = 16
  38. func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
  39. return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
  40. "type": unicodenorm.Name,
  41. "form": unicodenorm.NFC,
  42. })
  43. }
  const (
  	// repoIndexerAnalyzer is the name of the custom analyzer registered on the
  	// index mapping and used as its default analyzer.
  	repoIndexerAnalyzer = "repoIndexerAnalyzer"
  	// repoIndexerDocType is the document type reported by RepoIndexerData.Type
  	// and the name its document mapping is registered under.
  	repoIndexerDocType = "repoIndexerDocType"
  	// repoIndexerLatestVersion is the index version handed to the inner bleve
  	// indexer when it is constructed.
  	repoIndexerLatestVersion = 6
  )

  // RepoIndexerData is the document stored in the repo indexer for one file.
  type RepoIndexerData struct {
  	RepoID    int64     // ID of the repository the file belongs to
  	CommitID  string    // commit the file contents were read at
  	Content   string    // file contents, converted to UTF-8 by the indexer
  	Language  string    // detected code language of the file
  	UpdatedAt time.Time // time the document was written to the index
  }

  // Type implements bleve's mapping.Classifier interface: every
  // RepoIndexerData document reports the repoIndexerDocType document type.
  func (d *RepoIndexerData) Type() string {
  	return repoIndexerDocType
  }
  61. // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
  62. func generateBleveIndexMapping() (mapping.IndexMapping, error) {
  63. docMapping := bleve.NewDocumentMapping()
  64. numericFieldMapping := bleve.NewNumericFieldMapping()
  65. numericFieldMapping.IncludeInAll = false
  66. docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
  67. textFieldMapping := bleve.NewTextFieldMapping()
  68. textFieldMapping.IncludeInAll = false
  69. docMapping.AddFieldMappingsAt("Content", textFieldMapping)
  70. termFieldMapping := bleve.NewTextFieldMapping()
  71. termFieldMapping.IncludeInAll = false
  72. termFieldMapping.Analyzer = analyzer_keyword.Name
  73. docMapping.AddFieldMappingsAt("Language", termFieldMapping)
  74. docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
  75. timeFieldMapping := bleve.NewDateTimeFieldMapping()
  76. timeFieldMapping.IncludeInAll = false
  77. docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
  78. mapping := bleve.NewIndexMapping()
  79. if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
  80. return nil, err
  81. } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
  82. "type": analyzer_custom.Name,
  83. "char_filters": []string{},
  84. "tokenizer": unicode.Name,
  85. "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
  86. }); err != nil {
  87. return nil, err
  88. }
  89. mapping.DefaultAnalyzer = repoIndexerAnalyzer
  90. mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
  91. mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
  92. return mapping, nil
  93. }
  94. var _ internal.Indexer = &Indexer{}
  95. // Indexer represents a bleve indexer implementation
  96. type Indexer struct {
  97. inner *inner_bleve.Indexer
  98. indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
  99. }
  100. // NewIndexer creates a new bleve local indexer
  101. func NewIndexer(indexDir string) *Indexer {
  102. inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
  103. return &Indexer{
  104. Indexer: inner,
  105. inner: inner,
  106. }
  107. }
  108. func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
  109. update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
  110. ) error {
  111. // Ignore vendored files in code search
  112. if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
  113. return nil
  114. }
  115. size := update.Size
  116. var err error
  117. if !update.Sized {
  118. var stdout string
  119. stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
  120. if err != nil {
  121. return err
  122. }
  123. if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
  124. return fmt.Errorf("Misformatted git cat-file output: %w", err)
  125. }
  126. }
  127. if size > setting.Indexer.MaxIndexerFileSize {
  128. return b.addDelete(update.Filename, repo, batch)
  129. }
  130. if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
  131. return err
  132. }
  133. _, _, size, err = git.ReadBatchLine(batchReader)
  134. if err != nil {
  135. return err
  136. }
  137. fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
  138. if err != nil {
  139. return err
  140. } else if !typesniffer.DetectContentType(fileContents).IsText() {
  141. // FIXME: UTF-16 files will probably fail here
  142. return nil
  143. }
  144. if _, err = batchReader.Discard(1); err != nil {
  145. return err
  146. }
  147. id := internal.FilenameIndexerID(repo.ID, update.Filename)
  148. return batch.Index(id, &RepoIndexerData{
  149. RepoID: repo.ID,
  150. CommitID: commitSha,
  151. Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
  152. Language: analyze.GetCodeLanguage(update.Filename, fileContents),
  153. UpdatedAt: time.Now().UTC(),
  154. })
  155. }
  156. func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
  157. id := internal.FilenameIndexerID(repo.ID, filename)
  158. return batch.Delete(id)
  159. }
  160. // Index indexes the data
  161. func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
  162. batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
  163. if len(changes.Updates) > 0 {
  164. // Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
  165. if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
  166. log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
  167. return err
  168. }
  169. batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
  170. defer cancel()
  171. for _, update := range changes.Updates {
  172. if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil {
  173. return err
  174. }
  175. }
  176. cancel()
  177. }
  178. for _, filename := range changes.RemovedFilenames {
  179. if err := b.addDelete(filename, repo, batch); err != nil {
  180. return err
  181. }
  182. }
  183. return batch.Flush()
  184. }
  185. // Delete deletes indexes by ids
  186. func (b *Indexer) Delete(_ context.Context, repoID int64) error {
  187. query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
  188. searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
  189. result, err := b.inner.Indexer.Search(searchRequest)
  190. if err != nil {
  191. return err
  192. }
  193. batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
  194. for _, hit := range result.Hits {
  195. if err = batch.Delete(hit.ID); err != nil {
  196. return err
  197. }
  198. }
  199. return batch.Flush()
  200. }
  201. // Search searches for files in the specified repo.
  202. // Returns the matching file-paths
  203. func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
  204. var (
  205. indexerQuery query.Query
  206. keywordQuery query.Query
  207. )
  208. if isMatch {
  209. prefixQuery := bleve.NewPrefixQuery(keyword)
  210. prefixQuery.FieldVal = "Content"
  211. keywordQuery = prefixQuery
  212. } else {
  213. phraseQuery := bleve.NewMatchPhraseQuery(keyword)
  214. phraseQuery.FieldVal = "Content"
  215. phraseQuery.Analyzer = repoIndexerAnalyzer
  216. keywordQuery = phraseQuery
  217. }
  218. if len(repoIDs) > 0 {
  219. repoQueries := make([]query.Query, 0, len(repoIDs))
  220. for _, repoID := range repoIDs {
  221. repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
  222. }
  223. indexerQuery = bleve.NewConjunctionQuery(
  224. bleve.NewDisjunctionQuery(repoQueries...),
  225. keywordQuery,
  226. )
  227. } else {
  228. indexerQuery = keywordQuery
  229. }
  230. // Save for reuse without language filter
  231. facetQuery := indexerQuery
  232. if len(language) > 0 {
  233. languageQuery := bleve.NewMatchQuery(language)
  234. languageQuery.FieldVal = "Language"
  235. languageQuery.Analyzer = analyzer_keyword.Name
  236. indexerQuery = bleve.NewConjunctionQuery(
  237. indexerQuery,
  238. languageQuery,
  239. )
  240. }
  241. from := (page - 1) * pageSize
  242. searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
  243. searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
  244. searchRequest.IncludeLocations = true
  245. if len(language) == 0 {
  246. searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
  247. }
  248. result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
  249. if err != nil {
  250. return 0, nil, nil, err
  251. }
  252. total := int64(result.Total)
  253. searchResults := make([]*internal.SearchResult, len(result.Hits))
  254. for i, hit := range result.Hits {
  255. startIndex, endIndex := -1, -1
  256. for _, locations := range hit.Locations["Content"] {
  257. location := locations[0]
  258. locationStart := int(location.Start)
  259. locationEnd := int(location.End)
  260. if startIndex < 0 || locationStart < startIndex {
  261. startIndex = locationStart
  262. }
  263. if endIndex < 0 || locationEnd > endIndex {
  264. endIndex = locationEnd
  265. }
  266. }
  267. language := hit.Fields["Language"].(string)
  268. var updatedUnix timeutil.TimeStamp
  269. if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
  270. updatedUnix = timeutil.TimeStamp(t.Unix())
  271. }
  272. searchResults[i] = &internal.SearchResult{
  273. RepoID: int64(hit.Fields["RepoID"].(float64)),
  274. StartIndex: startIndex,
  275. EndIndex: endIndex,
  276. Filename: internal.FilenameOfIndexerID(hit.ID),
  277. Content: hit.Fields["Content"].(string),
  278. CommitID: hit.Fields["CommitID"].(string),
  279. UpdatedUnix: updatedUnix,
  280. Language: language,
  281. Color: enry.GetColor(language),
  282. }
  283. }
  284. searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
  285. if len(language) > 0 {
  286. // Use separate query to go get all language counts
  287. facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
  288. facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
  289. facetRequest.IncludeLocations = true
  290. facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
  291. if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
  292. return 0, nil, nil, err
  293. }
  294. }
  295. languagesFacet := result.Facets["languages"]
  296. for _, term := range languagesFacet.Terms.Terms() {
  297. if len(term.Term) == 0 {
  298. continue
  299. }
  300. searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
  301. Language: term.Term,
  302. Color: enry.GetColor(term.Term),
  303. Count: term.Count,
  304. })
  305. }
  306. return total, searchResults, searchResultLanguages, nil
  307. }