You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

bleve.go 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package bleve
  4. import (
  5. "bufio"
  6. "context"
  7. "fmt"
  8. "io"
  9. "strconv"
  10. "strings"
  11. "time"
  12. repo_model "code.gitea.io/gitea/models/repo"
  13. "code.gitea.io/gitea/modules/analyze"
  14. "code.gitea.io/gitea/modules/charset"
  15. "code.gitea.io/gitea/modules/git"
  16. "code.gitea.io/gitea/modules/indexer/code/internal"
  17. indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
  18. inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/setting"
  21. "code.gitea.io/gitea/modules/timeutil"
  22. "code.gitea.io/gitea/modules/typesniffer"
  23. "github.com/blevesearch/bleve/v2"
  24. analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
  25. analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
  26. "github.com/blevesearch/bleve/v2/analysis/token/camelcase"
  27. "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
  28. "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
  29. "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
  30. "github.com/blevesearch/bleve/v2/mapping"
  31. "github.com/blevesearch/bleve/v2/search/query"
  32. "github.com/go-enry/go-enry/v2"
  33. )
  // unicodeNormalizeName is the name under which the unicode normalization
  // token filter is registered on the index mapping and referenced by the
  // custom analyzer.
  const unicodeNormalizeName = "unicodeNormalize"

  const (
  	// maxBatchSize is the batch size handed to inner_bleve.NewFlushingBatch.
  	maxBatchSize = 16

  	// fuzzyDenominator determines the levenshtein distance per each character of a keyword
  	fuzzyDenominator = 4
  )
  40. func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
  41. return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
  42. "type": unicodenorm.Name,
  43. "form": unicodenorm.NFC,
  44. })
  45. }
  const (
  	// repoIndexerAnalyzer is the name of the custom analyzer used for the repo index.
  	repoIndexerAnalyzer = "repoIndexerAnalyzer"
  	// repoIndexerDocType is the document type reported by RepoIndexerData.Type.
  	repoIndexerDocType = "repoIndexerDocType"
  	// repoIndexerLatestVersion is the index version passed to the inner bleve indexer.
  	repoIndexerLatestVersion = 6
  )

  // RepoIndexerData is the document stored in the repo indexer for one file.
  type RepoIndexerData struct {
  	// RepoID is the ID of the repository the file belongs to.
  	RepoID int64
  	// CommitID is the commit the file content was indexed at.
  	CommitID string
  	// Content is the file content as stored in the index.
  	Content string
  	// Language is the language recorded for the file.
  	Language string
  	// UpdatedAt is the time the document was written to the index.
  	UpdatedAt time.Time
  }

  // Type returns the document type, for bleve's mapping.Classifier interface.
  // It is always repoIndexerDocType.
  func (d *RepoIndexerData) Type() string {
  	return repoIndexerDocType
  }
  63. // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
  64. func generateBleveIndexMapping() (mapping.IndexMapping, error) {
  65. docMapping := bleve.NewDocumentMapping()
  66. numericFieldMapping := bleve.NewNumericFieldMapping()
  67. numericFieldMapping.IncludeInAll = false
  68. docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
  69. textFieldMapping := bleve.NewTextFieldMapping()
  70. textFieldMapping.IncludeInAll = false
  71. docMapping.AddFieldMappingsAt("Content", textFieldMapping)
  72. termFieldMapping := bleve.NewTextFieldMapping()
  73. termFieldMapping.IncludeInAll = false
  74. termFieldMapping.Analyzer = analyzer_keyword.Name
  75. docMapping.AddFieldMappingsAt("Language", termFieldMapping)
  76. docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
  77. timeFieldMapping := bleve.NewDateTimeFieldMapping()
  78. timeFieldMapping.IncludeInAll = false
  79. docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
  80. mapping := bleve.NewIndexMapping()
  81. if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
  82. return nil, err
  83. } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
  84. "type": analyzer_custom.Name,
  85. "char_filters": []string{},
  86. "tokenizer": unicode.Name,
  87. "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
  88. }); err != nil {
  89. return nil, err
  90. }
  91. mapping.DefaultAnalyzer = repoIndexerAnalyzer
  92. mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
  93. mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
  94. return mapping, nil
  95. }
  96. var _ internal.Indexer = &Indexer{}
  97. // Indexer represents a bleve indexer implementation
  98. type Indexer struct {
  99. inner *inner_bleve.Indexer
  100. indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
  101. }
  102. // NewIndexer creates a new bleve local indexer
  103. func NewIndexer(indexDir string) *Indexer {
  104. inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
  105. return &Indexer{
  106. Indexer: inner,
  107. inner: inner,
  108. }
  109. }
  110. func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
  111. update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
  112. ) error {
  113. // Ignore vendored files in code search
  114. if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
  115. return nil
  116. }
  117. size := update.Size
  118. var err error
  119. if !update.Sized {
  120. var stdout string
  121. stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
  122. if err != nil {
  123. return err
  124. }
  125. if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
  126. return fmt.Errorf("misformatted git cat-file output: %w", err)
  127. }
  128. }
  129. if size > setting.Indexer.MaxIndexerFileSize {
  130. return b.addDelete(update.Filename, repo, batch)
  131. }
  132. if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
  133. return err
  134. }
  135. _, _, size, err = git.ReadBatchLine(batchReader)
  136. if err != nil {
  137. return err
  138. }
  139. fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
  140. if err != nil {
  141. return err
  142. } else if !typesniffer.DetectContentType(fileContents).IsText() {
  143. // FIXME: UTF-16 files will probably fail here
  144. return nil
  145. }
  146. if _, err = batchReader.Discard(1); err != nil {
  147. return err
  148. }
  149. id := internal.FilenameIndexerID(repo.ID, update.Filename)
  150. return batch.Index(id, &RepoIndexerData{
  151. RepoID: repo.ID,
  152. CommitID: commitSha,
  153. Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
  154. Language: analyze.GetCodeLanguage(update.Filename, fileContents),
  155. UpdatedAt: time.Now().UTC(),
  156. })
  157. }
  158. func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
  159. id := internal.FilenameIndexerID(repo.ID, filename)
  160. return batch.Delete(id)
  161. }
  162. // Index indexes the data
  163. func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
  164. batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
  165. if len(changes.Updates) > 0 {
  166. // Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
  167. if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
  168. log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
  169. return err
  170. }
  171. batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
  172. defer cancel()
  173. for _, update := range changes.Updates {
  174. if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil {
  175. return err
  176. }
  177. }
  178. cancel()
  179. }
  180. for _, filename := range changes.RemovedFilenames {
  181. if err := b.addDelete(filename, repo, batch); err != nil {
  182. return err
  183. }
  184. }
  185. return batch.Flush()
  186. }
  187. // Delete deletes indexes by ids
  188. func (b *Indexer) Delete(_ context.Context, repoID int64) error {
  189. query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
  190. searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
  191. result, err := b.inner.Indexer.Search(searchRequest)
  192. if err != nil {
  193. return err
  194. }
  195. batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
  196. for _, hit := range result.Hits {
  197. if err = batch.Delete(hit.ID); err != nil {
  198. return err
  199. }
  200. }
  201. return batch.Flush()
  202. }
  203. // Search searches for files in the specified repo.
  204. // Returns the matching file-paths
  205. func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
  206. var (
  207. indexerQuery query.Query
  208. keywordQuery query.Query
  209. )
  210. phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
  211. phraseQuery.FieldVal = "Content"
  212. phraseQuery.Analyzer = repoIndexerAnalyzer
  213. keywordQuery = phraseQuery
  214. if opts.IsKeywordFuzzy {
  215. phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
  216. }
  217. if len(opts.RepoIDs) > 0 {
  218. repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
  219. for _, repoID := range opts.RepoIDs {
  220. repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
  221. }
  222. indexerQuery = bleve.NewConjunctionQuery(
  223. bleve.NewDisjunctionQuery(repoQueries...),
  224. keywordQuery,
  225. )
  226. } else {
  227. indexerQuery = keywordQuery
  228. }
  229. // Save for reuse without language filter
  230. facetQuery := indexerQuery
  231. if len(opts.Language) > 0 {
  232. languageQuery := bleve.NewMatchQuery(opts.Language)
  233. languageQuery.FieldVal = "Language"
  234. languageQuery.Analyzer = analyzer_keyword.Name
  235. indexerQuery = bleve.NewConjunctionQuery(
  236. indexerQuery,
  237. languageQuery,
  238. )
  239. }
  240. from, pageSize := opts.GetSkipTake()
  241. searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
  242. searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
  243. searchRequest.IncludeLocations = true
  244. if len(opts.Language) == 0 {
  245. searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
  246. }
  247. result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
  248. if err != nil {
  249. return 0, nil, nil, err
  250. }
  251. total := int64(result.Total)
  252. searchResults := make([]*internal.SearchResult, len(result.Hits))
  253. for i, hit := range result.Hits {
  254. startIndex, endIndex := -1, -1
  255. for _, locations := range hit.Locations["Content"] {
  256. location := locations[0]
  257. locationStart := int(location.Start)
  258. locationEnd := int(location.End)
  259. if startIndex < 0 || locationStart < startIndex {
  260. startIndex = locationStart
  261. }
  262. if endIndex < 0 || locationEnd > endIndex {
  263. endIndex = locationEnd
  264. }
  265. }
  266. language := hit.Fields["Language"].(string)
  267. var updatedUnix timeutil.TimeStamp
  268. if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
  269. updatedUnix = timeutil.TimeStamp(t.Unix())
  270. }
  271. searchResults[i] = &internal.SearchResult{
  272. RepoID: int64(hit.Fields["RepoID"].(float64)),
  273. StartIndex: startIndex,
  274. EndIndex: endIndex,
  275. Filename: internal.FilenameOfIndexerID(hit.ID),
  276. Content: hit.Fields["Content"].(string),
  277. CommitID: hit.Fields["CommitID"].(string),
  278. UpdatedUnix: updatedUnix,
  279. Language: language,
  280. Color: enry.GetColor(language),
  281. }
  282. }
  283. searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
  284. if len(opts.Language) > 0 {
  285. // Use separate query to go get all language counts
  286. facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
  287. facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
  288. facetRequest.IncludeLocations = true
  289. facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
  290. if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
  291. return 0, nil, nil, err
  292. }
  293. }
  294. languagesFacet := result.Facets["languages"]
  295. for _, term := range languagesFacet.Terms.Terms() {
  296. if len(term.Term) == 0 {
  297. continue
  298. }
  299. searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
  300. Language: term.Term,
  301. Color: enry.GetColor(term.Term),
  302. Count: term.Count,
  303. })
  304. }
  305. return total, searchResults, searchResultLanguages, nil
  306. }