You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

bleve.go 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package code
import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strconv"
	"strings"
	"time"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	gitea_bleve "code.gitea.io/gitea/modules/indexer/bleve"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"
	"code.gitea.io/gitea/modules/util"

	"github.com/blevesearch/bleve/v2"
	analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/index/upsidedown"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/ethantkoenig/rupture"
	"github.com/go-enry/go-enry/v2"
)
  36. const (
  37. unicodeNormalizeName = "unicodeNormalize"
  38. maxBatchSize = 16
  39. )
  40. // numericEqualityQuery a numeric equality query for the given value and field
  41. func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
  42. f := float64(value)
  43. tru := true
  44. q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
  45. q.SetField(field)
  46. return q
  47. }
  48. func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
  49. return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
  50. "type": unicodenorm.Name,
  51. "form": unicodenorm.NFC,
  52. })
  53. }
  54. // openBleveIndexer open the index at the specified path, checking for metadata
  55. // updates and bleve version updates. If index needs to be created (or
  56. // re-created), returns (nil, nil)
  57. func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
  58. _, err := os.Stat(path)
  59. if err != nil && os.IsNotExist(err) {
  60. return nil, nil
  61. } else if err != nil {
  62. return nil, err
  63. }
  64. metadata, err := rupture.ReadIndexMetadata(path)
  65. if err != nil {
  66. return nil, err
  67. }
  68. if metadata.Version < latestVersion {
  69. // the indexer is using a previous version, so we should delete it and
  70. // re-populate
  71. return nil, util.RemoveAll(path)
  72. }
  73. index, err := bleve.Open(path)
  74. if err != nil && err == upsidedown.IncompatibleVersion {
  75. // the indexer was built with a previous version of bleve, so we should
  76. // delete it and re-populate
  77. return nil, util.RemoveAll(path)
  78. } else if err != nil {
  79. return nil, err
  80. }
  81. return index, nil
  82. }
// RepoIndexerData is the document stored in the repo indexer for one file of
// a repository.
type RepoIndexerData struct {
	// RepoID is the ID of the repository the file belongs to; Delete and
	// Search filter on this field.
	RepoID int64
	// CommitID is the commit SHA the file was indexed at.
	CommitID string
	// Content is the file content, converted to UTF-8 before indexing.
	Content string
	// Language is the code language detected for the file.
	Language string
	// UpdatedAt is the time (UTC) at which the document was indexed.
	UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
// Every RepoIndexerData reports repoIndexerDocType, the name under which the
// document mapping is registered in createBleveIndexer.
func (d *RepoIndexerData) Type() string {
	return repoIndexerDocType
}
  95. const (
  96. repoIndexerAnalyzer = "repoIndexerAnalyzer"
  97. repoIndexerDocType = "repoIndexerDocType"
  98. repoIndexerLatestVersion = 6
  99. )
  100. // createBleveIndexer create a bleve repo indexer if one does not already exist
  101. func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
  102. docMapping := bleve.NewDocumentMapping()
  103. numericFieldMapping := bleve.NewNumericFieldMapping()
  104. numericFieldMapping.IncludeInAll = false
  105. docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
  106. textFieldMapping := bleve.NewTextFieldMapping()
  107. textFieldMapping.IncludeInAll = false
  108. docMapping.AddFieldMappingsAt("Content", textFieldMapping)
  109. termFieldMapping := bleve.NewTextFieldMapping()
  110. termFieldMapping.IncludeInAll = false
  111. termFieldMapping.Analyzer = analyzer_keyword.Name
  112. docMapping.AddFieldMappingsAt("Language", termFieldMapping)
  113. docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
  114. timeFieldMapping := bleve.NewDateTimeFieldMapping()
  115. timeFieldMapping.IncludeInAll = false
  116. docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
  117. mapping := bleve.NewIndexMapping()
  118. if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
  119. return nil, err
  120. } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
  121. "type": analyzer_custom.Name,
  122. "char_filters": []string{},
  123. "tokenizer": unicode.Name,
  124. "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
  125. }); err != nil {
  126. return nil, err
  127. }
  128. mapping.DefaultAnalyzer = repoIndexerAnalyzer
  129. mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
  130. mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
  131. indexer, err := bleve.New(path, mapping)
  132. if err != nil {
  133. return nil, err
  134. }
  135. if err = rupture.WriteIndexMetadata(path, &rupture.IndexMetadata{
  136. Version: latestVersion,
  137. }); err != nil {
  138. return nil, err
  139. }
  140. return indexer, nil
  141. }
  142. var _ Indexer = &BleveIndexer{}
// BleveIndexer represents a bleve indexer implementation
type BleveIndexer struct {
	// indexDir is the filesystem path of the bleve index.
	indexDir string
	// indexer is the open bleve index; it is set by init and stays nil if
	// opening or creating the index failed.
	indexer bleve.Index
}
  148. // NewBleveIndexer creates a new bleve local indexer
  149. func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
  150. indexer := &BleveIndexer{
  151. indexDir: indexDir,
  152. }
  153. created, err := indexer.init()
  154. if err != nil {
  155. indexer.Close()
  156. return nil, false, err
  157. }
  158. return indexer, created, err
  159. }
  160. func (b *BleveIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
  161. update fileUpdate, repo *repo_model.Repository, batch *gitea_bleve.FlushingBatch,
  162. ) error {
  163. // Ignore vendored files in code search
  164. if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
  165. return nil
  166. }
  167. size := update.Size
  168. var err error
  169. if !update.Sized {
  170. var stdout string
  171. stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
  172. if err != nil {
  173. return err
  174. }
  175. if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
  176. return fmt.Errorf("Misformatted git cat-file output: %w", err)
  177. }
  178. }
  179. if size > setting.Indexer.MaxIndexerFileSize {
  180. return b.addDelete(update.Filename, repo, batch)
  181. }
  182. if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
  183. return err
  184. }
  185. _, _, size, err = git.ReadBatchLine(batchReader)
  186. if err != nil {
  187. return err
  188. }
  189. fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
  190. if err != nil {
  191. return err
  192. } else if !typesniffer.DetectContentType(fileContents).IsText() {
  193. // FIXME: UTF-16 files will probably fail here
  194. return nil
  195. }
  196. if _, err = batchReader.Discard(1); err != nil {
  197. return err
  198. }
  199. id := filenameIndexerID(repo.ID, update.Filename)
  200. return batch.Index(id, &RepoIndexerData{
  201. RepoID: repo.ID,
  202. CommitID: commitSha,
  203. Content: string(charset.ToUTF8DropErrors(fileContents)),
  204. Language: analyze.GetCodeLanguage(update.Filename, fileContents),
  205. UpdatedAt: time.Now().UTC(),
  206. })
  207. }
  208. func (b *BleveIndexer) addDelete(filename string, repo *repo_model.Repository, batch *gitea_bleve.FlushingBatch) error {
  209. id := filenameIndexerID(repo.ID, filename)
  210. return batch.Delete(id)
  211. }
  212. // init init the indexer
  213. func (b *BleveIndexer) init() (bool, error) {
  214. var err error
  215. b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
  216. if err != nil {
  217. return false, err
  218. }
  219. if b.indexer != nil {
  220. return false, nil
  221. }
  222. b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
  223. if err != nil {
  224. return false, err
  225. }
  226. return true, nil
  227. }
  228. // Close close the indexer
  229. func (b *BleveIndexer) Close() {
  230. log.Debug("Closing repo indexer")
  231. if b.indexer != nil {
  232. err := b.indexer.Close()
  233. if err != nil {
  234. log.Error("Error whilst closing the repository indexer: %v", err)
  235. }
  236. }
  237. log.Info("PID: %d Repository Indexer closed", os.Getpid())
  238. }
// Ping performs no check of the index; it always returns true.
func (b *BleveIndexer) Ping() bool {
	return true
}
  243. // Index indexes the data
  244. func (b *BleveIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error {
  245. batch := gitea_bleve.NewFlushingBatch(b.indexer, maxBatchSize)
  246. if len(changes.Updates) > 0 {
  247. // Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
  248. if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
  249. log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
  250. return err
  251. }
  252. batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
  253. defer cancel()
  254. for _, update := range changes.Updates {
  255. if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil {
  256. return err
  257. }
  258. }
  259. cancel()
  260. }
  261. for _, filename := range changes.RemovedFilenames {
  262. if err := b.addDelete(filename, repo, batch); err != nil {
  263. return err
  264. }
  265. }
  266. return batch.Flush()
  267. }
  268. // Delete deletes indexes by ids
  269. func (b *BleveIndexer) Delete(repoID int64) error {
  270. query := numericEqualityQuery(repoID, "RepoID")
  271. searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
  272. result, err := b.indexer.Search(searchRequest)
  273. if err != nil {
  274. return err
  275. }
  276. batch := gitea_bleve.NewFlushingBatch(b.indexer, maxBatchSize)
  277. for _, hit := range result.Hits {
  278. if err = batch.Delete(hit.ID); err != nil {
  279. return err
  280. }
  281. }
  282. return batch.Flush()
  283. }
  284. // Search searches for files in the specified repo.
  285. // Returns the matching file-paths
  286. func (b *BleveIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
  287. var (
  288. indexerQuery query.Query
  289. keywordQuery query.Query
  290. )
  291. if isMatch {
  292. prefixQuery := bleve.NewPrefixQuery(keyword)
  293. prefixQuery.FieldVal = "Content"
  294. keywordQuery = prefixQuery
  295. } else {
  296. phraseQuery := bleve.NewMatchPhraseQuery(keyword)
  297. phraseQuery.FieldVal = "Content"
  298. phraseQuery.Analyzer = repoIndexerAnalyzer
  299. keywordQuery = phraseQuery
  300. }
  301. if len(repoIDs) > 0 {
  302. repoQueries := make([]query.Query, 0, len(repoIDs))
  303. for _, repoID := range repoIDs {
  304. repoQueries = append(repoQueries, numericEqualityQuery(repoID, "RepoID"))
  305. }
  306. indexerQuery = bleve.NewConjunctionQuery(
  307. bleve.NewDisjunctionQuery(repoQueries...),
  308. keywordQuery,
  309. )
  310. } else {
  311. indexerQuery = keywordQuery
  312. }
  313. // Save for reuse without language filter
  314. facetQuery := indexerQuery
  315. if len(language) > 0 {
  316. languageQuery := bleve.NewMatchQuery(language)
  317. languageQuery.FieldVal = "Language"
  318. languageQuery.Analyzer = analyzer_keyword.Name
  319. indexerQuery = bleve.NewConjunctionQuery(
  320. indexerQuery,
  321. languageQuery,
  322. )
  323. }
  324. from := (page - 1) * pageSize
  325. searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
  326. searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
  327. searchRequest.IncludeLocations = true
  328. if len(language) == 0 {
  329. searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
  330. }
  331. result, err := b.indexer.SearchInContext(ctx, searchRequest)
  332. if err != nil {
  333. return 0, nil, nil, err
  334. }
  335. total := int64(result.Total)
  336. searchResults := make([]*SearchResult, len(result.Hits))
  337. for i, hit := range result.Hits {
  338. startIndex, endIndex := -1, -1
  339. for _, locations := range hit.Locations["Content"] {
  340. location := locations[0]
  341. locationStart := int(location.Start)
  342. locationEnd := int(location.End)
  343. if startIndex < 0 || locationStart < startIndex {
  344. startIndex = locationStart
  345. }
  346. if endIndex < 0 || locationEnd > endIndex {
  347. endIndex = locationEnd
  348. }
  349. }
  350. language := hit.Fields["Language"].(string)
  351. var updatedUnix timeutil.TimeStamp
  352. if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
  353. updatedUnix = timeutil.TimeStamp(t.Unix())
  354. }
  355. searchResults[i] = &SearchResult{
  356. RepoID: int64(hit.Fields["RepoID"].(float64)),
  357. StartIndex: startIndex,
  358. EndIndex: endIndex,
  359. Filename: filenameOfIndexerID(hit.ID),
  360. Content: hit.Fields["Content"].(string),
  361. CommitID: hit.Fields["CommitID"].(string),
  362. UpdatedUnix: updatedUnix,
  363. Language: language,
  364. Color: enry.GetColor(language),
  365. }
  366. }
  367. searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
  368. if len(language) > 0 {
  369. // Use separate query to go get all language counts
  370. facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
  371. facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
  372. facetRequest.IncludeLocations = true
  373. facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
  374. if result, err = b.indexer.Search(facetRequest); err != nil {
  375. return 0, nil, nil, err
  376. }
  377. }
  378. languagesFacet := result.Facets["languages"]
  379. for _, term := range languagesFacet.Terms.Terms() {
  380. if len(term.Term) == 0 {
  381. continue
  382. }
  383. searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
  384. Language: term.Term,
  385. Color: enry.GetColor(term.Term),
  386. Count: term.Count,
  387. })
  388. }
  389. return total, searchResults, searchResultLanguages, nil
  390. }