aboutsummaryrefslogtreecommitdiffstats
path: root/modules/indexer/internal
diff options
context:
space:
mode:
authorBruno Sofiato <bruno.sofiato@gmail.com>2024-10-11 20:35:04 -0300
committerGitHub <noreply@github.com>2024-10-11 23:35:04 +0000
commit900ac622514081b90e08135cab175d5d1ea1bc9d (patch)
tree5b9fe1530a0ef54ae2abd24f9bf7a18b0fe4fa0e /modules/indexer/internal
parent0fe5e2b08c311f26c3fc0fc71eb6abffb06bc182 (diff)
downloadgitea-900ac622514081b90e08135cab175d5d1ea1bc9d.tar.gz
gitea-900ac622514081b90e08135cab175d5d1ea1bc9d.zip
Allow code search by filename (#32210)
This is a large and complex PR, so let me explain its changes in detail. First, I had to create new index mappings for Bleve and Elasticsearch as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out of the box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also did an overhaul in the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. For matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
Diffstat (limited to 'modules/indexer/internal')
-rw-r--r--modules/indexer/internal/bleve/util.go27
-rw-r--r--modules/indexer/internal/bleve/util_test.go45
2 files changed, 71 insertions, 1 deletions
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index a2265f86e6..b426b39bc2 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -11,10 +11,15 @@ import (
"code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
+const (
+ maxFuzziness = 2 // upper bound for any guessed fuzziness; bleve supports at most 2 (see https://github.com/blevesearch/bleve/issues/1563)
+)
+
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
@@ -48,7 +53,27 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
+// GuessFuzzinessByKeyword guesses the fuzziness to use when searching for the phrase s. The fuzziness is based on the Levenshtein distance
+// and determines how many characters may differ between two strings while they are still considered equivalent.
+// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ tokens := tokenizer.Tokenize([]byte(s))
+
+ if len(tokens) > 0 {
+ fuzziness := maxFuzziness // start at the cap; each token can only lower it
+
+ for _, token := range tokens {
+ fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term))) // keep the smallest per-token guess
+ }
+
+ return fuzziness
+ }
+
+ return 0 // the tokenizer produced no tokens, so no fuzziness is applied
+}
+
+func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
@@ -57,5 +82,5 @@ func GuessFuzzinessByKeyword(s string) int {
return 0
}
}
- return min(2, len(s)/4)
+ return min(maxFuzziness, len(s)/4)
}
diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go
new file mode 100644
index 0000000000..ae0b12c08d
--- /dev/null
+++ b/modules/indexer/internal/bleve/util_test.go
@@ -0,0 +1,45 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package bleve
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
+ scenarios := []struct { // table of input phrases paired with the fuzziness each is expected to produce
+ Input string
+ Fuzziness int // See util.go for the definition of fuzziness in this particular context
+ }{
+ {
+ Input: "", // empty phrase
+ Fuzziness: 0,
+ },
+ {
+ Input: "Avocado",
+ Fuzziness: 1,
+ },
+ {
+ Input: "Geschwindigkeit",
+ Fuzziness: 2,
+ },
+ {
+ Input: "non-exist", // NOTE(review): presumably tokenized into "non" and "exist", the shorter word giving 0 — confirm against the tokenizer
+ Fuzziness: 0,
+ },
+ {
+ Input: "갃갃갃", // CJK phrase, expected to yield zero fuzziness
+ Fuzziness: 0,
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) { // one subtest per scenario, named after its input and expected value
+ assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
+ })
+ }
+}