aboutsummaryrefslogtreecommitdiffstats
path: root/modules/indexer/code/bleve/token/path/path.go
blob: 6dfc12f146990c91b408109609c82bc45749209d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package path

import (
	"slices"
	"strings"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

// Name is the identifier under which the path token filter is registered
// in the bleve registry.
const Name = "gitea/path"

// TokenFilter is a stateless bleve token filter that turns the tokens of a
// file path into path-hierarchy terms.
type TokenFilter struct{}

// NewTokenFilter returns a ready-to-use TokenFilter.
func NewTokenFilter() *TokenFilter {
	return new(TokenFilter)
}

// TokenFilterConstructor is the constructor registered with the bleve
// registry for this filter. Neither config nor cache is consulted: the
// filter has no options, so a fresh TokenFilter and a nil error are always
// returned.
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewTokenFilter()
	return filter, nil
}

// Filter expands the incoming tokens into path terms. The forward chain is
// always produced; the reversed chain is appended after it unless the input
// consists of exactly one token.
func (f *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	forward := generatePathTokens(input, false)

	// With a single token the reversed chain would add nothing new, so we
	// don't need to generate it.
	if len(input) == 1 {
		return forward
	}

	return append(forward, generatePathTokens(input, true)...)
}

// generatePathTokens builds path-hierarchy tokens from the input tokens.
//
// This mimics the behavior of the path hierarchy tokenizer in ES: the input
// tokens are treated as path components and joined with "/", emitting one
// term per level of the tree (e.g., the components foo, bar, baz.md generate
// foo, foo/bar and foo/bar/baz.md).
//
// If reversed is set, the components are walked from last to first (the same
// input generates baz.md, baz.md/bar and baz.md/bar/foo). This is useful to
// efficiently search for filenames without supplying the full path.
//
// The input stream is not modified; an empty input yields an empty stream.
// Every emitted token has Position 1 and type analysis.AlphaNumeric.
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
	components := input
	if reversed {
		// Reverse a copy so the caller's token stream keeps its order.
		components = slices.Clone(input)
		slices.Reverse(components)
	}

	// Grow the path one component at a time and snapshot it after every
	// component, so the i-th term always includes components 0..i.
	terms := make([]string, 0, len(components))
	var sb strings.Builder
	for i, component := range components {
		if i > 0 {
			sb.WriteByte('/')
		}
		sb.Write(component.Term)
		terms = append(terms, sb.String())
	}

	// Each term extends the previous one, so the last term is the longest.
	longestTerm := sb.Len()

	output := make(analysis.TokenStream, 0, len(terms))
	for _, term := range terms {
		// NOTE(review): reversed terms get the span [0, len(term)) while
		// forward terms are right-aligned against the longest term. This
		// offset scheme is kept exactly as it was; confirm it is intended.
		start, end := 0, len(term)
		if !reversed {
			start, end = longestTerm-len(term), longestTerm
		}

		output = append(output, &analysis.Token{
			Position: 1,
			Start:    start,
			End:      end,
			Type:     analysis.AlphaNumeric,
			Term:     []byte(term),
		})
	}

	return output
}

// init registers the path token filter with the bleve registry under Name
// and panics if the registration fails.
func init() {
	// FIXME: move it to the bleve's init function, but do not call it in global init
	if err := registry.RegisterTokenFilter(Name, TokenFilterConstructor); err != nil {
		panic(err)
	}
}