/* path: src/libserver/word.h (blob 7698bf327591ea17d01254394b012290919096e4) */
/*
 * Copyright 2025 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef RSPAMD_WORD_H
#define RSPAMD_WORD_H

#include "config.h"
#include "fstring.h"
#include "contrib/libucl/kvec.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @file word.h
 * Word processing structures and definitions
 */

/*
 * Word flags.
 *
 * Each flag is an unsigned constant with exactly one bit set (bits 0..13),
 * so any combination of flags can be OR-ed together and an individual flag
 * can be tested with a bitwise AND.
 */
#define RSPAMD_WORD_FLAG_TEXT 0x0001u             /* bit 0 */
#define RSPAMD_WORD_FLAG_META 0x0002u             /* bit 1 */
#define RSPAMD_WORD_FLAG_LUA_META 0x0004u         /* bit 2 */
#define RSPAMD_WORD_FLAG_EXCEPTION 0x0008u        /* bit 3 */
#define RSPAMD_WORD_FLAG_HEADER 0x0010u           /* bit 4 */
#define RSPAMD_WORD_FLAG_UNIGRAM 0x0020u          /* bit 5 */
#define RSPAMD_WORD_FLAG_UTF 0x0040u              /* bit 6 */
#define RSPAMD_WORD_FLAG_NORMALISED 0x0080u       /* bit 7 */
#define RSPAMD_WORD_FLAG_STEMMED 0x0100u          /* bit 8 */
#define RSPAMD_WORD_FLAG_BROKEN_UNICODE 0x0200u   /* bit 9 */
#define RSPAMD_WORD_FLAG_STOP_WORD 0x0400u        /* bit 10 */
#define RSPAMD_WORD_FLAG_SKIPPED 0x0800u          /* bit 11 */
#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES 0x1000u /* bit 12 */
#define RSPAMD_WORD_FLAG_EMOJI 0x2000u            /* bit 13 */

/**
 * Word structure representing tokenized text.
 *
 * The same word is kept in several representations, plus a flags field
 * (presumably a combination of RSPAMD_WORD_FLAG_* bits - confirm with users
 * of this structure).
 */
struct rspamd_word_s {
	/* utf8 raw */
	rspamd_ftok_t original;
	/* array of unicode characters, normalized, lowercased */
	rspamd_ftok_unicode_t unicode;
	/* normalized and lowercased utf8 */
	rspamd_ftok_t normalized;
	/* stemmed utf8 */
	rspamd_ftok_t stemmed;
	/* flag bits for this word */
	unsigned int flags;
};

typedef struct rspamd_word_s rspamd_word_t;

/**
 * Vector of words using kvec
 *
 * The element type is rspamd_word_t. The container type itself is whatever
 * the kvec_t() macro expands to (expected to come from the kvec.h header
 * included by this file).
 */
typedef kvec_t(rspamd_word_t) rspamd_words_t;

/*
 * Legacy typedefs for backward compatibility:
 * rspamd_stat_token_t is another name for rspamd_word_t.
 */
typedef rspamd_word_t rspamd_stat_token_t;

/*
 * Legacy flag aliases for backward compatibility.
 *
 * Each RSPAMD_STAT_TOKEN_FLAG_<NAME> expands to the RSPAMD_WORD_FLAG_<NAME>
 * macro with the same suffix, so both spellings yield the same value.
 */
#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT
#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META
#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META
#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION
#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER
#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM
#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED RSPAMD_WORD_FLAG_NORMALISED
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED
#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES
#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI

#ifdef __cplusplus
}
#endif

#endif /* RSPAMD_WORD_H */