aboutsummaryrefslogtreecommitdiffstats
path: root/src/libserver/word.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/word.h')
-rw-r--r--src/libserver/word.h88
1 files changed, 88 insertions, 0 deletions
diff --git a/src/libserver/word.h b/src/libserver/word.h
new file mode 100644
index 000000000..7698bf327
--- /dev/null
+++ b/src/libserver/word.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_WORD_H
+#define RSPAMD_WORD_H
+
+#include "config.h"
+#include "fstring.h"
+#include "contrib/libucl/kvec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file word.h
+ * Word processing structures and definitions
+ */
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0)
+#define RSPAMD_WORD_FLAG_META (1u << 1)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13)
+
+/**
+ * Word structure representing tokenized text
+ */
+typedef struct rspamd_word_s {
+ rspamd_ftok_t original; /* utf8 raw */
+ rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
+ rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
+ rspamd_ftok_t stemmed; /* stemmed utf8 */
+ unsigned int flags;
+} rspamd_word_t;
+
+/**
+ * Vector of words using kvec
+ */
+typedef kvec_t(rspamd_word_t) rspamd_words_t;
+
+/* Legacy typedefs for backward compatibility */
+typedef rspamd_word_t rspamd_stat_token_t;
+
+/* Legacy flag aliases for backward compatibility */
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT
+#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION
+#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM
+#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF
+#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED RSPAMD_WORD_FLAG_NORMALISED
+#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE
+#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_WORD_H */