123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- /*
- * Copyright 2024 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #ifndef STAT_API_H_
- #define STAT_API_H_
-
- #include "config.h"
- #include "task.h"
- #include "lua/lua_common.h"
- #include "contrib/libev/ev.h"
-
- #ifdef __cplusplus
- extern "C" {
- #endif
-
- /**
- * @file stat_api.h
- * High level statistics API
- */
-
- #define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0)
- #define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
- #define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
- #define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
- #define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
- #define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
- #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
- #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
- #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
- #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
- #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
- #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
- #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
- #define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
-
- typedef struct rspamd_stat_token_s {
- rspamd_ftok_t original; /* utf8 raw */
- rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
- rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
- rspamd_ftok_t stemmed; /* stemmed utf8 */
- guint flags;
- } rspamd_stat_token_t;
-
- #define RSPAMD_TOKEN_VALUE_TYPE float
- typedef struct token_node_s {
- uint64_t data;
- guint window_idx;
- guint flags;
- rspamd_stat_token_t *t1;
- rspamd_stat_token_t *t2;
- RSPAMD_TOKEN_VALUE_TYPE values[0];
- } rspamd_token_t;
-
- struct rspamd_stat_ctx;
-
- /**
- * The results of statistics processing:
- * - error
- * - need to do additional job for processing
- * - all processed
- */
- typedef enum rspamd_stat_result_e {
- RSPAMD_STAT_PROCESS_ERROR = 0,
- RSPAMD_STAT_PROCESS_DELAYED = 1,
- RSPAMD_STAT_PROCESS_OK
- } rspamd_stat_result_t;
-
- /**
- * Initialise statistics modules
- * @param cfg
- */
- void rspamd_stat_init(struct rspamd_config *cfg, struct ev_loop *ev_base);
-
- /**
- * Finalize statistics
- */
- void rspamd_stat_close(void);
-
- /**
- * Tokenize task
- * @param st_ctx
- * @param task
- */
- void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
- struct rspamd_task *task);
-
- /**
- * Classify the task specified and insert symbols if needed
- * @param task
- * @param L lua state
- * @param err error returned
- * @return TRUE if task has been classified
- */
- rspamd_stat_result_t rspamd_stat_classify(struct rspamd_task *task,
- lua_State *L, guint stage, GError **err);
-
-
- /**
- * Check if a task should be learned and set the appropriate flags for it
- * @param task
- * @return
- */
- gboolean rspamd_stat_check_autolearn(struct rspamd_task *task);
-
- /**
- * Learn task as spam or ham, task must be processed prior to this call
- * @param task task to learn
- * @param spam if TRUE learn spam, otherwise learn ham
- * @param L lua state
- * @param classifier NULL to learn all classifiers, name to learn a specific one
- * @param err error returned
- * @return TRUE if task has been learned
- */
- rspamd_stat_result_t rspamd_stat_learn(struct rspamd_task *task,
- gboolean spam, lua_State *L, const gchar *classifier,
- guint stage,
- GError **err);
-
- /**
- * Get the overall statistics for all statfile backends
- * @param cfg configuration
- * @param total_learns the total number of learns is stored here
- * @return array of statistical information
- */
- rspamd_stat_result_t rspamd_stat_statistics(struct rspamd_task *task,
- struct rspamd_config *cfg,
- uint64_t *total_learns,
- ucl_object_t **res);
-
- void rspamd_stat_unload(void);
-
- #ifdef __cplusplus
- }
- #endif
-
- #endif /* STAT_API_H_ */
|