/*- * Copyright 2016 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef STAT_API_H_ #define STAT_API_H_ #include "config.h" #include "task.h" #include #include /** * @file stat_api.h * High level statistics API */ #define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0) #define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1) #define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2) #define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3) #define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4) #define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5) #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6) #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7) #define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8) #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9) #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10) #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11) #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12) #define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13) typedef struct rspamd_stat_token_s { rspamd_ftok_t original; /* utf8 raw */ rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */ rspamd_ftok_t normalized; /* normalized and lowercased utf8 */ rspamd_ftok_t stemmed; /* stemmed utf8 */ guint flags; } rspamd_stat_token_t; typedef struct token_node_s { guint64 data; guint window_idx; guint flags; rspamd_stat_token_t *t1; rspamd_stat_token_t *t2; gdouble values[]; } rspamd_token_t; struct rspamd_stat_ctx; /** * The results of statistics processing: * - error * - need to do additional job for processing * - all processed */ typedef enum rspamd_stat_result_e { RSPAMD_STAT_PROCESS_ERROR = 0, RSPAMD_STAT_PROCESS_DELAYED = 1, RSPAMD_STAT_PROCESS_OK } rspamd_stat_result_t; /** * Initialise statistics modules * @param cfg */ void rspamd_stat_init (struct rspamd_config *cfg, struct event_base *ev_base); /** * Finalize statistics */ void rspamd_stat_close (void); /** * Tokenize task * @param st_ctx * @param task */ void rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task); /** * Classify the task specified and insert symbols if needed * @param task * @param L lua state * @param err error returned * @return TRUE if task has been classified */ rspamd_stat_result_t rspamd_stat_classify (struct rspamd_task *task, lua_State *L, guint stage, GError **err); /** * Check if a task should be learned and set the appropriate flags for it * @param task * @return */ gboolean rspamd_stat_check_autolearn (struct rspamd_task *task); /** * Learn task as spam or ham, task must be processed prior to this call * @param task task to learn * @param spam if TRUE learn spam, otherwise learn ham * @param L lua state * @param classifier NULL to learn all classifiers, name to learn a specific one * @param err error returned * @return TRUE if task has been learned */ rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L, const gchar *classifier, guint stage, GError **err); /** * Get the overall statistics for all statfile backends * @param cfg configuration * @param total_learns the total number of learns is stored here * @return array of statistical information */ rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task, struct rspamd_config *cfg, guint64 *total_learns, ucl_object_t **res); void rspamd_stat_unload (void); #endif /* STAT_API_H_ */