aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/stat_api.h
blob: 40a6bc71644251dda6c3cadaa64ec3b230e7af24 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef STAT_API_H_
#define STAT_API_H_

#include "config.h"
#include "task.h"
#include <lua.h>
#include "contrib/libev/ev.h"

/**
 * @file stat_api.h
 * High level statistics API
 */

/*
 * Token flag masks. Each constant occupies exactly one bit (bits 0..13),
 * so the masks can be combined with bitwise OR and tested with bitwise AND.
 * All of them have type unsigned int.
 */
#define RSPAMD_STAT_TOKEN_FLAG_TEXT             (0x0001u) /* bit 0 */
#define RSPAMD_STAT_TOKEN_FLAG_META             (0x0002u) /* bit 1 */
#define RSPAMD_STAT_TOKEN_FLAG_LUA_META         (0x0004u) /* bit 2 */
#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION        (0x0008u) /* bit 3 */
#define RSPAMD_STAT_TOKEN_FLAG_HEADER           (0x0010u) /* bit 4 */
#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM          (0x0020u) /* bit 5 */
#define RSPAMD_STAT_TOKEN_FLAG_UTF              (0x0040u) /* bit 6 */
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED       (0x0080u) /* bit 7 */
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED          (0x0100u) /* bit 8 */
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE   (0x0200u) /* bit 9 */
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD        (0x0400u) /* bit 10 */
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED          (0x0800u) /* bit 11 */
#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (0x1000u) /* bit 12 */
#define RSPAMD_STAT_TOKEN_FLAG_EMOJI            (0x2000u) /* bit 13 */

/**
 * A single token kept in several representations (raw, unicode,
 * normalized, stemmed) together with a flags word.
 * NOTE(review): `flags` presumably carries RSPAMD_STAT_TOKEN_FLAG_* bits;
 * confirm against the code that fills this structure.
 */
typedef struct rspamd_stat_token_s {
	rspamd_ftok_t original; /* raw utf8 text */
	rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized and lowercased */
	rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
	rspamd_ftok_t stemmed; /* stemmed utf8 */
	guint flags; /* flags word, see the note above the structure */
} rspamd_stat_token_t;

/**
 * Token node: a fixed header followed by a trailing array of gdouble values.
 *
 * `values` is a C99 flexible array member: it contributes no storage to
 * sizeof (rspamd_token_t), so the structure must be allocated with extra
 * room for however many values are needed.
 * NOTE(review): the number of values and the meaning of each slot are
 * decided by the allocating code, which is not visible in this header.
 */
typedef struct token_node_s {
	guint64 data; /* 64-bit payload; presumably the token hash/id - TODO confirm */
	guint window_idx; /* presumably index inside the tokenizer window - TODO confirm */
	guint flags; /* flags word; NOTE(review): likely RSPAMD_STAT_TOKEN_FLAG_* - confirm */
	rspamd_stat_token_t *t1; /* first referenced source token */
	rspamd_stat_token_t *t2; /* second referenced source token */
	gdouble values[]; /* flexible array member, sized at allocation time */
} rspamd_token_t;

/* Forward declaration: this header only passes pointers to the statistics
 * context, so the full definition is not needed here. */
struct rspamd_stat_ctx;

/**
 * Outcome of a statistics processing step.
 * The numeric values are spelled out explicitly for every enumerator.
 */
typedef enum rspamd_stat_result_e {
	RSPAMD_STAT_PROCESS_ERROR = 0,   /**< error */
	RSPAMD_STAT_PROCESS_DELAYED = 1, /**< need to do additional job for processing */
	RSPAMD_STAT_PROCESS_OK = 2       /**< all processed */
} rspamd_stat_result_t;

/**
 * Initialise statistics modules
 * @param cfg configuration (struct rspamd_config)
 * @param ev_base libev event loop (struct ev_loop);
 *        NOTE(review): how the loop is used is not visible in this header
 */
void rspamd_stat_init (struct rspamd_config *cfg, struct ev_loop *ev_base);

/**
 * Finalize statistics.
 * Takes no arguments and returns nothing.
 */
void rspamd_stat_close (void);

/**
 * Tokenize task
 * @param st_ctx statistics context
 * @param task task to tokenize
 */
void rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
							  struct rspamd_task *task);

/**
 * Classify the task specified and insert symbols if needed
 * @param task task to classify
 * @param L lua state
 * @param stage NOTE(review): processing stage selector; its accepted values
 *        are not documented in this header - confirm against the implementation
 * @param err error returned
 * @return processing status as rspamd_stat_result_t (error, delayed or ok).
 *         NOTE(review): this was previously documented as "TRUE if task has
 *         been classified", but the return type is an enum, not a boolean
 */
rspamd_stat_result_t rspamd_stat_classify (struct rspamd_task *task,
		lua_State *L, guint stage, GError **err);


/**
 * Check if a task should be learned and set the appropriate flags for it
 * @param task task to check
 * @return gboolean; presumably TRUE when the task should be learned -
 *         TODO confirm against the implementation
 */
gboolean rspamd_stat_check_autolearn (struct rspamd_task *task);

/**
 * Learn task as spam or ham, task must be processed prior to this call
 * @param task task to learn
 * @param spam if TRUE learn spam, otherwise learn ham
 * @param L lua state
 * @param classifier NULL to learn all classifiers, name to learn a specific one
 * @param stage NOTE(review): processing stage selector; its accepted values
 *        are not documented in this header - confirm against the implementation
 * @param err error returned
 * @return processing status as rspamd_stat_result_t (error, delayed or ok).
 *         NOTE(review): this was previously documented as "TRUE if task has
 *         been learned", but the return type is an enum, not a boolean
 */
rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task,
		gboolean spam, lua_State *L, const gchar *classifier,
		guint stage,
		GError **err);

/**
 * Get the overall statistics for all statfile backends
 * @param task task on whose behalf the statistics are requested
 *        (NOTE(review): exact use is not visible in this header)
 * @param cfg configuration
 * @param total_learns the total number of learns is stored here
 * @param res output object; presumably receives the array of statistical
 *        information that the older docs described as the return value -
 *        TODO confirm, including who owns the resulting object
 * @return processing status as rspamd_stat_result_t; the statistics
 *         themselves are not the return value
 */
rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task,
		struct rspamd_config *cfg,
		guint64 *total_learns,
		ucl_object_t **res);

/**
 * Unload the statistics subsystem.
 * NOTE(review): undocumented upstream; how this differs from
 * rspamd_stat_close () and the required call order must be confirmed
 * against the implementation.
 */
void rspamd_stat_unload (void);

#endif /* STAT_API_H_ */