From ec0e4b788949b555d0fbd52c8a0b8f970a8ba4b0 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 12 Apr 2016 15:23:58 +0100 Subject: [PATCH] [Feature] Add implementation of abstract multipattern matcher --- src/libutil/CMakeLists.txt | 3 +- src/libutil/multipattern.c | 322 +++++++++++++++++++++++++++++++++++++ src/libutil/multipattern.h | 122 ++++++++++++++ 3 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 src/libutil/multipattern.c create mode 100644 src/libutil/multipattern.h diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt index de59b5fea..a229c7f0d 100644 --- a/src/libutil/CMakeLists.txt +++ b/src/libutil/CMakeLists.txt @@ -19,6 +19,7 @@ SET(LIBRSPAMDUTILSRC ${CMAKE_CURRENT_SOURCE_DIR}/str_util.c ${CMAKE_CURRENT_SOURCE_DIR}/upstream.c ${CMAKE_CURRENT_SOURCE_DIR}/util.c - ${CMAKE_CURRENT_SOURCE_DIR}/heap.c) + ${CMAKE_CURRENT_SOURCE_DIR}/heap.c + ${CMAKE_CURRENT_SOURCE_DIR}/multipattern.c) # Rspamdutil SET(RSPAMD_UTIL ${LIBRSPAMDUTILSRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c new file mode 100644 index 000000000..967a5115a --- /dev/null +++ b/src/libutil/multipattern.c @@ -0,0 +1,322 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "multipattern.h" + +#ifdef WITH_HYPERSCAN +#include "hs.h" +#else +#include "acism.h" +#endif + +struct rspamd_multipattern { +#ifdef WITH_HYPERSCAN + hs_database_t *db; + hs_scratch_t *scratch; + GArray *hs_pats; + GArray *hs_ids; + GArray *hs_flags; +#else + ac_trie_t *t; + GArray *pats; +#endif + gboolean compiled; + guint cnt; + enum rspamd_multipattern_flags flags; +}; + +static GQuark +rspamd_multipattern_quark (void) +{ + return g_quark_from_static_string ("multipattern"); +} + +/* + * Escapes special characters from specific pattern + */ +static gchar * +rspamd_multipattern_pattern_filter (const gchar *pattern, + enum rspamd_multipattern_flags flags) +{ + /* + * TODO: implement patterns filtering + */ + return strdup (pattern); +} + +struct rspamd_multipattern * +rspamd_multipattern_create (enum rspamd_multipattern_flags flags) +{ + struct rspamd_multipattern *mp; + + mp = g_slice_alloc0 (sizeof (*mp)); + mp->flags = flags; + +#ifdef WITH_HYPERSCAN + mp->hs_pats = g_array_new (FALSE, TRUE, sizeof (gchar *)); + mp->hs_flags = g_array_new (FALSE, TRUE, sizeof (gint)); + mp->hs_ids = g_array_new (FALSE, TRUE, sizeof (gint)); +#else + mp->pats = g_array_new (FALSE, TRUE, sizeof (ac_trie_pat_t)); +#endif + + return mp; +} + +struct rspamd_multipattern * +rspamd_multipattern_create_sized (guint npatterns, + enum rspamd_multipattern_flags flags) +{ + struct rspamd_multipattern *mp; + + g_assert (npatterns > 0); + + mp = g_slice_alloc0 (sizeof (*mp)); + mp->flags = flags; + +#ifdef WITH_HYPERSCAN + mp->hs_pats = g_array_sized_new (FALSE, TRUE, sizeof (gchar *), npatterns); + mp->hs_flags = g_array_sized_new (FALSE, TRUE, sizeof (gint), npatterns); + mp->hs_ids = g_array_sized_new (FALSE, TRUE, sizeof (gint), npatterns); +#else + mp->pats = g_array_sized_new (FALSE, TRUE, sizeof (ac_trie_pat_t), npatterns); +#endif + + return mp; +} + +void +rspamd_multipattern_add_pattern (struct rspamd_multipattern *mp, + const gchar *pattern) +{ + g_assert (pattern != NULL); + g_assert (mp != NULL); + g_assert (!mp->compiled); + +#ifdef WITH_HYPERSCAN + gchar *np; + gint fl = 0; + + if (mp->flags & RSPAMD_MULTIPATTERN_ICASE) { + fl |= HS_FLAG_CASELESS; + } + if (mp->flags & RSPAMD_MULTIPATTERN_UTF8) { + fl |= HS_FLAG_UTF8; + } + + g_array_append_val (mp->hs_flags, fl); + np = rspamd_multipattern_pattern_filter (pattern, mp->flags); + g_array_append_val (mp->hs_pats, np); + fl = mp->cnt; + g_array_append_val (mp->hs_ids, fl); +#else + ac_trie_pat_t pat; + + pat.ptr = rspamd_multipattern_pattern_filter (pattern, mp->flags); + pat.len = strlen (pat.ptr); + + g_array_append_val (mp->pats, pat); +#endif + + mp->cnt ++; +} + +struct rspamd_multipattern * +rspamd_multipattern_create_full (const gchar **patterns, + guint npatterns, enum rspamd_multipattern_flags flags) +{ + struct rspamd_multipattern *mp; + guint i; + + g_assert (npatterns > 0); + g_assert (patterns != NULL); + + mp = rspamd_multipattern_create_sized (npatterns, flags); + + for (i = 0; i < npatterns; i++) { + rspamd_multipattern_add_pattern (mp, patterns[i]); + } + + return mp; +} + +gboolean +rspamd_multipattern_compile (struct rspamd_multipattern *mp, GError **err) +{ + g_assert (mp != NULL); + g_assert (!mp->compiled); + +#ifdef WITH_HYPERSCAN + hs_platform_info_t plt; + hs_compile_error_t *hs_errors; + + g_assert (hs_populate_platform (&plt) == HS_SUCCESS); + + if (hs_compile_multi ((const char *const *)mp->hs_pats->data, + (const unsigned int *)mp->hs_flags->data, + (const unsigned int *)mp->hs_ids->data, + mp->cnt, + HS_MODE_BLOCK, + &plt, + &mp->db, + &hs_errors) != HS_SUCCESS) { + + g_set_error (err, rspamd_multipattern_quark (), EINVAL, + "cannot create tree of regexp when processing '%s': %s", + g_array_index (mp->hs_pats, char *, hs_errors->expression), + hs_errors->message); + hs_free_compile_error (hs_errors); + + return FALSE; + } + + g_assert (hs_alloc_scratch (mp->db, &mp->scratch) == HS_SUCCESS); +#else + mp->t = acism_create (mp->pats->data, mp->cnt); +#endif + mp->compiled = TRUE; + + return TRUE; +} + +struct rspamd_multipattern_cbdata { + struct rspamd_multipattern *mp; + const gchar *in; + gsize len; + rspamd_multipattern_cb_t cb; + gpointer ud; + guint nfound; + gint ret; +}; + +#ifdef WITH_HYPERSCAN +static gint +rspamd_multipattern_hs_cb (unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *ud) +{ + struct rspamd_multipattern_cbdata *cbd = ud; + gint ret; + + ret = cbd->cb (cbd->mp, id, to, cbd->in, cbd->len, cbd->ud); + + cbd->nfound ++; + cbd->ret = ret; + + return ret; +} +#else +static gint +rspamd_multipattern_acism_cb (int strnum, int textpos, void *context) +{ + struct rspamd_multipattern_cbdata *cbd = context; + gint ret; + + ret = cbd->cb (cbd->mp, strnum, textpos, cbd->in, cbd->len, cbd->ud); + + cbd->nfound ++; + cbd->ret = ret; + + return ret; +} +#endif + +gint +rspamd_multipattern_lookup (struct rspamd_multipattern *mp, + const gchar *in, gsize len, rspamd_multipattern_cb_t cb, + gpointer ud, guint *pnfound) +{ + struct rspamd_multipattern_cbdata cbd; + gint ret = 0; + + g_assert (mp != NULL); + g_assert (mp->compiled); + + cbd.mp = mp; + cbd.in = in; + cbd.len = len; + cbd.cb = cb; + cbd.ud = ud; + cbd.nfound = 0; + cbd.ret = 0; + +#ifdef WITH_HYPERSCAN + ret = hs_scan (mp->db, in, len, 0, mp->scratch, + rspamd_multipattern_hs_cb, &cbd); + + if (ret == HS_SUCCESS) { + ret = 0; + } + else if (ret == HS_SCAN_TERMINATED) { + ret = cbd.ret; + } +#else + gint state = 0; + + ret = acism_lookup (mp->t, in, len, rspamd_multipattern_acism_cb, &cbd, + &state, mp->flags & RSPAMD_MULTIPATTERN_ICASE); +#endif + + if (pnfound) { + *pnfound = cbd.nfound; + } + + return ret; +} + + +void +rspamd_multipattern_destroy (struct rspamd_multipattern *mp) +{ + guint i; + + if (mp) { +#ifdef WITH_HYPERSCAN + gchar *p; + + if (mp->compiled) { + hs_free_scratch (mp->scratch); + hs_free_database (mp->db); + } + + for (i = 0; i < mp->cnt; i ++) { + p = g_array_index (mp->hs_pats, gchar *, i); + g_free (p); + } + + g_array_free (mp->hs_pats, TRUE); + g_array_free (mp->hs_ids, TRUE); + g_array_free (mp->hs_flags, TRUE); +#else + ac_trie_pat_t pat; + + if (mp->compiled) { + acism_destroy (mp->t); + } + + for (i = 0; i < mp->cnt; i ++) { + pat = g_array_index (mp->pats, ac_trie_pat_t, i); + g_free ((gchar *)pat.ptr); + } + + g_array_free (mp->pats, TRUE); +#endif + g_slice_free1 (sizeof (*mp), mp); + } +} diff --git a/src/libutil/multipattern.h b/src/libutil/multipattern.h new file mode 100644 index 000000000..45ade2743 --- /dev/null +++ b/src/libutil/multipattern.h @@ -0,0 +1,122 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_LIBUTIL_MULTIPATTERN_H_ +#define SRC_LIBUTIL_MULTIPATTERN_H_ + +#include "config.h" + +/** + * @file multipattern.h + * + * This file defines structure that acts like a transparent bridge between + * hyperscan and ac-trie + */ + +enum rspamd_multipattern_flags { + RSPAMD_MULTIPATTERN_DEFAULT = 0, + RSPAMD_MULTIPATTERN_ICASE = (1 << 0), + RSPAMD_MULTIPATTERN_UTF8 = (1 << 1), + RSPAMD_MULTIPATTERN_TLD = (1 << 2), + /* Not supported by acism */ + RSPAMD_MULTIPATTERN_GLOB = (1 << 3), + RSPAMD_MULTIPATTERN_RE = (1 << 4), +}; + +struct rspamd_multipattern; + +/** + * Called on pattern match + * @param mp multipattern structure + * @param strnum number of pattern matched + * @param textpos position in the text + * @param text input text + * @param len length of input text + * @param context userdata + * @return if 0 then search for another pattern, otherwise return this value to caller + */ +typedef gint (*rspamd_multipattern_cb_t) (struct rspamd_multipattern *mp, + guint strnum, + gint textpos, + const gchar *text, + gsize len, + void *context); + +/** + * Creates empty multipattern structure + * @param flags + * @return + */ +struct rspamd_multipattern *rspamd_multipattern_create ( + enum rspamd_multipattern_flags flags); + +/** + * Creates multipattern with preallocated number of patterns to speed up loading + * @param flags + * @param reserved + * @return + */ +struct rspamd_multipattern *rspamd_multipattern_create_sized ( + enum rspamd_multipattern_flags flags, guint reserved); + +/** + * Creates new multipattern structure + * @param patterns vector of null terminated strings + * @param npatterns number of patterns + * @param flags flags applied to all patterns + * @return new multipattern structure + */ +struct rspamd_multipattern *rspamd_multipattern_create_full ( + const gchar **patterns, + guint npatterns, + enum rspamd_multipattern_flags flags); + +/** + * Adds new pattern to match engine + * @param mp + * @param pattern + */ +void rspamd_multipattern_add_pattern (struct rspamd_multipattern *mp, + const gchar *pattern); + +/** + * Compiles multipattern structure + * @param mp + * @return + */ +gboolean rspamd_multipattern_compile (struct rspamd_multipattern *mp, + GError **err); + +/** + * Lookups for patterns in a text using the specified callback function + * @param mp + * @param in + * @param len + * @param cb if callback returns non-zero, then search is terminated and that value is returned + * @param ud calback data + * @return + */ +gint rspamd_multipattern_lookup (struct rspamd_multipattern *mp, + const gchar *in, gsize len, rspamd_multipattern_cb_t cb, + gpointer ud, guint *pnfound); + +/** + * Destroys multipattern structure + * @param mp + */ +void rspamd_multipattern_destroy (struct rspamd_multipattern *mp); + +#endif /* SRC_LIBUTIL_MULTIPATTERN_H_ */ -- 2.39.5