@@ -901,7 +901,8 @@ INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src" | |||
"${CMAKE_SOURCE_DIR}/contrib/snowball/include" | |||
"${CMAKE_SOURCE_DIR}/contrib/siphash" | |||
"${CMAKE_SOURCE_DIR}/contrib/blake2" | |||
"${CMAKE_SOURCE_DIR}/contrib/librdns") | |||
"${CMAKE_SOURCE_DIR}/contrib/librdns" | |||
"${CMAKE_SOURCE_DIR}/contrib/aho-corasic") | |||
################################ SUBDIRS SECTION ########################### | |||
@@ -957,6 +958,7 @@ ADD_SUBDIRECTORY(contrib/siphash) | |||
ADD_SUBDIRECTORY(contrib/blake2) | |||
ADD_SUBDIRECTORY(contrib/libucl) | |||
ADD_SUBDIRECTORY(contrib/librdns) | |||
ADD_SUBDIRECTORY(contrib/aho-corasic) | |||
ADD_SUBDIRECTORY(src) | |||
ADD_SUBDIRECTORY(test) |
@@ -0,0 +1,8 @@ | |||
SET(AHOCORASICSRC acism_create.c | |||
acism.c) | |||
if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") | |||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") | |||
endif () | |||
ADD_LIBRARY(rspamd-actrie SHARED ${AHOCORASICSRC}) | |||
INSTALL(TARGETS rspamd-actrie | |||
LIBRARY DESTINATION lib) |
@@ -0,0 +1,165 @@ | |||
GNU LESSER GENERAL PUBLIC LICENSE | |||
Version 3, 29 June 2007 | |||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> | |||
Everyone is permitted to copy and distribute verbatim copies | |||
of this license document, but changing it is not allowed. | |||
This version of the GNU Lesser General Public License incorporates | |||
the terms and conditions of version 3 of the GNU General Public | |||
License, supplemented by the additional permissions listed below. | |||
0. Additional Definitions. | |||
As used herein, "this License" refers to version 3 of the GNU Lesser | |||
General Public License, and the "GNU GPL" refers to version 3 of the GNU | |||
General Public License. | |||
"The Library" refers to a covered work governed by this License, | |||
other than an Application or a Combined Work as defined below. | |||
An "Application" is any work that makes use of an interface provided | |||
by the Library, but which is not otherwise based on the Library. | |||
Defining a subclass of a class defined by the Library is deemed a mode | |||
of using an interface provided by the Library. | |||
A "Combined Work" is a work produced by combining or linking an | |||
Application with the Library. The particular version of the Library | |||
with which the Combined Work was made is also called the "Linked | |||
Version". | |||
The "Minimal Corresponding Source" for a Combined Work means the | |||
Corresponding Source for the Combined Work, excluding any source code | |||
for portions of the Combined Work that, considered in isolation, are | |||
based on the Application, and not on the Linked Version. | |||
The "Corresponding Application Code" for a Combined Work means the | |||
object code and/or source code for the Application, including any data | |||
and utility programs needed for reproducing the Combined Work from the | |||
Application, but excluding the System Libraries of the Combined Work. | |||
1. Exception to Section 3 of the GNU GPL. | |||
You may convey a covered work under sections 3 and 4 of this License | |||
without being bound by section 3 of the GNU GPL. | |||
2. Conveying Modified Versions. | |||
If you modify a copy of the Library, and, in your modifications, a | |||
facility refers to a function or data to be supplied by an Application | |||
that uses the facility (other than as an argument passed when the | |||
facility is invoked), then you may convey a copy of the modified | |||
version: | |||
a) under this License, provided that you make a good faith effort to | |||
ensure that, in the event an Application does not supply the | |||
function or data, the facility still operates, and performs | |||
whatever part of its purpose remains meaningful, or | |||
b) under the GNU GPL, with none of the additional permissions of | |||
this License applicable to that copy. | |||
3. Object Code Incorporating Material from Library Header Files. | |||
The object code form of an Application may incorporate material from | |||
a header file that is part of the Library. You may convey such object | |||
code under terms of your choice, provided that, if the incorporated | |||
material is not limited to numerical parameters, data structure | |||
layouts and accessors, or small macros, inline functions and templates | |||
(ten or fewer lines in length), you do both of the following: | |||
a) Give prominent notice with each copy of the object code that the | |||
Library is used in it and that the Library and its use are | |||
covered by this License. | |||
b) Accompany the object code with a copy of the GNU GPL and this license | |||
document. | |||
4. Combined Works. | |||
You may convey a Combined Work under terms of your choice that, | |||
taken together, effectively do not restrict modification of the | |||
portions of the Library contained in the Combined Work and reverse | |||
engineering for debugging such modifications, if you also do each of | |||
the following: | |||
a) Give prominent notice with each copy of the Combined Work that | |||
the Library is used in it and that the Library and its use are | |||
covered by this License. | |||
b) Accompany the Combined Work with a copy of the GNU GPL and this license | |||
document. | |||
c) For a Combined Work that displays copyright notices during | |||
execution, include the copyright notice for the Library among | |||
these notices, as well as a reference directing the user to the | |||
copies of the GNU GPL and this license document. | |||
d) Do one of the following: | |||
0) Convey the Minimal Corresponding Source under the terms of this | |||
License, and the Corresponding Application Code in a form | |||
suitable for, and under terms that permit, the user to | |||
recombine or relink the Application with a modified version of | |||
the Linked Version to produce a modified Combined Work, in the | |||
manner specified by section 6 of the GNU GPL for conveying | |||
Corresponding Source. | |||
1) Use a suitable shared library mechanism for linking with the | |||
Library. A suitable mechanism is one that (a) uses at run time | |||
a copy of the Library already present on the user's computer | |||
system, and (b) will operate properly with a modified version | |||
of the Library that is interface-compatible with the Linked | |||
Version. | |||
e) Provide Installation Information, but only if you would otherwise | |||
be required to provide such information under section 6 of the | |||
GNU GPL, and only to the extent that such information is | |||
necessary to install and execute a modified version of the | |||
Combined Work produced by recombining or relinking the | |||
Application with a modified version of the Linked Version. (If | |||
you use option 4d0, the Installation Information must accompany | |||
the Minimal Corresponding Source and Corresponding Application | |||
Code. If you use option 4d1, you must provide the Installation | |||
Information in the manner specified by section 6 of the GNU GPL | |||
for conveying Corresponding Source.) | |||
5. Combined Libraries. | |||
You may place library facilities that are a work based on the | |||
Library side by side in a single library together with other library | |||
facilities that are not Applications and are not covered by this | |||
License, and convey such a combined library under terms of your | |||
choice, if you do both of the following: | |||
a) Accompany the combined library with a copy of the same work based | |||
on the Library, uncombined with any other library facilities, | |||
conveyed under the terms of this License. | |||
b) Give prominent notice with the combined library that part of it | |||
is a work based on the Library, and explaining where to find the | |||
accompanying uncombined form of the same work. | |||
6. Revised Versions of the GNU Lesser General Public License. | |||
The Free Software Foundation may publish revised and/or new versions | |||
of the GNU Lesser General Public License from time to time. Such new | |||
versions will be similar in spirit to the present version, but may | |||
differ in detail to address new problems or concerns. | |||
Each version is given a distinguishing version number. If the | |||
Library as you received it specifies that a certain numbered version | |||
of the GNU Lesser General Public License "or any later version" | |||
applies to it, you have the option of following the terms and | |||
conditions either of that published version or of any later version | |||
published by the Free Software Foundation. If the Library as you | |||
received it does not specify a version number of the GNU Lesser | |||
General Public License, you may choose any version of the GNU Lesser | |||
General Public License ever published by the Free Software Foundation. | |||
If the Library as you received it specifies that a proxy can decide | |||
whether future versions of the GNU Lesser General Public License shall | |||
apply, that proxy's public statement of acceptance of any version is | |||
permanent authorization for you to choose that version for the | |||
Library. |
@@ -0,0 +1,67 @@ | |||
aho-corasick | |||
== | |||
Aho-Corasick parallel string search, using interleaved arrays. | |||
Mischa Sandberg mischasan@gmail.com | |||
ACISM is an implementation of Aho-Corasick parallel string search, | |||
using an Interleaved State-transition Matrix. | |||
It combines the fastest possible Aho-Corasick implementation, | |||
with the smallest possible data structure (!). | |||
FEATURES | |||
-------- | |||
* Fast. No hashing, no tree traversal; just a straight look-up equivalent to | |||
matrix[state, input-byte] per input character. | |||
* Tiny. On average, the whole data structure (mostly the array) takes about 2-3 bytes per | |||
input pattern byte. The original set of pattern strings can be reverse-generated from the machine. | |||
* Shareable. The state machine contains no pointers, so it can be compiled once, | |||
then memory-mapped by many processes. | |||
* Searches byte vectors, not null-terminated strings. | |||
Suitable for searching machine code as much as searching text. | |||
* DOS-proof. Well, that's an attribute of Aho-Corasick, | |||
so no real points for that. | |||
* Stream-ready. The state can be saved between calls to search data. | |||
DOCUMENTATION | |||
------------- | |||
The GoogleDocs description is at http://goo.gl/lE6zG | |||
I originally called it "psearch", but found that name was overused by other authors. | |||
LICENSE | |||
------- | |||
Though I've had strong suggestions to go with BSD license, I'm going with GPL2 until I figure out | |||
how to keep in touch with people who download and use the code. Hence the "CONTACT ME IF..." line in the license. | |||
GETTING STARTED | |||
--------------- | |||
Download the source, type "gmake". | |||
"gmake install" exports lib/libacism.a, include/acism.h and bin/acism_x. | |||
"acism_x.c" is a good example of calling acism_create and acism_scan/acism_more. | |||
(If you're interested in the GNUmakefile and rules.mk, | |||
check my blog posts on non-recursive make, at mischasan.wordpress.com.) | |||
HISTORY | |||
------- | |||
The interleaved-array approach was tried and discarded in the late 70's, because the compile time was O(n^2). | |||
acism_create beats the problem with a "hint" array that tracks the restart points for searches. | |||
That, plus discarding the original idea of how to get maximal density, resulted in the tiny-fast win-win. | |||
ACKNOWLEDGEMENTS | |||
---------------- | |||
I'd like to thank Mike Shannon, who wanted to see a machine built to make best use of L1/L2 cache. | |||
The change to do that doubled performance on hardware with a much larger cache than the matrix. | |||
Go figure. |
@@ -0,0 +1,105 @@ | |||
/* | |||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com> | |||
** | |||
** This program is free software; you can redistribute it and/or modify | |||
** it under the terms of the GNU Lesser General Public License Version as | |||
** published by the Free Software Foundation. You may not use, modify or | |||
** distribute this program under any other version of the GNU Lesser General | |||
** Public License. | |||
** | |||
** This program is distributed in the hope that it will be useful, | |||
** but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
** GNU Lesser General Public License for more details. | |||
** | |||
** You should have received a copy of the GNU Lesser General Public License | |||
** along with this program; if not, write to the Free Software | |||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |||
*/ | |||
#ifndef _ACISM_H | |||
#define _ACISM_H | |||
#include <stdint.h> | |||
#include <stdlib.h> // malloc | |||
#include <string.h> // memcpy | |||
typedef int (*qsort_cmp)(const void *, const void *); | |||
#ifndef ACISM_SIZE | |||
# define ACISM_SIZE 4 | |||
#endif | |||
#if ACISM_SIZE == 8 | |||
typedef uint64_t TRAN, STATE, STRNO; | |||
# define SYM_BITS 9U | |||
# define SYM_MASK 511 | |||
#else | |||
typedef uint32_t TRAN, STATE, STRNO; | |||
# define SYM_BITS psp->sym_bits | |||
# define SYM_MASK psp->sym_mask | |||
#endif | |||
typedef uint16_t SYMBOL; | |||
typedef unsigned _SYMBOL; // An efficient stacklocal SYMBOL | |||
enum { | |||
IS_MATCH = (TRAN)1 << (8*sizeof(TRAN) - 1), | |||
IS_SUFFIX = (TRAN)1 << (8*sizeof(TRAN) - 2), | |||
T_FLAGS = IS_MATCH | IS_SUFFIX | |||
}; | |||
typedef struct { STATE state; STRNO strno; } STRASH; | |||
typedef enum { BASE=2, USED=1 } USES; | |||
struct acism { | |||
TRAN* tranv; | |||
STRASH* hashv; | |||
unsigned flags; | |||
# define IS_MMAP 1 | |||
#if ACISM_SIZE < 8 | |||
TRAN sym_mask; | |||
unsigned sym_bits; | |||
#endif | |||
unsigned hash_mod; // search hashv starting at (state + sym) % hash_mod. | |||
unsigned hash_size; // #(hashv): hash_mod plus the overflows past [hash_mod-1] | |||
unsigned tran_size; // #(tranv) | |||
unsigned nsyms, nchars, nstrs, maxlen; | |||
SYMBOL symv[256]; | |||
}; | |||
#include "acism.h" | |||
// p_size: size of tranv + hashv | |||
static inline unsigned p_size(ac_trie_t const *psp) | |||
{ return psp->hash_size * sizeof*psp->hashv | |||
+ psp->tran_size * sizeof*psp->tranv; } | |||
static inline unsigned p_hash(ac_trie_t const *psp, STATE s) | |||
{ return s * 107 % psp->hash_mod; } | |||
static inline void set_tranv(ac_trie_t *psp, void *mem) | |||
{ psp->hashv = (STRASH*)&(psp->tranv = mem)[psp->tran_size]; } | |||
// TRAN accessors. For ACISM_SIZE=8, SYM_{BITS,MASK} do not use psp. | |||
static inline TRAN p_tran(ac_trie_t const *psp, STATE s, _SYMBOL sym) | |||
{ return psp->tranv[s + sym] ^ sym; } | |||
static inline _SYMBOL t_sym(ac_trie_t const *psp, TRAN t) | |||
{ (void)psp; return t & SYM_MASK; } | |||
static inline STATE t_next(ac_trie_t const *psp, TRAN t) | |||
{ (void)psp; return (t & ~T_FLAGS) >> SYM_BITS; } | |||
static inline int t_isleaf(ac_trie_t const *psp, TRAN t) | |||
{ return t_next(psp, t) >= psp->tran_size; } | |||
static inline int t_strno(ac_trie_t const *psp, TRAN t) | |||
{ return t_next(psp, t) - psp->tran_size; } | |||
static inline _SYMBOL t_valid(ac_trie_t const *psp, TRAN t) | |||
{ return !t_sym(psp, t); } | |||
#endif//_ACISM_H |
@@ -0,0 +1,107 @@ | |||
/* | |||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com> | |||
** | |||
** This program is free software; you can redistribute it and/or modify | |||
** it under the terms of the GNU Lesser General Public License Version as | |||
** published by the Free Software Foundation. You may not use, modify or | |||
** distribute this program under any other version of the GNU Lesser General | |||
** Public License. | |||
** | |||
** This program is distributed in the hope that it will be useful, | |||
** but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
** GNU Lesser General Public License for more details. | |||
** | |||
** You should have received a copy of the GNU Lesser General Public License | |||
** along with this program; if not, write to the Free Software | |||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |||
*/ | |||
#include "_acism.h" | |||
#define BACK ((SYMBOL)0) | |||
#define ROOT ((STATE) 0) | |||
int | |||
acism_more(ac_trie_t const *psp, MEMREF const text, | |||
ACISM_ACTION *cb, void *context, int *statep) | |||
{ | |||
ac_trie_t const ps = *psp; | |||
char const *cp = text.ptr, *endp = cp + text.len; | |||
STATE state = *statep; | |||
int ret = 0; | |||
while (cp < endp) { | |||
_SYMBOL sym = ps.symv[(uint8_t)*cp++]; | |||
if (!sym) { | |||
// Input byte is not in any pattern string. | |||
state = ROOT; | |||
continue; | |||
} | |||
// Search for a valid transition from this (state, sym), | |||
// following the backref chain. | |||
TRAN next; | |||
while (!t_valid(&ps, next = p_tran(&ps, state, sym)) && state != ROOT) { | |||
TRAN back = p_tran(&ps, state, BACK); | |||
state = t_valid(&ps, back) ? t_next(&ps, back) : ROOT; | |||
} | |||
if (!t_valid(&ps, next)) | |||
continue; | |||
if (!(next & (IS_MATCH | IS_SUFFIX))) { | |||
// No complete match yet; keep going. | |||
state = t_next(&ps, next); | |||
continue; | |||
} | |||
// At this point, one or more patterns have matched. | |||
// Find all matches by following the backref chain. | |||
// A valid node for (sym) with no SUFFIX flag marks the | |||
// end of the suffix chain. | |||
// In the same backref traversal, find a new (state), | |||
// if the original transition is to a leaf. | |||
STATE s = state; | |||
// Initially state is ROOT. The chain search saves the | |||
// first state from which the next char has a transition. | |||
state = t_isleaf(&ps, next) ? 0 : t_next(&ps, next); | |||
while (1) { | |||
if (t_valid(&ps, next)) { | |||
if (next & IS_MATCH) { | |||
unsigned strno, ss = s + sym, i; | |||
if (t_isleaf(&ps, ps.tranv[ss])) { | |||
strno = t_strno(&ps, ps.tranv[ss]); | |||
} else { | |||
for (i = p_hash(&ps, ss); ps.hashv[i].state != ss; ++i); | |||
strno = ps.hashv[i].strno; | |||
} | |||
if ((ret = cb(strno, cp - text.ptr, context))) | |||
goto EXIT; | |||
} | |||
if (!state && !t_isleaf(&ps, next)) | |||
state = t_next(&ps, next); | |||
if ( state && !(next & IS_SUFFIX)) | |||
break; | |||
} | |||
if (s == ROOT) | |||
break; | |||
TRAN b = p_tran(&ps, s, BACK); | |||
s = t_valid(&ps, b) ? t_next(&ps, b) : ROOT; | |||
next = p_tran(&ps, s, sym); | |||
} | |||
} | |||
EXIT: | |||
return *statep = state, ret; | |||
} | |||
//EOF |
@@ -0,0 +1,60 @@ | |||
/* | |||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com> | |||
** Copyright (C) 2015 Vsevolod Stakhov <vsevolod@highsecure.ru> | |||
** | |||
** This program is free software; you can redistribute it and/or modify | |||
** it under the terms of the GNU Lesser General Public License as | |||
** published by the Free Software Foundation. You may not use, modify or | |||
** distribute this program under any other version of the GNU Lesser General | |||
** Public License. | |||
** | |||
** This program is distributed in the hope that it will be useful, | |||
** but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
** GNU Lesser General Public License for more details. | |||
** | |||
** You should have received a copy of the GNU Lesser General Public License | |||
** along with this program; if not, write to the Free Software | |||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |||
*/ | |||
#ifndef ACISM_H | |||
#define ACISM_H | |||
// "acism" uses MEMREF {ptr,len} bytevec structs for "string" args, | |||
// rather than NUL-terminated "C" strings. | |||
#ifndef MSUTIL_H | |||
#include <stdio.h> | |||
typedef struct { char const *ptr; size_t len; } MEMREF; | |||
#endif | |||
typedef struct acism ac_trie_t; | |||
ac_trie_t* acism_create(MEMREF const *strv, int nstrs); | |||
void acism_destroy(ac_trie_t*); | |||
// For each match, acism_scan calls its ACISM_ACTION fn, | |||
// giving it the strv[] index of the matched string, | |||
// and the text[] offset of the byte PAST the end of the string. | |||
// If ACISM_ACTION returns 0, search continues; otherwise, | |||
// acism_more returns that nonzero value immediately. | |||
typedef int (ACISM_ACTION)(int strnum, int textpos, void *context); | |||
// If sequential blocks of (text) are passed to repeated acism_more calls, | |||
// then search continues where the previous acism_more left off -- | |||
// string matches can cross block boundaries. | |||
// *state should initially be (0). | |||
int acism_more(ac_trie_t const*, MEMREF const text, | |||
ACISM_ACTION *fn, void *fndata, int *state); | |||
static inline int acism_scan(ac_trie_t const*psp, MEMREF const text, | |||
ACISM_ACTION *fn, void *fndata) | |||
{ | |||
int state = 0; | |||
return acism_more(psp, text, fn, fndata, &state); | |||
} | |||
#endif//ACISM_H |
@@ -0,0 +1,396 @@ | |||
/* | |||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com> | |||
** | |||
** This program is free software; you can redistribute it and/or modify | |||
** it under the terms of the GNU Lesser General Public License Version as | |||
** published by the Free Software Foundation. You may not use, modify or | |||
** distribute this program under any other version of the GNU Lesser General | |||
** Public License. | |||
** | |||
** This program is distributed in the hope that it will be useful, | |||
** but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
** GNU Lesser General Public License for more details. | |||
** | |||
** You should have received a copy of the GNU Lesser General Public License | |||
** along with this program; if not, write to the Free Software | |||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |||
*/ | |||
#include "_acism.h" | |||
typedef struct tnode { | |||
struct tnode *child, *next, *back; | |||
union { unsigned nrefs; STATE state; } x; | |||
STRNO match; | |||
SYMBOL sym, is_suffix; | |||
} TNODE; | |||
//--------------|--------------------------------------------- | |||
// bitwid: 1+floor(log2(u)) | |||
static inline int bitwid(unsigned u) | |||
{ | |||
int ret = !!u; | |||
if (u & 0xFFFF0000) u >>= 16, ret += 16; | |||
if (u & 0x0000FF00) u >>= 8, ret += 8; | |||
if (u & 0x000000F0) u >>= 4, ret += 4; | |||
if (u & 0x0000000C) u >>= 2, ret += 2; | |||
if (u & 0x00000002) ret++; | |||
return ret; | |||
} | |||
static void fill_symv(ac_trie_t*, MEMREF const*, int ns); | |||
static int create_tree(TNODE*, SYMBOL const*symv, MEMREF const*strv, int nstrs); | |||
static void add_backlinks(TNODE*, TNODE**, TNODE**); | |||
static void prune_backlinks(TNODE*); | |||
static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**); | |||
static void fill_tranv(ac_trie_t*, TNODE const*); | |||
static void fill_hashv(ac_trie_t*, TNODE const*, int nn); | |||
static TNODE* find_child(TNODE*, SYMBOL); | |||
// (ns) is either a STATE, or a (STRNO + tran_size) | |||
static inline void | |||
set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns) | |||
{ | |||
psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0) | |||
| (suffix ? IS_SUFFIX : 0) | |||
| (ns << SYM_BITS); | |||
} | |||
// Track statistics for construction | |||
#ifdef ACISM_STATS | |||
typedef struct { long long val; const char *name; } PSSTAT; | |||
extern PSSTAT psstat[]; | |||
# define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n}) | |||
# define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id) | |||
#else | |||
# define NOTE(n) (void)0 | |||
# define HIT(id) (void)0 | |||
#endif //ACISM_STATS | |||
//--------------|--------------------------------------------- | |||
ac_trie_t* | |||
acism_create(MEMREF const* strv, int nstrs) | |||
{ | |||
TNODE *tp, **v1 = NULL, **v2 = NULL; | |||
ac_trie_t *psp = calloc(1, sizeof*psp); | |||
fill_symv(psp, strv, nstrs); | |||
TNODE *troot = calloc(psp->nchars + 1, sizeof*troot); | |||
int nnodes = create_tree(troot, psp->symv, strv, nstrs); | |||
NOTE(nnodes); | |||
// v1, v2: breadth-first work vectors for add_backlink and interleave. | |||
int nhash, i = (nstrs + 1) * sizeof*tp; | |||
add_backlinks(troot, v1 = malloc(i), v2 = malloc(i)); | |||
for (tp = troot + nnodes, nhash = 0; --tp > troot;) { | |||
prune_backlinks(tp); | |||
nhash += tp->match && tp->child; | |||
} | |||
// Calculate each node's offset in tranv[]: | |||
psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2); | |||
if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2) | |||
goto FAIL; | |||
if (nhash) { | |||
// Hash table is for match info of non-leaf nodes (only). | |||
// Set hash_size for p_size(psp): | |||
psp->hash_mod = nhash * 5 / 4 + 1; | |||
// Initially oversize the table for overflows without wraparound. | |||
psp->hash_size = psp->hash_mod + nhash; | |||
} | |||
set_tranv(psp, calloc(p_size(psp), 1)); | |||
if (!psp->tranv) goto FAIL; | |||
fill_tranv(psp, troot); | |||
// The root state (0) must not look like a valid backref. | |||
// Any symbol value other than (0) in tranv[0] ensures that. | |||
psp->tranv[0] = 1; | |||
if (nhash) { | |||
fill_hashv(psp, troot, nnodes); | |||
// Adjust hash_size to include trailing overflows | |||
// but trim trailing empty slots. | |||
psp->hash_size = psp->hash_mod; | |||
while ( psp->hashv[psp->hash_size].state) ++psp->hash_size; | |||
while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size; | |||
set_tranv(psp, realloc(psp->tranv, p_size(psp))); | |||
} | |||
// Diagnostics/statistics only: | |||
psp->nstrs = nstrs; | |||
for (i = psp->maxlen = 0; i < nstrs; ++i) | |||
if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len; | |||
goto DONE; | |||
FAIL: acism_destroy(psp), psp = NULL; | |||
DONE: free(troot), free(v1), free(v2); | |||
return psp; | |||
} | |||
typedef struct { int freq, rank; } FRANK; | |||
static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; } | |||
static void | |||
fill_symv(ac_trie_t *psp, MEMREF const *strv, int nstrs) | |||
{ | |||
int i, j; | |||
FRANK frv[256]; | |||
for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i}; | |||
for (i = 0; i < nstrs; ++i) | |||
for (psp->nchars += j = strv[i].len; --j >= 0;) | |||
frv[(uint8_t)strv[i].ptr[j]].freq++; | |||
qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp); | |||
for (i = 256; --i >= 0 && frv[i].freq;) | |||
psp->symv[frv[i].rank] = ++psp->nsyms; | |||
++psp->nsyms; | |||
#if ACISM_SIZE < 8 | |||
psp->sym_bits = bitwid(psp->nsyms); | |||
psp->sym_mask = ~(-1 << psp->sym_bits); | |||
#endif | |||
} | |||
static int | |||
create_tree(TNODE *Tree, SYMBOL const *symv, MEMREF const *strv, int nstrs) | |||
{ | |||
int i, j; | |||
TNODE *nextp = Tree + 1; | |||
for (i = 0; i < nstrs; ++i) { | |||
TNODE *tp = Tree; | |||
for (j = 0; tp->child && j < (int)strv[i].len; ++j) { | |||
SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]]; | |||
if (sym < tp->child->sym) { | |||
// Prep to insert new node before tp->child | |||
nextp->next = tp->child; | |||
break; | |||
} | |||
tp = tp->child; | |||
while (tp->next && sym >= tp->next->sym) | |||
tp = tp->next; | |||
// Insert new sibling after tp | |||
if (sym > tp->sym) { | |||
nextp->next = tp->next; | |||
tp = tp->next = nextp++; | |||
tp->sym = sym; | |||
tp->back = Tree; | |||
} | |||
} | |||
for (; j < (int) strv[i].len; ++j) { | |||
tp = tp->child = nextp++; | |||
tp->sym = symv[(uint8_t)strv[i].ptr[j]]; | |||
tp->back = Tree; | |||
} | |||
tp->match = i + 1; // Encode strno as nonzero | |||
} | |||
return nextp - Tree; | |||
} | |||
static void | |||
add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2) | |||
{ | |||
TNODE *tp, **tmp; | |||
for (tp = troot->child, tmp = v1; tp; tp = tp->next) | |||
*tmp++ = tp; | |||
*tmp = NULL; | |||
while (*v1) { | |||
TNODE **spp = v1, **dpp = v2, *srcp, *dstp; | |||
while ((srcp = *spp++)) { | |||
for (dstp = srcp->child; dstp; dstp = dstp->next) { | |||
TNODE *bp = NULL; | |||
*dpp++ = dstp; | |||
for (tp = srcp->back; tp; tp = tp->back) | |||
if ((bp = find_child(tp, dstp->sym))) break; | |||
if (!bp) | |||
bp = troot; | |||
dstp->back = dstp->child ? bp : tp ? tp : troot; | |||
dstp->back->x.nrefs++; | |||
dstp->is_suffix = bp->match || bp->is_suffix; | |||
} | |||
} | |||
*dpp = 0; | |||
tmp = v1; v1 = v2; v2 = tmp; | |||
} | |||
} | |||
static void | |||
prune_backlinks(TNODE *tp) | |||
{ | |||
if (tp->x.nrefs || !tp->child) | |||
return; | |||
TNODE *bp; | |||
// (bp != bp->back IFF bp != troot) | |||
while ((bp = tp->back) && !bp->match && bp != bp->back) { | |||
HIT("backlinks"); | |||
TNODE *cp = tp->child, *pp = bp->child; | |||
// Search for a child of bp that's not a child of tp | |||
for (; cp && pp && pp->sym >= cp->sym; cp = cp->next) { | |||
if (pp->sym == cp->sym) { | |||
if (pp->match && cp->is_suffix) break; | |||
pp = pp->next; | |||
} | |||
} | |||
if (pp) break; | |||
// So, target of back link is not a suffix match | |||
// of this node, and its children are a subset | |||
// of this node's children: prune it. | |||
HIT("pruned"); | |||
if ((tp->back = bp->back)) { | |||
tp->back->x.nrefs++; | |||
if (!--bp->x.nrefs) | |||
prune_backlinks(bp); | |||
} | |||
} | |||
} | |||
static int | |||
interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2) | |||
{ | |||
unsigned usev_size = nnodes + nsyms; | |||
char *usev = calloc(usev_size, sizeof*usev); | |||
STATE last_trans = 0, startv[nsyms][2]; | |||
TNODE *cp, **tmp; | |||
memset(startv, 0, nsyms * sizeof*startv); | |||
// Iterate through one level of the Tree at a time. | |||
// That srsly improves locality (L1-cache use). | |||
v1[0] = troot, v1[1] = NULL; | |||
for (; *v1; tmp = v1, v1 = v2, v2 = tmp) { | |||
TNODE **srcp = v1, **dstp = v2, *tp; | |||
while ((tp = *srcp++)) { | |||
if (!tp->child) continue; | |||
HIT("nonleaf"); | |||
if (tp->back == troot) tp->back = NULL; // simplify tests. | |||
cp = tp->child; | |||
STATE pos, *startp = &startv[cp->sym][!!tp->back]; | |||
while ((cp = cp->next)) { | |||
STATE *newp = &startv[cp->sym][!!tp->back]; | |||
if (*startp < *newp) startp = newp; | |||
} | |||
// If (tp) has a backref, we need a slot at offset 0 | |||
// that is free as a base AND to be used (filled in). | |||
char need = tp->back ? BASE|USED : BASE; | |||
for (pos = *startp;; ++pos) { | |||
if (usev[pos] & need) { | |||
HIT("inner loop"); | |||
continue; | |||
} | |||
for (cp = tp->child; cp; cp = cp->next) { | |||
HIT("child loop"); | |||
if (usev[pos + cp->sym] & USED) break; | |||
} | |||
// No child needs an in-use slot? We're done. | |||
if (!cp) break; | |||
} | |||
tp->x.state = pos; | |||
// Mark node's base and children as used: | |||
usev[pos] |= need; | |||
STATE last = 0; // Make compiler happy | |||
int nkids = 0; | |||
for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids) | |||
usev[last = pos + cp->sym] |= USED; | |||
// This is a HEURISTIC for advancing search for other nodes | |||
*startp += (pos - *startp) / nkids; | |||
if (last_trans < last) { | |||
last_trans = last; | |||
if (last + nsyms >= usev_size) { | |||
usev = realloc(usev, usev_size << 1); | |||
memset(usev + usev_size, 0, usev_size); | |||
usev_size <<= 1; | |||
} | |||
} | |||
} | |||
*dstp = NULL; | |||
} | |||
free(usev); | |||
return last_trans + 1; | |||
} | |||
static void | |||
fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes) | |||
{ | |||
STRASH *sv = malloc(psp->hash_mod * sizeof*sv), *sp = sv; | |||
int i; | |||
// First pass: insert without resolving collisions. | |||
for (i = 0; i < nnodes; ++i) { | |||
STATE base = treev[i].x.state; | |||
TNODE const *tp; | |||
for (tp = treev[i].child; tp; tp = tp->next) { | |||
if (tp->match && tp->child) { | |||
STATE state = base + tp->sym; | |||
STRASH *hp = &psp->hashv[p_hash(psp, state)]; | |||
*(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1}; | |||
} | |||
} | |||
} | |||
while (--sp >= sv) { | |||
HIT("hash collisions"); | |||
for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i) | |||
HIT("hash displacements"); | |||
psp->hashv[i] = *sp; | |||
} | |||
free(sv); | |||
} | |||
static void | |||
fill_tranv(ac_trie_t *psp, TNODE const*tp) | |||
{ | |||
TNODE const *cp = tp->child; | |||
if (cp && tp->back) | |||
set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state); | |||
for (; cp; cp = cp->next) { | |||
//NOTE: cp->match is (strno+1) so that !cp->match means "no match". | |||
set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix, | |||
cp->child ? cp->x.state : cp->match - 1 + psp->tran_size); | |||
if (cp->child) | |||
fill_tranv(psp, cp); | |||
} | |||
} | |||
static TNODE * | |||
find_child(TNODE *tp, SYMBOL sym) | |||
{ | |||
for (tp = tp->child; tp && tp->sym < sym; tp = tp->next); | |||
return tp && tp->sym == sym ? tp : NULL; | |||
} | |||
#ifdef ACISM_STATS | |||
PSSTAT psstat[__LINE__] = {{__LINE__,0}}; | |||
#endif//ACISM_STATS | |||
//EOF |
@@ -99,6 +99,7 @@ ENDIF(NOT DEBIAN_BUILD) | |||
TARGET_LINK_LIBRARIES(rspamd rspamd-server) | |||
TARGET_LINK_LIBRARIES(rspamd stemmer) | |||
TARGET_LINK_LIBRARIES(rspamd rspamd-actrie) | |||
TARGET_LINK_LIBRARIES(rspamd ${RSPAMD_REQUIRED_LIBRARIES}) | |||