spegling av
https://github.com/rspamd/rspamd.git
synced 2024-08-13 14:12:38 +02:00
Import aho-corasic by Mischa Sandberg.
This commit is contained in:
förälder
7c2a321e83
incheckning
e35607288e
@ -901,7 +901,8 @@ INCLUDE_DIRECTORIES("${CMAKE_SOURCE_DIR}/src"
|
||||
"${CMAKE_SOURCE_DIR}/contrib/snowball/include"
|
||||
"${CMAKE_SOURCE_DIR}/contrib/siphash"
|
||||
"${CMAKE_SOURCE_DIR}/contrib/blake2"
|
||||
"${CMAKE_SOURCE_DIR}/contrib/librdns")
|
||||
"${CMAKE_SOURCE_DIR}/contrib/librdns"
|
||||
"${CMAKE_SOURCE_DIR}/contrib/aho-corasic")
|
||||
|
||||
################################ SUBDIRS SECTION ###########################
|
||||
|
||||
@ -957,6 +958,7 @@ ADD_SUBDIRECTORY(contrib/siphash)
|
||||
ADD_SUBDIRECTORY(contrib/blake2)
|
||||
ADD_SUBDIRECTORY(contrib/libucl)
|
||||
ADD_SUBDIRECTORY(contrib/librdns)
|
||||
ADD_SUBDIRECTORY(contrib/aho-corasic)
|
||||
ADD_SUBDIRECTORY(src)
|
||||
|
||||
ADD_SUBDIRECTORY(test)
|
||||
|
8
contrib/aho-corasic/CMakeLists.txt
Normal file
8
contrib/aho-corasic/CMakeLists.txt
Normal file
@ -0,0 +1,8 @@
|
||||
SET(AHOCORASICSRC acism_create.c
|
||||
acism.c)
|
||||
if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3")
|
||||
endif ()
|
||||
ADD_LIBRARY(rspamd-actrie SHARED ${AHOCORASICSRC})
|
||||
INSTALL(TARGETS rspamd-actrie
|
||||
LIBRARY DESTINATION lib)
|
165
contrib/aho-corasic/LICENSE
Normal file
165
contrib/aho-corasic/LICENSE
Normal file
@ -0,0 +1,165 @@
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
|
||||
This version of the GNU Lesser General Public License incorporates
|
||||
the terms and conditions of version 3 of the GNU General Public
|
||||
License, supplemented by the additional permissions listed below.
|
||||
|
||||
0. Additional Definitions.
|
||||
|
||||
As used herein, "this License" refers to version 3 of the GNU Lesser
|
||||
General Public License, and the "GNU GPL" refers to version 3 of the GNU
|
||||
General Public License.
|
||||
|
||||
"The Library" refers to a covered work governed by this License,
|
||||
other than an Application or a Combined Work as defined below.
|
||||
|
||||
An "Application" is any work that makes use of an interface provided
|
||||
by the Library, but which is not otherwise based on the Library.
|
||||
Defining a subclass of a class defined by the Library is deemed a mode
|
||||
of using an interface provided by the Library.
|
||||
|
||||
A "Combined Work" is a work produced by combining or linking an
|
||||
Application with the Library. The particular version of the Library
|
||||
with which the Combined Work was made is also called the "Linked
|
||||
Version".
|
||||
|
||||
The "Minimal Corresponding Source" for a Combined Work means the
|
||||
Corresponding Source for the Combined Work, excluding any source code
|
||||
for portions of the Combined Work that, considered in isolation, are
|
||||
based on the Application, and not on the Linked Version.
|
||||
|
||||
The "Corresponding Application Code" for a Combined Work means the
|
||||
object code and/or source code for the Application, including any data
|
||||
and utility programs needed for reproducing the Combined Work from the
|
||||
Application, but excluding the System Libraries of the Combined Work.
|
||||
|
||||
1. Exception to Section 3 of the GNU GPL.
|
||||
|
||||
You may convey a covered work under sections 3 and 4 of this License
|
||||
without being bound by section 3 of the GNU GPL.
|
||||
|
||||
2. Conveying Modified Versions.
|
||||
|
||||
If you modify a copy of the Library, and, in your modifications, a
|
||||
facility refers to a function or data to be supplied by an Application
|
||||
that uses the facility (other than as an argument passed when the
|
||||
facility is invoked), then you may convey a copy of the modified
|
||||
version:
|
||||
|
||||
a) under this License, provided that you make a good faith effort to
|
||||
ensure that, in the event an Application does not supply the
|
||||
function or data, the facility still operates, and performs
|
||||
whatever part of its purpose remains meaningful, or
|
||||
|
||||
b) under the GNU GPL, with none of the additional permissions of
|
||||
this License applicable to that copy.
|
||||
|
||||
3. Object Code Incorporating Material from Library Header Files.
|
||||
|
||||
The object code form of an Application may incorporate material from
|
||||
a header file that is part of the Library. You may convey such object
|
||||
code under terms of your choice, provided that, if the incorporated
|
||||
material is not limited to numerical parameters, data structure
|
||||
layouts and accessors, or small macros, inline functions and templates
|
||||
(ten or fewer lines in length), you do both of the following:
|
||||
|
||||
a) Give prominent notice with each copy of the object code that the
|
||||
Library is used in it and that the Library and its use are
|
||||
covered by this License.
|
||||
|
||||
b) Accompany the object code with a copy of the GNU GPL and this license
|
||||
document.
|
||||
|
||||
4. Combined Works.
|
||||
|
||||
You may convey a Combined Work under terms of your choice that,
|
||||
taken together, effectively do not restrict modification of the
|
||||
portions of the Library contained in the Combined Work and reverse
|
||||
engineering for debugging such modifications, if you also do each of
|
||||
the following:
|
||||
|
||||
a) Give prominent notice with each copy of the Combined Work that
|
||||
the Library is used in it and that the Library and its use are
|
||||
covered by this License.
|
||||
|
||||
b) Accompany the Combined Work with a copy of the GNU GPL and this license
|
||||
document.
|
||||
|
||||
c) For a Combined Work that displays copyright notices during
|
||||
execution, include the copyright notice for the Library among
|
||||
these notices, as well as a reference directing the user to the
|
||||
copies of the GNU GPL and this license document.
|
||||
|
||||
d) Do one of the following:
|
||||
|
||||
0) Convey the Minimal Corresponding Source under the terms of this
|
||||
License, and the Corresponding Application Code in a form
|
||||
suitable for, and under terms that permit, the user to
|
||||
recombine or relink the Application with a modified version of
|
||||
the Linked Version to produce a modified Combined Work, in the
|
||||
manner specified by section 6 of the GNU GPL for conveying
|
||||
Corresponding Source.
|
||||
|
||||
1) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (a) uses at run time
|
||||
a copy of the Library already present on the user's computer
|
||||
system, and (b) will operate properly with a modified version
|
||||
of the Library that is interface-compatible with the Linked
|
||||
Version.
|
||||
|
||||
e) Provide Installation Information, but only if you would otherwise
|
||||
be required to provide such information under section 6 of the
|
||||
GNU GPL, and only to the extent that such information is
|
||||
necessary to install and execute a modified version of the
|
||||
Combined Work produced by recombining or relinking the
|
||||
Application with a modified version of the Linked Version. (If
|
||||
you use option 4d0, the Installation Information must accompany
|
||||
the Minimal Corresponding Source and Corresponding Application
|
||||
Code. If you use option 4d1, you must provide the Installation
|
||||
Information in the manner specified by section 6 of the GNU GPL
|
||||
for conveying Corresponding Source.)
|
||||
|
||||
5. Combined Libraries.
|
||||
|
||||
You may place library facilities that are a work based on the
|
||||
Library side by side in a single library together with other library
|
||||
facilities that are not Applications and are not covered by this
|
||||
License, and convey such a combined library under terms of your
|
||||
choice, if you do both of the following:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work based
|
||||
on the Library, uncombined with any other library facilities,
|
||||
conveyed under the terms of this License.
|
||||
|
||||
b) Give prominent notice with the combined library that part of it
|
||||
is a work based on the Library, and explaining where to find the
|
||||
accompanying uncombined form of the same work.
|
||||
|
||||
6. Revised Versions of the GNU Lesser General Public License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions
|
||||
of the GNU Lesser General Public License from time to time. Such new
|
||||
versions will be similar in spirit to the present version, but may
|
||||
differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Library as you received it specifies that a certain numbered version
|
||||
of the GNU Lesser General Public License "or any later version"
|
||||
applies to it, you have the option of following the terms and
|
||||
conditions either of that published version or of any later version
|
||||
published by the Free Software Foundation. If the Library as you
|
||||
received it does not specify a version number of the GNU Lesser
|
||||
General Public License, you may choose any version of the GNU Lesser
|
||||
General Public License ever published by the Free Software Foundation.
|
||||
|
||||
If the Library as you received it specifies that a proxy can decide
|
||||
whether future versions of the GNU Lesser General Public License shall
|
||||
apply, that proxy's public statement of acceptance of any version is
|
||||
permanent authorization for you to choose that version for the
|
||||
Library.
|
67
contrib/aho-corasic/README.md
Normal file
67
contrib/aho-corasic/README.md
Normal file
@ -0,0 +1,67 @@
|
||||
aho-corasick
|
||||
==
|
||||
|
||||
Aho-Corasick parallel string search, using interleaved arrays.
|
||||
|
||||
Mischa Sandberg mischasan@gmail.com
|
||||
|
||||
ACISM is an implementation of Aho-Corasick parallel string search,
|
||||
using an Interleaved State-transition Matrix.
|
||||
It combines the fastest possible Aho-Corasick implementation,
|
||||
with the smallest possible data structure (!).
|
||||
|
||||
FEATURES
|
||||
--------
|
||||
|
||||
* Fast. No hashing, no tree traversal; just a straight look-up equivalent to
|
||||
matrix[state, input-byte] per input character.
|
||||
|
||||
* Tiny. On average, the whole data structure (mostly the array) takes about 2-3 bytes per
|
||||
input pattern byte. The original set of pattern strings can be reverse-generated from the machine.
|
||||
|
||||
* Shareable. The state machine contains no pointers, so it can be compiled once,
|
||||
then memory-mapped by many processes.
|
||||
|
||||
* Searches byte vectors, not null-terminated strings.
|
||||
Suitable for searching machine code as much as searching text.
|
||||
|
||||
* DOS-proof. Well, that's an attribute of Aho-Corasick,
|
||||
so no real points for that.
|
||||
|
||||
* Stream-ready. The state can be saved between calls to search data.
|
||||
|
||||
DOCUMENTATION
|
||||
-------------
|
||||
|
||||
The GoogleDocs description is at http://goo.gl/lE6zG
|
||||
I originally called it "psearch", but found that name was overused by other authors.
|
||||
|
||||
LICENSE
|
||||
-------
|
||||
|
||||
Though I've had strong suggestions to go with BSD license, I'm going with GPL2 until I figure out
|
||||
how to keep in touch with people who download and use the code. Hence the "CONTACT ME IF..." line in the license.
|
||||
|
||||
GETTING STARTED
|
||||
---------------
|
||||
|
||||
Download the source, type "gmake".
|
||||
"gmake install" exports lib/libacism.a, include/acism.h and bin/acism_x.
|
||||
"acism_x.c" is a good example of calling acism_create and acism_scan/acism_more.
|
||||
|
||||
(If you're interested in the GNUmakefile and rules.mk,
|
||||
check my blog posts on non-recursive make, at mischasan.wordpress.com.)
|
||||
|
||||
HISTORY
|
||||
-------
|
||||
|
||||
The interleaved-array approach was tried and discarded in the late 70's, because the compile time was O(n^2).
|
||||
acism_create beats the problem with a "hint" array that tracks the restart points for searches.
|
||||
That, plus discarding the original idea of how to get maximal density, resulted in the tiny-fast win-win.
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
----------------
|
||||
|
||||
I'd like to thank Mike Shannon, who wanted to see a machine built to make best use of L1/L2 cache.
|
||||
The change to do that doubled performance on hardware with a much larger cache than the matrix.
|
||||
Go figure.
|
105
contrib/aho-corasic/_acism.h
Normal file
105
contrib/aho-corasic/_acism.h
Normal file
@ -0,0 +1,105 @@
|
||||
/*
|
||||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify
|
||||
** it under the terms of the GNU Lesser General Public License Version as
|
||||
** published by the Free Software Foundation. You may not use, modify or
|
||||
** distribute this program under any other version of the GNU Lesser General
|
||||
** Public License.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public License
|
||||
** along with this program; if not, write to the Free Software
|
||||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
#ifndef _ACISM_H
|
||||
#define _ACISM_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h> // malloc
|
||||
#include <string.h> // memcpy
|
||||
|
||||
typedef int (*qsort_cmp)(const void *, const void *);
|
||||
|
||||
#ifndef ACISM_SIZE
|
||||
# define ACISM_SIZE 4
|
||||
#endif
|
||||
|
||||
#if ACISM_SIZE == 8
|
||||
typedef uint64_t TRAN, STATE, STRNO;
|
||||
# define SYM_BITS 9U
|
||||
# define SYM_MASK 511
|
||||
#else
|
||||
typedef uint32_t TRAN, STATE, STRNO;
|
||||
# define SYM_BITS psp->sym_bits
|
||||
# define SYM_MASK psp->sym_mask
|
||||
#endif
|
||||
|
||||
typedef uint16_t SYMBOL;
|
||||
typedef unsigned _SYMBOL; // An efficient stacklocal SYMBOL
|
||||
|
||||
enum {
|
||||
IS_MATCH = (TRAN)1 << (8*sizeof(TRAN) - 1),
|
||||
IS_SUFFIX = (TRAN)1 << (8*sizeof(TRAN) - 2),
|
||||
T_FLAGS = IS_MATCH | IS_SUFFIX
|
||||
};
|
||||
|
||||
typedef struct { STATE state; STRNO strno; } STRASH;
|
||||
|
||||
typedef enum { BASE=2, USED=1 } USES;
|
||||
|
||||
struct acism {
|
||||
TRAN* tranv;
|
||||
STRASH* hashv;
|
||||
unsigned flags;
|
||||
# define IS_MMAP 1
|
||||
|
||||
#if ACISM_SIZE < 8
|
||||
TRAN sym_mask;
|
||||
unsigned sym_bits;
|
||||
#endif
|
||||
unsigned hash_mod; // search hashv starting at (state + sym) % hash_mod.
|
||||
unsigned hash_size; // #(hashv): hash_mod plus the overflows past [hash_mod-1]
|
||||
unsigned tran_size; // #(tranv)
|
||||
unsigned nsyms, nchars, nstrs, maxlen;
|
||||
SYMBOL symv[256];
|
||||
};
|
||||
|
||||
#include "acism.h"
|
||||
|
||||
// p_size: size of tranv + hashv
|
||||
static inline unsigned p_size(ac_trie_t const *psp)
|
||||
{ return psp->hash_size * sizeof*psp->hashv
|
||||
+ psp->tran_size * sizeof*psp->tranv; }
|
||||
|
||||
static inline unsigned p_hash(ac_trie_t const *psp, STATE s)
|
||||
{ return s * 107 % psp->hash_mod; }
|
||||
|
||||
static inline void set_tranv(ac_trie_t *psp, void *mem)
|
||||
{ psp->hashv = (STRASH*)&(psp->tranv = mem)[psp->tran_size]; }
|
||||
|
||||
// TRAN accessors. For ACISM_SIZE=8, SYM_{BITS,MASK} do not use psp.
|
||||
|
||||
static inline TRAN p_tran(ac_trie_t const *psp, STATE s, _SYMBOL sym)
|
||||
{ return psp->tranv[s + sym] ^ sym; }
|
||||
|
||||
static inline _SYMBOL t_sym(ac_trie_t const *psp, TRAN t)
|
||||
{ (void)psp; return t & SYM_MASK; }
|
||||
|
||||
static inline STATE t_next(ac_trie_t const *psp, TRAN t)
|
||||
{ (void)psp; return (t & ~T_FLAGS) >> SYM_BITS; }
|
||||
|
||||
static inline int t_isleaf(ac_trie_t const *psp, TRAN t)
|
||||
{ return t_next(psp, t) >= psp->tran_size; }
|
||||
|
||||
static inline int t_strno(ac_trie_t const *psp, TRAN t)
|
||||
{ return t_next(psp, t) - psp->tran_size; }
|
||||
|
||||
static inline _SYMBOL t_valid(ac_trie_t const *psp, TRAN t)
|
||||
{ return !t_sym(psp, t); }
|
||||
|
||||
#endif//_ACISM_H
|
107
contrib/aho-corasic/acism.c
Normal file
107
contrib/aho-corasic/acism.c
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify
|
||||
** it under the terms of the GNU Lesser General Public License Version as
|
||||
** published by the Free Software Foundation. You may not use, modify or
|
||||
** distribute this program under any other version of the GNU Lesser General
|
||||
** Public License.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public License
|
||||
** along with this program; if not, write to the Free Software
|
||||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#include "_acism.h"
|
||||
|
||||
#define BACK ((SYMBOL)0)
|
||||
#define ROOT ((STATE) 0)
|
||||
|
||||
int
|
||||
acism_more(ac_trie_t const *psp, MEMREF const text,
|
||||
ACISM_ACTION *cb, void *context, int *statep)
|
||||
{
|
||||
ac_trie_t const ps = *psp;
|
||||
char const *cp = text.ptr, *endp = cp + text.len;
|
||||
STATE state = *statep;
|
||||
int ret = 0;
|
||||
|
||||
while (cp < endp) {
|
||||
_SYMBOL sym = ps.symv[(uint8_t)*cp++];
|
||||
if (!sym) {
|
||||
// Input byte is not in any pattern string.
|
||||
state = ROOT;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Search for a valid transition from this (state, sym),
|
||||
// following the backref chain.
|
||||
|
||||
TRAN next;
|
||||
while (!t_valid(&ps, next = p_tran(&ps, state, sym)) && state != ROOT) {
|
||||
TRAN back = p_tran(&ps, state, BACK);
|
||||
state = t_valid(&ps, back) ? t_next(&ps, back) : ROOT;
|
||||
}
|
||||
|
||||
if (!t_valid(&ps, next))
|
||||
continue;
|
||||
|
||||
if (!(next & (IS_MATCH | IS_SUFFIX))) {
|
||||
// No complete match yet; keep going.
|
||||
state = t_next(&ps, next);
|
||||
continue;
|
||||
}
|
||||
|
||||
// At this point, one or more patterns have matched.
|
||||
// Find all matches by following the backref chain.
|
||||
// A valid node for (sym) with no SUFFIX flag marks the
|
||||
// end of the suffix chain.
|
||||
// In the same backref traversal, find a new (state),
|
||||
// if the original transition is to a leaf.
|
||||
|
||||
STATE s = state;
|
||||
|
||||
// Initially state is ROOT. The chain search saves the
|
||||
// first state from which the next char has a transition.
|
||||
state = t_isleaf(&ps, next) ? 0 : t_next(&ps, next);
|
||||
|
||||
while (1) {
|
||||
|
||||
if (t_valid(&ps, next)) {
|
||||
|
||||
if (next & IS_MATCH) {
|
||||
unsigned strno, ss = s + sym, i;
|
||||
if (t_isleaf(&ps, ps.tranv[ss])) {
|
||||
strno = t_strno(&ps, ps.tranv[ss]);
|
||||
} else {
|
||||
for (i = p_hash(&ps, ss); ps.hashv[i].state != ss; ++i);
|
||||
strno = ps.hashv[i].strno;
|
||||
}
|
||||
|
||||
if ((ret = cb(strno, cp - text.ptr, context)))
|
||||
goto EXIT;
|
||||
}
|
||||
|
||||
if (!state && !t_isleaf(&ps, next))
|
||||
state = t_next(&ps, next);
|
||||
if ( state && !(next & IS_SUFFIX))
|
||||
break;
|
||||
}
|
||||
|
||||
if (s == ROOT)
|
||||
break;
|
||||
|
||||
TRAN b = p_tran(&ps, s, BACK);
|
||||
s = t_valid(&ps, b) ? t_next(&ps, b) : ROOT;
|
||||
next = p_tran(&ps, s, sym);
|
||||
}
|
||||
}
|
||||
EXIT:
|
||||
return *statep = state, ret;
|
||||
}
|
||||
//EOF
|
60
contrib/aho-corasic/acism.h
Normal file
60
contrib/aho-corasic/acism.h
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
|
||||
** Copyright (C) 2015 Vsevolod Stakhov <vsevolod@highsecure.ru>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify
|
||||
** it under the terms of the GNU Lesser General Public License as
|
||||
** published by the Free Software Foundation. You may not use, modify or
|
||||
** distribute this program under any other version of the GNU Lesser General
|
||||
** Public License.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public License
|
||||
** along with this program; if not, write to the Free Software
|
||||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#ifndef ACISM_H
|
||||
#define ACISM_H
|
||||
|
||||
// "acism" uses MEMREF {ptr,len} bytevec structs for "string" args,
|
||||
// rather than NUL-terminated "C" strings.
|
||||
|
||||
#ifndef MSUTIL_H
|
||||
#include <stdio.h>
|
||||
typedef struct { char const *ptr; size_t len; } MEMREF;
|
||||
#endif
|
||||
|
||||
typedef struct acism ac_trie_t;
|
||||
|
||||
ac_trie_t* acism_create(MEMREF const *strv, int nstrs);
|
||||
void acism_destroy(ac_trie_t*);
|
||||
|
||||
// For each match, acism_scan calls its ACISM_ACTION fn,
|
||||
// giving it the strv[] index of the matched string,
|
||||
// and the text[] offset of the byte PAST the end of the string.
|
||||
// If ACISM_ACTION returns 0, search continues; otherwise,
|
||||
// acism_more returns that nonzero value immediately.
|
||||
|
||||
typedef int (ACISM_ACTION)(int strnum, int textpos, void *context);
|
||||
|
||||
// If sequential blocks of (text) are passed to repeated acism_more calls,
|
||||
// then search continues where the previous acism_more left off --
|
||||
// string matches can cross block boundaries.
|
||||
// *state should initially be (0).
|
||||
|
||||
int acism_more(ac_trie_t const*, MEMREF const text,
|
||||
ACISM_ACTION *fn, void *fndata, int *state);
|
||||
|
||||
static inline int acism_scan(ac_trie_t const*psp, MEMREF const text,
|
||||
ACISM_ACTION *fn, void *fndata)
|
||||
{
|
||||
int state = 0;
|
||||
return acism_more(psp, text, fn, fndata, &state);
|
||||
}
|
||||
|
||||
#endif//ACISM_H
|
396
contrib/aho-corasic/acism_create.c
Normal file
396
contrib/aho-corasic/acism_create.c
Normal file
@ -0,0 +1,396 @@
|
||||
/*
|
||||
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify
|
||||
** it under the terms of the GNU Lesser General Public License Version as
|
||||
** published by the Free Software Foundation. You may not use, modify or
|
||||
** distribute this program under any other version of the GNU Lesser General
|
||||
** Public License.
|
||||
**
|
||||
** This program is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
** GNU Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public License
|
||||
** along with this program; if not, write to the Free Software
|
||||
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
#include "_acism.h"
|
||||
|
||||
typedef struct tnode {
|
||||
struct tnode *child, *next, *back;
|
||||
union { unsigned nrefs; STATE state; } x;
|
||||
STRNO match;
|
||||
SYMBOL sym, is_suffix;
|
||||
} TNODE;
|
||||
|
||||
//--------------|---------------------------------------------
|
||||
// bitwid: 1+floor(log2(u))
|
||||
static inline int bitwid(unsigned u)
|
||||
{
|
||||
int ret = !!u;
|
||||
if (u & 0xFFFF0000) u >>= 16, ret += 16;
|
||||
if (u & 0x0000FF00) u >>= 8, ret += 8;
|
||||
if (u & 0x000000F0) u >>= 4, ret += 4;
|
||||
if (u & 0x0000000C) u >>= 2, ret += 2;
|
||||
if (u & 0x00000002) ret++;
|
||||
return ret;
|
||||
}
|
||||
static void fill_symv(ac_trie_t*, MEMREF const*, int ns);
|
||||
static int create_tree(TNODE*, SYMBOL const*symv, MEMREF const*strv, int nstrs);
|
||||
static void add_backlinks(TNODE*, TNODE**, TNODE**);
|
||||
static void prune_backlinks(TNODE*);
|
||||
static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**);
|
||||
static void fill_tranv(ac_trie_t*, TNODE const*);
|
||||
static void fill_hashv(ac_trie_t*, TNODE const*, int nn);
|
||||
|
||||
static TNODE* find_child(TNODE*, SYMBOL);
|
||||
|
||||
// (ns) is either a STATE, or a (STRNO + tran_size)
|
||||
static inline void
|
||||
set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns)
|
||||
{
|
||||
psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0)
|
||||
| (suffix ? IS_SUFFIX : 0)
|
||||
| (ns << SYM_BITS);
|
||||
}
|
||||
|
||||
// Track statistics for construction
|
||||
#ifdef ACISM_STATS
|
||||
typedef struct { long long val; const char *name; } PSSTAT;
|
||||
extern PSSTAT psstat[];
|
||||
# define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n})
|
||||
# define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id)
|
||||
#else
|
||||
# define NOTE(n) (void)0
|
||||
# define HIT(id) (void)0
|
||||
#endif //ACISM_STATS
|
||||
|
||||
//--------------|---------------------------------------------
|
||||
ac_trie_t*
|
||||
acism_create(MEMREF const* strv, int nstrs)
|
||||
{
|
||||
TNODE *tp, **v1 = NULL, **v2 = NULL;
|
||||
ac_trie_t *psp = calloc(1, sizeof*psp);
|
||||
|
||||
fill_symv(psp, strv, nstrs);
|
||||
TNODE *troot = calloc(psp->nchars + 1, sizeof*troot);
|
||||
|
||||
int nnodes = create_tree(troot, psp->symv, strv, nstrs);
|
||||
NOTE(nnodes);
|
||||
|
||||
// v1, v2: breadth-first work vectors for add_backlink and interleave.
|
||||
int nhash, i = (nstrs + 1) * sizeof*tp;
|
||||
add_backlinks(troot, v1 = malloc(i), v2 = malloc(i));
|
||||
|
||||
for (tp = troot + nnodes, nhash = 0; --tp > troot;) {
|
||||
prune_backlinks(tp);
|
||||
nhash += tp->match && tp->child;
|
||||
}
|
||||
|
||||
// Calculate each node's offset in tranv[]:
|
||||
psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2);
|
||||
if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2)
|
||||
goto FAIL;
|
||||
|
||||
if (nhash) {
|
||||
// Hash table is for match info of non-leaf nodes (only).
|
||||
// Set hash_size for p_size(psp):
|
||||
psp->hash_mod = nhash * 5 / 4 + 1;
|
||||
// Initially oversize the table for overflows without wraparound.
|
||||
psp->hash_size = psp->hash_mod + nhash;
|
||||
}
|
||||
|
||||
set_tranv(psp, calloc(p_size(psp), 1));
|
||||
if (!psp->tranv) goto FAIL;
|
||||
fill_tranv(psp, troot);
|
||||
// The root state (0) must not look like a valid backref.
|
||||
// Any symbol value other than (0) in tranv[0] ensures that.
|
||||
psp->tranv[0] = 1;
|
||||
|
||||
if (nhash) {
|
||||
fill_hashv(psp, troot, nnodes);
|
||||
// Adjust hash_size to include trailing overflows
|
||||
// but trim trailing empty slots.
|
||||
psp->hash_size = psp->hash_mod;
|
||||
while ( psp->hashv[psp->hash_size].state) ++psp->hash_size;
|
||||
while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size;
|
||||
set_tranv(psp, realloc(psp->tranv, p_size(psp)));
|
||||
}
|
||||
|
||||
// Diagnostics/statistics only:
|
||||
psp->nstrs = nstrs;
|
||||
for (i = psp->maxlen = 0; i < nstrs; ++i)
|
||||
if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len;
|
||||
|
||||
goto DONE;
|
||||
FAIL: acism_destroy(psp), psp = NULL;
|
||||
DONE: free(troot), free(v1), free(v2);
|
||||
return psp;
|
||||
}
|
||||
|
||||
typedef struct { int freq, rank; } FRANK;
|
||||
static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; }
|
||||
|
||||
static void
|
||||
fill_symv(ac_trie_t *psp, MEMREF const *strv, int nstrs)
|
||||
{
|
||||
int i, j;
|
||||
FRANK frv[256];
|
||||
|
||||
for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i};
|
||||
for (i = 0; i < nstrs; ++i)
|
||||
for (psp->nchars += j = strv[i].len; --j >= 0;)
|
||||
frv[(uint8_t)strv[i].ptr[j]].freq++;
|
||||
|
||||
qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp);
|
||||
|
||||
for (i = 256; --i >= 0 && frv[i].freq;)
|
||||
psp->symv[frv[i].rank] = ++psp->nsyms;
|
||||
++psp->nsyms;
|
||||
#if ACISM_SIZE < 8
|
||||
psp->sym_bits = bitwid(psp->nsyms);
|
||||
psp->sym_mask = ~(-1 << psp->sym_bits);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
create_tree(TNODE *Tree, SYMBOL const *symv, MEMREF const *strv, int nstrs)
|
||||
{
|
||||
int i, j;
|
||||
TNODE *nextp = Tree + 1;
|
||||
|
||||
for (i = 0; i < nstrs; ++i) {
|
||||
TNODE *tp = Tree;
|
||||
|
||||
for (j = 0; tp->child && j < (int)strv[i].len; ++j) {
|
||||
SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]];
|
||||
|
||||
if (sym < tp->child->sym) {
|
||||
// Prep to insert new node before tp->child
|
||||
nextp->next = tp->child;
|
||||
break;
|
||||
}
|
||||
|
||||
tp = tp->child;
|
||||
while (tp->next && sym >= tp->next->sym)
|
||||
tp = tp->next;
|
||||
|
||||
// Insert new sibling after tp
|
||||
if (sym > tp->sym) {
|
||||
nextp->next = tp->next;
|
||||
tp = tp->next = nextp++;
|
||||
tp->sym = sym;
|
||||
tp->back = Tree;
|
||||
}
|
||||
}
|
||||
|
||||
for (; j < (int) strv[i].len; ++j) {
|
||||
tp = tp->child = nextp++;
|
||||
tp->sym = symv[(uint8_t)strv[i].ptr[j]];
|
||||
tp->back = Tree;
|
||||
}
|
||||
|
||||
tp->match = i + 1; // Encode strno as nonzero
|
||||
}
|
||||
|
||||
return nextp - Tree;
|
||||
}
|
||||
|
||||
static void
|
||||
add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2)
|
||||
{
|
||||
TNODE *tp, **tmp;
|
||||
|
||||
for (tp = troot->child, tmp = v1; tp; tp = tp->next)
|
||||
*tmp++ = tp;
|
||||
*tmp = NULL;
|
||||
|
||||
while (*v1) {
|
||||
TNODE **spp = v1, **dpp = v2, *srcp, *dstp;
|
||||
while ((srcp = *spp++)) {
|
||||
for (dstp = srcp->child; dstp; dstp = dstp->next) {
|
||||
TNODE *bp = NULL;
|
||||
*dpp++ = dstp;
|
||||
for (tp = srcp->back; tp; tp = tp->back)
|
||||
if ((bp = find_child(tp, dstp->sym))) break;
|
||||
if (!bp)
|
||||
bp = troot;
|
||||
|
||||
dstp->back = dstp->child ? bp : tp ? tp : troot;
|
||||
dstp->back->x.nrefs++;
|
||||
dstp->is_suffix = bp->match || bp->is_suffix;
|
||||
}
|
||||
}
|
||||
*dpp = 0;
|
||||
tmp = v1; v1 = v2; v2 = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
prune_backlinks(TNODE *tp)
|
||||
{
|
||||
if (tp->x.nrefs || !tp->child)
|
||||
return;
|
||||
|
||||
TNODE *bp;
|
||||
// (bp != bp->back IFF bp != troot)
|
||||
while ((bp = tp->back) && !bp->match && bp != bp->back) {
|
||||
HIT("backlinks");
|
||||
TNODE *cp = tp->child, *pp = bp->child;
|
||||
|
||||
// Search for a child of bp that's not a child of tp
|
||||
for (; cp && pp && pp->sym >= cp->sym; cp = cp->next) {
|
||||
if (pp->sym == cp->sym) {
|
||||
if (pp->match && cp->is_suffix) break;
|
||||
pp = pp->next;
|
||||
}
|
||||
}
|
||||
|
||||
if (pp) break;
|
||||
|
||||
// So, target of back link is not a suffix match
|
||||
// of this node, and its children are a subset
|
||||
// of this node's children: prune it.
|
||||
HIT("pruned");
|
||||
if ((tp->back = bp->back)) {
|
||||
tp->back->x.nrefs++;
|
||||
if (!--bp->x.nrefs)
|
||||
prune_backlinks(bp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2)
|
||||
{
|
||||
unsigned usev_size = nnodes + nsyms;
|
||||
char *usev = calloc(usev_size, sizeof*usev);
|
||||
STATE last_trans = 0, startv[nsyms][2];
|
||||
TNODE *cp, **tmp;
|
||||
|
||||
memset(startv, 0, nsyms * sizeof*startv);
|
||||
|
||||
// Iterate through one level of the Tree at a time.
|
||||
// That srsly improves locality (L1-cache use).
|
||||
|
||||
v1[0] = troot, v1[1] = NULL;
|
||||
for (; *v1; tmp = v1, v1 = v2, v2 = tmp) {
|
||||
TNODE **srcp = v1, **dstp = v2, *tp;
|
||||
while ((tp = *srcp++)) {
|
||||
if (!tp->child) continue;
|
||||
|
||||
HIT("nonleaf");
|
||||
if (tp->back == troot) tp->back = NULL; // simplify tests.
|
||||
cp = tp->child;
|
||||
|
||||
STATE pos, *startp = &startv[cp->sym][!!tp->back];
|
||||
while ((cp = cp->next)) {
|
||||
STATE *newp = &startv[cp->sym][!!tp->back];
|
||||
if (*startp < *newp) startp = newp;
|
||||
}
|
||||
|
||||
// If (tp) has a backref, we need a slot at offset 0
|
||||
// that is free as a base AND to be used (filled in).
|
||||
char need = tp->back ? BASE|USED : BASE;
|
||||
for (pos = *startp;; ++pos) {
|
||||
if (usev[pos] & need) {
|
||||
HIT("inner loop");
|
||||
continue;
|
||||
}
|
||||
|
||||
for (cp = tp->child; cp; cp = cp->next) {
|
||||
HIT("child loop");
|
||||
if (usev[pos + cp->sym] & USED) break;
|
||||
}
|
||||
|
||||
// No child needs an in-use slot? We're done.
|
||||
if (!cp) break;
|
||||
}
|
||||
tp->x.state = pos;
|
||||
|
||||
// Mark node's base and children as used:
|
||||
usev[pos] |= need;
|
||||
STATE last = 0; // Make compiler happy
|
||||
int nkids = 0;
|
||||
for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids)
|
||||
usev[last = pos + cp->sym] |= USED;
|
||||
|
||||
// This is a HEURISTIC for advancing search for other nodes
|
||||
*startp += (pos - *startp) / nkids;
|
||||
|
||||
if (last_trans < last) {
|
||||
last_trans = last;
|
||||
if (last + nsyms >= usev_size) {
|
||||
usev = realloc(usev, usev_size << 1);
|
||||
memset(usev + usev_size, 0, usev_size);
|
||||
usev_size <<= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*dstp = NULL;
|
||||
}
|
||||
|
||||
free(usev);
|
||||
return last_trans + 1;
|
||||
}
|
||||
|
||||
static void
|
||||
fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes)
|
||||
{
|
||||
STRASH *sv = malloc(psp->hash_mod * sizeof*sv), *sp = sv;
|
||||
int i;
|
||||
|
||||
// First pass: insert without resolving collisions.
|
||||
for (i = 0; i < nnodes; ++i) {
|
||||
STATE base = treev[i].x.state;
|
||||
TNODE const *tp;
|
||||
for (tp = treev[i].child; tp; tp = tp->next) {
|
||||
if (tp->match && tp->child) {
|
||||
STATE state = base + tp->sym;
|
||||
STRASH *hp = &psp->hashv[p_hash(psp, state)];
|
||||
*(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (--sp >= sv) {
|
||||
HIT("hash collisions");
|
||||
for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i)
|
||||
HIT("hash displacements");
|
||||
psp->hashv[i] = *sp;
|
||||
}
|
||||
|
||||
free(sv);
|
||||
}
|
||||
|
||||
static void
|
||||
fill_tranv(ac_trie_t *psp, TNODE const*tp)
|
||||
{
|
||||
TNODE const *cp = tp->child;
|
||||
|
||||
if (cp && tp->back)
|
||||
set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state);
|
||||
|
||||
for (; cp; cp = cp->next) {
|
||||
//NOTE: cp->match is (strno+1) so that !cp->match means "no match".
|
||||
set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix,
|
||||
cp->child ? cp->x.state : cp->match - 1 + psp->tran_size);
|
||||
if (cp->child)
|
||||
fill_tranv(psp, cp);
|
||||
}
|
||||
}
|
||||
|
||||
static TNODE *
|
||||
find_child(TNODE *tp, SYMBOL sym)
|
||||
{
|
||||
for (tp = tp->child; tp && tp->sym < sym; tp = tp->next);
|
||||
return tp && tp->sym == sym ? tp : NULL;
|
||||
}
|
||||
|
||||
#ifdef ACISM_STATS
|
||||
PSSTAT psstat[__LINE__] = {{__LINE__,0}};
|
||||
#endif//ACISM_STATS
|
||||
//EOF
|
@ -99,6 +99,7 @@ ENDIF(NOT DEBIAN_BUILD)
|
||||
|
||||
TARGET_LINK_LIBRARIES(rspamd rspamd-server)
|
||||
TARGET_LINK_LIBRARIES(rspamd stemmer)
|
||||
TARGET_LINK_LIBRARIES(rspamd rspamd-actrie)
|
||||
|
||||
TARGET_LINK_LIBRARIES(rspamd ${RSPAMD_REQUIRED_LIBRARIES})
|
||||
|
||||
|
Laddar…
Referens i nytt ärende
Block a user