From: Vsevolod Stakhov Date: Mon, 6 Apr 2015 17:04:58 +0000 (+0100) Subject: Corasic -> Corasick. X-Git-Tag: 0.9.0~318^2~2 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=8eca74d2778a0b272778501e908128386c5dd8f0;p=rspamd.git Corasic -> Corasick. --- diff --git a/contrib/aho-corasic/CMakeLists.txt b/contrib/aho-corasic/CMakeLists.txt deleted file mode 100644 index 2240f9b64..000000000 --- a/contrib/aho-corasic/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -SET(AHOCORASICSRC acism_create.c - acism.c) -if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") -endif () -ADD_LIBRARY(rspamd-actrie SHARED ${AHOCORASICSRC}) -INSTALL(TARGETS rspamd-actrie - LIBRARY DESTINATION lib) \ No newline at end of file diff --git a/contrib/aho-corasic/LICENSE b/contrib/aho-corasic/LICENSE deleted file mode 100644 index 65c5ca88a..000000000 --- a/contrib/aho-corasic/LICENSE +++ /dev/null @@ -1,165 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - - This version of the GNU Lesser General Public License incorporates -the terms and conditions of version 3 of the GNU General Public -License, supplemented by the additional permissions listed below. - - 0. Additional Definitions. - - As used herein, "this License" refers to version 3 of the GNU Lesser -General Public License, and the "GNU GPL" refers to version 3 of the GNU -General Public License. - - "The Library" refers to a covered work governed by this License, -other than an Application or a Combined Work as defined below. - - An "Application" is any work that makes use of an interface provided -by the Library, but which is not otherwise based on the Library. -Defining a subclass of a class defined by the Library is deemed a mode -of using an interface provided by the Library. - - A "Combined Work" is a work produced by combining or linking an -Application with the Library. The particular version of the Library -with which the Combined Work was made is also called the "Linked -Version". - - The "Minimal Corresponding Source" for a Combined Work means the -Corresponding Source for the Combined Work, excluding any source code -for portions of the Combined Work that, considered in isolation, are -based on the Application, and not on the Linked Version. - - The "Corresponding Application Code" for a Combined Work means the -object code and/or source code for the Application, including any data -and utility programs needed for reproducing the Combined Work from the -Application, but excluding the System Libraries of the Combined Work. - - 1. Exception to Section 3 of the GNU GPL. - - You may convey a covered work under sections 3 and 4 of this License -without being bound by section 3 of the GNU GPL. - - 2. Conveying Modified Versions. - - If you modify a copy of the Library, and, in your modifications, a -facility refers to a function or data to be supplied by an Application -that uses the facility (other than as an argument passed when the -facility is invoked), then you may convey a copy of the modified -version: - - a) under this License, provided that you make a good faith effort to - ensure that, in the event an Application does not supply the - function or data, the facility still operates, and performs - whatever part of its purpose remains meaningful, or - - b) under the GNU GPL, with none of the additional permissions of - this License applicable to that copy. - - 3. Object Code Incorporating Material from Library Header Files. - - The object code form of an Application may incorporate material from -a header file that is part of the Library. You may convey such object -code under terms of your choice, provided that, if the incorporated -material is not limited to numerical parameters, data structure -layouts and accessors, or small macros, inline functions and templates -(ten or fewer lines in length), you do both of the following: - - a) Give prominent notice with each copy of the object code that the - Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the object code with a copy of the GNU GPL and this license - document. - - 4. Combined Works. - - You may convey a Combined Work under terms of your choice that, -taken together, effectively do not restrict modification of the -portions of the Library contained in the Combined Work and reverse -engineering for debugging such modifications, if you also do each of -the following: - - a) Give prominent notice with each copy of the Combined Work that - the Library is used in it and that the Library and its use are - covered by this License. - - b) Accompany the Combined Work with a copy of the GNU GPL and this license - document. - - c) For a Combined Work that displays copyright notices during - execution, include the copyright notice for the Library among - these notices, as well as a reference directing the user to the - copies of the GNU GPL and this license document. - - d) Do one of the following: - - 0) Convey the Minimal Corresponding Source under the terms of this - License, and the Corresponding Application Code in a form - suitable for, and under terms that permit, the user to - recombine or relink the Application with a modified version of - the Linked Version to produce a modified Combined Work, in the - manner specified by section 6 of the GNU GPL for conveying - Corresponding Source. - - 1) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (a) uses at run time - a copy of the Library already present on the user's computer - system, and (b) will operate properly with a modified version - of the Library that is interface-compatible with the Linked - Version. - - e) Provide Installation Information, but only if you would otherwise - be required to provide such information under section 6 of the - GNU GPL, and only to the extent that such information is - necessary to install and execute a modified version of the - Combined Work produced by recombining or relinking the - Application with a modified version of the Linked Version. (If - you use option 4d0, the Installation Information must accompany - the Minimal Corresponding Source and Corresponding Application - Code. If you use option 4d1, you must provide the Installation - Information in the manner specified by section 6 of the GNU GPL - for conveying Corresponding Source.) - - 5. Combined Libraries. - - You may place library facilities that are a work based on the -Library side by side in a single library together with other library -facilities that are not Applications and are not covered by this -License, and convey such a combined library under terms of your -choice, if you do both of the following: - - a) Accompany the combined library with a copy of the same work based - on the Library, uncombined with any other library facilities, - conveyed under the terms of this License. - - b) Give prominent notice with the combined library that part of it - is a work based on the Library, and explaining where to find the - accompanying uncombined form of the same work. - - 6. Revised Versions of the GNU Lesser General Public License. - - The Free Software Foundation may publish revised and/or new versions -of the GNU Lesser General Public License from time to time. Such new -versions will be similar in spirit to the present version, but may -differ in detail to address new problems or concerns. - - Each version is given a distinguishing version number. If the -Library as you received it specifies that a certain numbered version -of the GNU Lesser General Public License "or any later version" -applies to it, you have the option of following the terms and -conditions either of that published version or of any later version -published by the Free Software Foundation. If the Library as you -received it does not specify a version number of the GNU Lesser -General Public License, you may choose any version of the GNU Lesser -General Public License ever published by the Free Software Foundation. - - If the Library as you received it specifies that a proxy can decide -whether future versions of the GNU Lesser General Public License shall -apply, that proxy's public statement of acceptance of any version is -permanent authorization for you to choose that version for the -Library. diff --git a/contrib/aho-corasic/README.md b/contrib/aho-corasic/README.md deleted file mode 100644 index c245d6fc9..000000000 --- a/contrib/aho-corasic/README.md +++ /dev/null @@ -1,67 +0,0 @@ -aho-corasick -== - -Aho-Corasick parallel string search, using interleaved arrays. - -Mischa Sandberg mischasan@gmail.com - -ACISM is an implementation of Aho-Corasick parallel string search, -using an Interleaved State-transition Matrix. -It combines the fastest possible Aho-Corasick implementation, -with the smallest possible data structure (!). - -FEATURES --------- - -* Fast. No hashing, no tree traversal; just a straight look-up equivalent to - matrix[state, input-byte] per input character. - -* Tiny. On average, the whole data structure (mostly the array) takes about 2-3 bytes per - input pattern byte. The original set of pattern strings can be reverse-generated from the machine. - -* Shareable. The state machine contains no pointers, so it can be compiled once, - then memory-mapped by many processes. - -* Searches byte vectors, not null-terminated strings. - Suitable for searching machine code as much as searching text. - -* DOS-proof. Well, that's an attribute of Aho-Corasick, - so no real points for that. - -* Stream-ready. The state can be saved between calls to search data. - -DOCUMENTATION -------------- - -The GoogleDocs description is at http://goo.gl/lE6zG -I originally called it "psearch", but found that name was overused by other authors. - -LICENSE -------- - -Though I've had strong suggestions to go with BSD license, I'm going with GPL2 until I figure out -how to keep in touch with people who download and use the code. Hence the "CONTACT ME IF..." line in the license. - -GETTING STARTED ---------------- - -Download the source, type "gmake". -"gmake install" exports lib/libacism.a, include/acism.h and bin/acism_x. -"acism_x.c" is a good example of calling acism_create and acism_scan/acism_more. - -(If you're interested in the GNUmakefile and rules.mk, - check my blog posts on non-recursive make, at mischasan.wordpress.com.) - -HISTORY -------- - -The interleaved-array approach was tried and discarded in the late 70's, because the compile time was O(n^2). -acism_create beats the problem with a "hint" array that tracks the restart points for searches. -That, plus discarding the original idea of how to get maximal density, resulted in the tiny-fast win-win. - -ACKNOWLEDGEMENTS ----------------- - -I'd like to thank Mike Shannon, who wanted to see a machine built to make best use of L1/L2 cache. -The change to do that doubled performance on hardware with a much larger cache than the matrix. -Go figure. diff --git a/contrib/aho-corasic/_acism.h b/contrib/aho-corasic/_acism.h deleted file mode 100644 index b994bf073..000000000 --- a/contrib/aho-corasic/_acism.h +++ /dev/null @@ -1,105 +0,0 @@ -/* -** Copyright (C) 2009-2014 Mischa Sandberg -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU Lesser General Public License Version as -** published by the Free Software Foundation. You may not use, modify or -** distribute this program under any other version of the GNU Lesser General -** Public License. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -*/ -#ifndef _ACISM_H -#define _ACISM_H - -#include -#include // malloc -#include // memcpy - -typedef int (*qsort_cmp)(const void *, const void *); - -#ifndef ACISM_SIZE -# define ACISM_SIZE 4 -#endif - -#if ACISM_SIZE == 8 - typedef uint64_t TRAN, STATE, STRNO; -# define SYM_BITS 9U -# define SYM_MASK 511 -#else - typedef uint32_t TRAN, STATE, STRNO; -# define SYM_BITS psp->sym_bits -# define SYM_MASK psp->sym_mask -#endif - -typedef uint16_t SYMBOL; -typedef unsigned _SYMBOL; // An efficient stacklocal SYMBOL - -enum { - IS_MATCH = (TRAN)1 << (8*sizeof(TRAN) - 1), - IS_SUFFIX = (TRAN)1 << (8*sizeof(TRAN) - 2), - T_FLAGS = IS_MATCH | IS_SUFFIX -}; - -typedef struct { STATE state; STRNO strno; } STRASH; - -typedef enum { BASE=2, USED=1 } USES; - -struct acism { - TRAN* tranv; - STRASH* hashv; - unsigned flags; -# define IS_MMAP 1 - -#if ACISM_SIZE < 8 - TRAN sym_mask; - unsigned sym_bits; -#endif - unsigned hash_mod; // search hashv starting at (state + sym) % hash_mod. - unsigned hash_size; // #(hashv): hash_mod plus the overflows past [hash_mod-1] - unsigned tran_size; // #(tranv) - unsigned nsyms, nchars, nstrs, maxlen; - SYMBOL symv[256]; -}; - -#include "acism.h" - -// p_size: size of tranv + hashv -static inline unsigned p_size(ac_trie_t const *psp) -{ return psp->hash_size * sizeof*psp->hashv - + psp->tran_size * sizeof*psp->tranv; } - -static inline unsigned p_hash(ac_trie_t const *psp, STATE s) - { return s * 107 % psp->hash_mod; } - -static inline void set_tranv(ac_trie_t *psp, void *mem) - { psp->hashv = (STRASH*)&(psp->tranv = mem)[psp->tran_size]; } - -// TRAN accessors. For ACISM_SIZE=8, SYM_{BITS,MASK} do not use psp. - -static inline TRAN p_tran(ac_trie_t const *psp, STATE s, _SYMBOL sym) - { return psp->tranv[s + sym] ^ sym; } - -static inline _SYMBOL t_sym(ac_trie_t const *psp, TRAN t) - { (void)psp; return t & SYM_MASK; } - -static inline STATE t_next(ac_trie_t const *psp, TRAN t) - { (void)psp; return (t & ~T_FLAGS) >> SYM_BITS; } - -static inline int t_isleaf(ac_trie_t const *psp, TRAN t) - { return t_next(psp, t) >= psp->tran_size; } - -static inline int t_strno(ac_trie_t const *psp, TRAN t) - { return t_next(psp, t) - psp->tran_size; } - -static inline _SYMBOL t_valid(ac_trie_t const *psp, TRAN t) - { return !t_sym(psp, t); } - -#endif//_ACISM_H diff --git a/contrib/aho-corasic/acism.c b/contrib/aho-corasic/acism.c deleted file mode 100644 index a4c678154..000000000 --- a/contrib/aho-corasic/acism.c +++ /dev/null @@ -1,119 +0,0 @@ -/* -** Copyright (C) 2009-2014 Mischa Sandberg -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU Lesser General Public License Version as -** published by the Free Software Foundation. You may not use, modify or -** distribute this program under any other version of the GNU Lesser General -** Public License. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -*/ - -#include "_acism.h" - -#define BACK ((SYMBOL)0) -#define ROOT ((STATE) 0) - -int -acism_lookup(ac_trie_t const *psp, const char *text, size_t len, - ACISM_ACTION *cb, void *context, int *statep) -{ - ac_trie_t const ps = *psp; - char const *cp = text, *endp = cp + len; - STATE state = *statep; - int ret = 0; - - while (cp < endp) { - _SYMBOL sym = ps.symv[(uint8_t)*cp++]; - if (!sym) { - // Input byte is not in any pattern string. - state = ROOT; - continue; - } - - // Search for a valid transition from this (state, sym), - // following the backref chain. - - TRAN next; - while (!t_valid(&ps, next = p_tran(&ps, state, sym)) && state != ROOT) { - TRAN back = p_tran(&ps, state, BACK); - state = t_valid(&ps, back) ? t_next(&ps, back) : ROOT; - } - - if (!t_valid(&ps, next)) - continue; - - if (!(next & (IS_MATCH | IS_SUFFIX))) { - // No complete match yet; keep going. - state = t_next(&ps, next); - continue; - } - - // At this point, one or more patterns have matched. - // Find all matches by following the backref chain. - // A valid node for (sym) with no SUFFIX flag marks the - // end of the suffix chain. - // In the same backref traversal, find a new (state), - // if the original transition is to a leaf. - - STATE s = state; - - // Initially state is ROOT. The chain search saves the - // first state from which the next char has a transition. - state = t_isleaf(&ps, next) ? 0 : t_next(&ps, next); - - while (1) { - - if (t_valid(&ps, next)) { - - if (next & IS_MATCH) { - unsigned strno, ss = s + sym, i; - if (t_isleaf(&ps, ps.tranv[ss])) { - strno = t_strno(&ps, ps.tranv[ss]); - } else { - for (i = p_hash(&ps, ss); ps.hashv[i].state != ss; ++i); - strno = ps.hashv[i].strno; - } - - if ((ret = cb(strno, cp - text, context))) - goto EXIT; - } - - if (!state && !t_isleaf(&ps, next)) - state = t_next(&ps, next); - if ( state && !(next & IS_SUFFIX)) - break; - } - - if (s == ROOT) - break; - - TRAN b = p_tran(&ps, s, BACK); - s = t_valid(&ps, b) ? t_next(&ps, b) : ROOT; - next = p_tran(&ps, s, sym); - } - } -EXIT: - *statep = state; - return ret; -} - -void -acism_destroy(ac_trie_t *psp) -{ - if (!psp) return; - if (psp->flags & IS_MMAP) - munmap((char*)psp->tranv - sizeof(ac_trie_t), - sizeof(ac_trie_t) + p_size(psp)); - else free(psp->tranv); - free(psp); -} -//EOF diff --git a/contrib/aho-corasic/acism.h b/contrib/aho-corasic/acism.h deleted file mode 100644 index 3886b149e..000000000 --- a/contrib/aho-corasic/acism.h +++ /dev/null @@ -1,51 +0,0 @@ -/* -** Copyright (C) 2009-2014 Mischa Sandberg -** Copyright (C) 2015 Vsevolod Stakhov -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU Lesser General Public License as -** published by the Free Software Foundation. You may not use, modify or -** distribute this program under any other version of the GNU Lesser General -** Public License. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -*/ - -#ifndef ACISM_H -#define ACISM_H - -#include "config.h" -// "acism" uses MEMREF {ptr,len} bytevec structs for "string" args, -// rather than NUL-terminated "C" strings. - -typedef struct { char const *ptr; size_t len; } ac_trie_pat_t; - -typedef struct acism ac_trie_t; - -ac_trie_t* acism_create(ac_trie_pat_t const *strv, int nstrs); -void acism_destroy(ac_trie_t*); - -// For each match, acism_scan calls its ACISM_ACTION fn, -// giving it the strv[] index of the matched string, -// and the text[] offset of the byte PAST the end of the string. -// If ACISM_ACTION returns 0, search continues; otherwise, -// acism_more returns that nonzero value immediately. - -typedef int (ACISM_ACTION)(int strnum, int textpos, void *context); - -// If sequential blocks of (text) are passed to repeated acism_more calls, -// then search continues where the previous acism_more left off -- -// string matches can cross block boundaries. -// *state should initially be (0). - -int acism_lookup(ac_trie_t const *psp, const char *text, size_t len, - ACISM_ACTION *cb, void *context, int *statep); - -#endif//ACISM_H diff --git a/contrib/aho-corasic/acism_create.c b/contrib/aho-corasic/acism_create.c deleted file mode 100644 index b1d9b51e3..000000000 --- a/contrib/aho-corasic/acism_create.c +++ /dev/null @@ -1,396 +0,0 @@ -/* -** Copyright (C) 2009-2014 Mischa Sandberg -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU Lesser General Public License Version as -** published by the Free Software Foundation. You may not use, modify or -** distribute this program under any other version of the GNU Lesser General -** Public License. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU Lesser General Public License for more details. -** -** You should have received a copy of the GNU Lesser General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -*/ -#include "_acism.h" - -typedef struct tnode { - struct tnode *child, *next, *back; - union { unsigned nrefs; STATE state; } x; - STRNO match; - SYMBOL sym, is_suffix; -} TNODE; - -//--------------|--------------------------------------------- -// bitwid: 1+floor(log2(u)) -static inline int bitwid(unsigned u) -{ - int ret = !!u; - if (u & 0xFFFF0000) u >>= 16, ret += 16; - if (u & 0x0000FF00) u >>= 8, ret += 8; - if (u & 0x000000F0) u >>= 4, ret += 4; - if (u & 0x0000000C) u >>= 2, ret += 2; - if (u & 0x00000002) ret++; - return ret; -} -static void fill_symv(ac_trie_t*, ac_trie_pat_t const*, int ns); -static int create_tree(TNODE*, SYMBOL const*symv, ac_trie_pat_t const*strv, int nstrs); -static void add_backlinks(TNODE*, TNODE**, TNODE**); -static void prune_backlinks(TNODE*); -static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**); -static void fill_tranv(ac_trie_t*, TNODE const*); -static void fill_hashv(ac_trie_t*, TNODE const*, int nn); - -static TNODE* find_child(TNODE*, SYMBOL); - -// (ns) is either a STATE, or a (STRNO + tran_size) -static inline void -set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns) -{ - psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0) - | (suffix ? IS_SUFFIX : 0) - | (ns << SYM_BITS); -} - -// Track statistics for construction -#ifdef ACISM_STATS -typedef struct { long long val; const char *name; } PSSTAT; -extern PSSTAT psstat[]; -# define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n}) -# define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id) -#else -# define NOTE(n) (void)0 -# define HIT(id) (void)0 -#endif //ACISM_STATS - -//--------------|--------------------------------------------- -ac_trie_t* -acism_create(ac_trie_pat_t const* strv, int nstrs) -{ - TNODE *tp, **v1 = NULL, **v2 = NULL; - ac_trie_t *psp = calloc(1, sizeof*psp); - - fill_symv(psp, strv, nstrs); - TNODE *troot = calloc(psp->nchars + 1, sizeof*troot); - - int nnodes = create_tree(troot, psp->symv, strv, nstrs); - NOTE(nnodes); - - // v1, v2: breadth-first work vectors for add_backlink and interleave. - int nhash, i = (nstrs + 1) * sizeof*tp; - add_backlinks(troot, v1 = malloc(i), v2 = malloc(i)); - - for (tp = troot + nnodes, nhash = 0; --tp > troot;) { - prune_backlinks(tp); - nhash += tp->match && tp->child; - } - - // Calculate each node's offset in tranv[]: - psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2); - if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2) - goto FAIL; - - if (nhash) { - // Hash table is for match info of non-leaf nodes (only). - // Set hash_size for p_size(psp): - psp->hash_mod = nhash * 5 / 4 + 1; - // Initially oversize the table for overflows without wraparound. - psp->hash_size = psp->hash_mod + nhash; - } - - set_tranv(psp, calloc(p_size(psp), 1)); - if (!psp->tranv) goto FAIL; - fill_tranv(psp, troot); - // The root state (0) must not look like a valid backref. - // Any symbol value other than (0) in tranv[0] ensures that. - psp->tranv[0] = 1; - - if (nhash) { - fill_hashv(psp, troot, nnodes); - // Adjust hash_size to include trailing overflows - // but trim trailing empty slots. - psp->hash_size = psp->hash_mod; - while ( psp->hashv[psp->hash_size].state) ++psp->hash_size; - while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size; - set_tranv(psp, realloc(psp->tranv, p_size(psp))); - } - - // Diagnostics/statistics only: - psp->nstrs = nstrs; - for (i = psp->maxlen = 0; i < nstrs; ++i) - if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len; - - goto DONE; -FAIL: acism_destroy(psp), psp = NULL; -DONE: free(troot), free(v1), free(v2); - return psp; -} - -typedef struct { int freq, rank; } FRANK; -static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; } - -static void -fill_symv(ac_trie_t *psp, ac_trie_pat_t const *strv, int nstrs) -{ - int i, j; - FRANK frv[256]; - - for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i}; - for (i = 0; i < nstrs; ++i) - for (psp->nchars += j = strv[i].len; --j >= 0;) - frv[(uint8_t)strv[i].ptr[j]].freq++; - - qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp); - - for (i = 256; --i >= 0 && frv[i].freq;) - psp->symv[frv[i].rank] = ++psp->nsyms; - ++psp->nsyms; -#if ACISM_SIZE < 8 - psp->sym_bits = bitwid(psp->nsyms); - psp->sym_mask = ~(-1 << psp->sym_bits); -#endif -} - -static int -create_tree(TNODE *Tree, SYMBOL const *symv, ac_trie_pat_t const *strv, int nstrs) -{ - int i, j; - TNODE *nextp = Tree + 1; - - for (i = 0; i < nstrs; ++i) { - TNODE *tp = Tree; - - for (j = 0; tp->child && j < (int)strv[i].len; ++j) { - SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]]; - - if (sym < tp->child->sym) { - // Prep to insert new node before tp->child - nextp->next = tp->child; - break; - } - - tp = tp->child; - while (tp->next && sym >= tp->next->sym) - tp = tp->next; - - // Insert new sibling after tp - if (sym > tp->sym) { - nextp->next = tp->next; - tp = tp->next = nextp++; - tp->sym = sym; - tp->back = Tree; - } - } - - for (; j < (int) strv[i].len; ++j) { - tp = tp->child = nextp++; - tp->sym = symv[(uint8_t)strv[i].ptr[j]]; - tp->back = Tree; - } - - tp->match = i + 1; // Encode strno as nonzero - } - - return nextp - Tree; -} - -static void -add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2) -{ - TNODE *tp, **tmp; - - for (tp = troot->child, tmp = v1; tp; tp = tp->next) - *tmp++ = tp; - *tmp = NULL; - - while (*v1) { - TNODE **spp = v1, **dpp = v2, *srcp, *dstp; - while ((srcp = *spp++)) { - for (dstp = srcp->child; dstp; dstp = dstp->next) { - TNODE *bp = NULL; - *dpp++ = dstp; - for (tp = srcp->back; tp; tp = tp->back) - if ((bp = find_child(tp, dstp->sym))) break; - if (!bp) - bp = troot; - - dstp->back = dstp->child ? bp : tp ? tp : troot; - dstp->back->x.nrefs++; - dstp->is_suffix = bp->match || bp->is_suffix; - } - } - *dpp = 0; - tmp = v1; v1 = v2; v2 = tmp; - } -} - -static void -prune_backlinks(TNODE *tp) -{ - if (tp->x.nrefs || !tp->child) - return; - - TNODE *bp; - // (bp != bp->back IFF bp != troot) - while ((bp = tp->back) && !bp->match && bp != bp->back) { - HIT("backlinks"); - TNODE *cp = tp->child, *pp = bp->child; - - // Search for a child of bp that's not a child of tp - for (; cp && pp && pp->sym >= cp->sym; cp = cp->next) { - if (pp->sym == cp->sym) { - if (pp->match && cp->is_suffix) break; - pp = pp->next; - } - } - - if (pp) break; - - // So, target of back link is not a suffix match - // of this node, and its children are a subset - // of this node's children: prune it. - HIT("pruned"); - if ((tp->back = bp->back)) { - tp->back->x.nrefs++; - if (!--bp->x.nrefs) - prune_backlinks(bp); - } - } -} - -static int -interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2) -{ - unsigned usev_size = nnodes + nsyms; - char *usev = calloc(usev_size, sizeof*usev); - STATE last_trans = 0, startv[nsyms][2]; - TNODE *cp, **tmp; - - memset(startv, 0, nsyms * sizeof*startv); - - // Iterate through one level of the Tree at a time. - // That srsly improves locality (L1-cache use). - - v1[0] = troot, v1[1] = NULL; - for (; *v1; tmp = v1, v1 = v2, v2 = tmp) { - TNODE **srcp = v1, **dstp = v2, *tp; - while ((tp = *srcp++)) { - if (!tp->child) continue; - - HIT("nonleaf"); - if (tp->back == troot) tp->back = NULL; // simplify tests. - cp = tp->child; - - STATE pos, *startp = &startv[cp->sym][!!tp->back]; - while ((cp = cp->next)) { - STATE *newp = &startv[cp->sym][!!tp->back]; - if (*startp < *newp) startp = newp; - } - - // If (tp) has a backref, we need a slot at offset 0 - // that is free as a base AND to be used (filled in). - char need = tp->back ? BASE|USED : BASE; - for (pos = *startp;; ++pos) { - if (usev[pos] & need) { - HIT("inner loop"); - continue; - } - - for (cp = tp->child; cp; cp = cp->next) { - HIT("child loop"); - if (usev[pos + cp->sym] & USED) break; - } - - // No child needs an in-use slot? We're done. - if (!cp) break; - } - tp->x.state = pos; - - // Mark node's base and children as used: - usev[pos] |= need; - STATE last = 0; // Make compiler happy - int nkids = 0; - for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids) - usev[last = pos + cp->sym] |= USED; - - // This is a HEURISTIC for advancing search for other nodes - *startp += (pos - *startp) / nkids; - - if (last_trans < last) { - last_trans = last; - if (last + nsyms >= usev_size) { - usev = realloc(usev, usev_size << 1); - memset(usev + usev_size, 0, usev_size); - usev_size <<= 1; - } - } - } - - *dstp = NULL; - } - - free(usev); - return last_trans + 1; -} - -static void -fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes) -{ - STRASH *sv = malloc(psp->hash_mod * sizeof*sv), *sp = sv; - int i; - - // First pass: insert without resolving collisions. - for (i = 0; i < nnodes; ++i) { - STATE base = treev[i].x.state; - TNODE const *tp; - for (tp = treev[i].child; tp; tp = tp->next) { - if (tp->match && tp->child) { - STATE state = base + tp->sym; - STRASH *hp = &psp->hashv[p_hash(psp, state)]; - *(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1}; - } - } - } - - while (--sp >= sv) { - HIT("hash collisions"); - for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i) - HIT("hash displacements"); - psp->hashv[i] = *sp; - } - - free(sv); -} - -static void -fill_tranv(ac_trie_t *psp, TNODE const*tp) -{ - TNODE const *cp = tp->child; - - if (cp && tp->back) - set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state); - - for (; cp; cp = cp->next) { - //NOTE: cp->match is (strno+1) so that !cp->match means "no match". - set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix, - cp->child ? cp->x.state : cp->match - 1 + psp->tran_size); - if (cp->child) - fill_tranv(psp, cp); - } -} - -static TNODE * -find_child(TNODE *tp, SYMBOL sym) -{ - for (tp = tp->child; tp && tp->sym < sym; tp = tp->next); - return tp && tp->sym == sym ? tp : NULL; -} - -#ifdef ACISM_STATS -PSSTAT psstat[__LINE__] = {{__LINE__,0}}; -#endif//ACISM_STATS -//EOF diff --git a/contrib/aho-corasick/CMakeLists.txt b/contrib/aho-corasick/CMakeLists.txt new file mode 100644 index 000000000..2240f9b64 --- /dev/null +++ b/contrib/aho-corasick/CMakeLists.txt @@ -0,0 +1,8 @@ +SET(AHOCORASICSRC acism_create.c + acism.c) +if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") +endif () +ADD_LIBRARY(rspamd-actrie SHARED ${AHOCORASICSRC}) +INSTALL(TARGETS rspamd-actrie + LIBRARY DESTINATION lib) \ No newline at end of file diff --git a/contrib/aho-corasick/LICENSE b/contrib/aho-corasick/LICENSE new file mode 100644 index 000000000..65c5ca88a --- /dev/null +++ b/contrib/aho-corasick/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/contrib/aho-corasick/README.md b/contrib/aho-corasick/README.md new file mode 100644 index 000000000..c245d6fc9 --- /dev/null +++ b/contrib/aho-corasick/README.md @@ -0,0 +1,67 @@ +aho-corasick +== + +Aho-Corasick parallel string search, using interleaved arrays. + +Mischa Sandberg mischasan@gmail.com + +ACISM is an implementation of Aho-Corasick parallel string search, +using an Interleaved State-transition Matrix. +It combines the fastest possible Aho-Corasick implementation, +with the smallest possible data structure (!). + +FEATURES +-------- + +* Fast. No hashing, no tree traversal; just a straight look-up equivalent to + matrix[state, input-byte] per input character. + +* Tiny. On average, the whole data structure (mostly the array) takes about 2-3 bytes per + input pattern byte. The original set of pattern strings can be reverse-generated from the machine. + +* Shareable. The state machine contains no pointers, so it can be compiled once, + then memory-mapped by many processes. + +* Searches byte vectors, not null-terminated strings. + Suitable for searching machine code as much as searching text. + +* DOS-proof. Well, that's an attribute of Aho-Corasick, + so no real points for that. + +* Stream-ready. The state can be saved between calls to search data. + +DOCUMENTATION +------------- + +The GoogleDocs description is at http://goo.gl/lE6zG +I originally called it "psearch", but found that name was overused by other authors. + +LICENSE +------- + +Though I've had strong suggestions to go with BSD license, I'm going with GPL2 until I figure out +how to keep in touch with people who download and use the code. Hence the "CONTACT ME IF..." line in the license. + +GETTING STARTED +--------------- + +Download the source, type "gmake". +"gmake install" exports lib/libacism.a, include/acism.h and bin/acism_x. +"acism_x.c" is a good example of calling acism_create and acism_scan/acism_more. + +(If you're interested in the GNUmakefile and rules.mk, + check my blog posts on non-recursive make, at mischasan.wordpress.com.) + +HISTORY +------- + +The interleaved-array approach was tried and discarded in the late 70's, because the compile time was O(n^2). +acism_create beats the problem with a "hint" array that tracks the restart points for searches. +That, plus discarding the original idea of how to get maximal density, resulted in the tiny-fast win-win. + +ACKNOWLEDGEMENTS +---------------- + +I'd like to thank Mike Shannon, who wanted to see a machine built to make best use of L1/L2 cache. +The change to do that doubled performance on hardware with a much larger cache than the matrix. +Go figure. diff --git a/contrib/aho-corasick/_acism.h b/contrib/aho-corasick/_acism.h new file mode 100644 index 000000000..b994bf073 --- /dev/null +++ b/contrib/aho-corasick/_acism.h @@ -0,0 +1,105 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License Version as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ +#ifndef _ACISM_H +#define _ACISM_H + +#include +#include // malloc +#include // memcpy + +typedef int (*qsort_cmp)(const void *, const void *); + +#ifndef ACISM_SIZE +# define ACISM_SIZE 4 +#endif + +#if ACISM_SIZE == 8 + typedef uint64_t TRAN, STATE, STRNO; +# define SYM_BITS 9U +# define SYM_MASK 511 +#else + typedef uint32_t TRAN, STATE, STRNO; +# define SYM_BITS psp->sym_bits +# define SYM_MASK psp->sym_mask +#endif + +typedef uint16_t SYMBOL; +typedef unsigned _SYMBOL; // An efficient stacklocal SYMBOL + +enum { + IS_MATCH = (TRAN)1 << (8*sizeof(TRAN) - 1), + IS_SUFFIX = (TRAN)1 << (8*sizeof(TRAN) - 2), + T_FLAGS = IS_MATCH | IS_SUFFIX +}; + +typedef struct { STATE state; STRNO strno; } STRASH; + +typedef enum { BASE=2, USED=1 } USES; + +struct acism { + TRAN* tranv; + STRASH* hashv; + unsigned flags; +# define IS_MMAP 1 + +#if ACISM_SIZE < 8 + TRAN sym_mask; + unsigned sym_bits; +#endif + unsigned hash_mod; // search hashv starting at (state + sym) % hash_mod. + unsigned hash_size; // #(hashv): hash_mod plus the overflows past [hash_mod-1] + unsigned tran_size; // #(tranv) + unsigned nsyms, nchars, nstrs, maxlen; + SYMBOL symv[256]; +}; + +#include "acism.h" + +// p_size: size of tranv + hashv +static inline unsigned p_size(ac_trie_t const *psp) +{ return psp->hash_size * sizeof*psp->hashv + + psp->tran_size * sizeof*psp->tranv; } + +static inline unsigned p_hash(ac_trie_t const *psp, STATE s) + { return s * 107 % psp->hash_mod; } + +static inline void set_tranv(ac_trie_t *psp, void *mem) + { psp->hashv = (STRASH*)&(psp->tranv = mem)[psp->tran_size]; } + +// TRAN accessors. For ACISM_SIZE=8, SYM_{BITS,MASK} do not use psp. + +static inline TRAN p_tran(ac_trie_t const *psp, STATE s, _SYMBOL sym) + { return psp->tranv[s + sym] ^ sym; } + +static inline _SYMBOL t_sym(ac_trie_t const *psp, TRAN t) + { (void)psp; return t & SYM_MASK; } + +static inline STATE t_next(ac_trie_t const *psp, TRAN t) + { (void)psp; return (t & ~T_FLAGS) >> SYM_BITS; } + +static inline int t_isleaf(ac_trie_t const *psp, TRAN t) + { return t_next(psp, t) >= psp->tran_size; } + +static inline int t_strno(ac_trie_t const *psp, TRAN t) + { return t_next(psp, t) - psp->tran_size; } + +static inline _SYMBOL t_valid(ac_trie_t const *psp, TRAN t) + { return !t_sym(psp, t); } + +#endif//_ACISM_H diff --git a/contrib/aho-corasick/acism.c b/contrib/aho-corasick/acism.c new file mode 100644 index 000000000..a4c678154 --- /dev/null +++ b/contrib/aho-corasick/acism.c @@ -0,0 +1,119 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License Version as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#include "_acism.h" + +#define BACK ((SYMBOL)0) +#define ROOT ((STATE) 0) + +int +acism_lookup(ac_trie_t const *psp, const char *text, size_t len, + ACISM_ACTION *cb, void *context, int *statep) +{ + ac_trie_t const ps = *psp; + char const *cp = text, *endp = cp + len; + STATE state = *statep; + int ret = 0; + + while (cp < endp) { + _SYMBOL sym = ps.symv[(uint8_t)*cp++]; + if (!sym) { + // Input byte is not in any pattern string. + state = ROOT; + continue; + } + + // Search for a valid transition from this (state, sym), + // following the backref chain. + + TRAN next; + while (!t_valid(&ps, next = p_tran(&ps, state, sym)) && state != ROOT) { + TRAN back = p_tran(&ps, state, BACK); + state = t_valid(&ps, back) ? t_next(&ps, back) : ROOT; + } + + if (!t_valid(&ps, next)) + continue; + + if (!(next & (IS_MATCH | IS_SUFFIX))) { + // No complete match yet; keep going. + state = t_next(&ps, next); + continue; + } + + // At this point, one or more patterns have matched. + // Find all matches by following the backref chain. + // A valid node for (sym) with no SUFFIX flag marks the + // end of the suffix chain. + // In the same backref traversal, find a new (state), + // if the original transition is to a leaf. + + STATE s = state; + + // Initially state is ROOT. The chain search saves the + // first state from which the next char has a transition. + state = t_isleaf(&ps, next) ? 0 : t_next(&ps, next); + + while (1) { + + if (t_valid(&ps, next)) { + + if (next & IS_MATCH) { + unsigned strno, ss = s + sym, i; + if (t_isleaf(&ps, ps.tranv[ss])) { + strno = t_strno(&ps, ps.tranv[ss]); + } else { + for (i = p_hash(&ps, ss); ps.hashv[i].state != ss; ++i); + strno = ps.hashv[i].strno; + } + + if ((ret = cb(strno, cp - text, context))) + goto EXIT; + } + + if (!state && !t_isleaf(&ps, next)) + state = t_next(&ps, next); + if ( state && !(next & IS_SUFFIX)) + break; + } + + if (s == ROOT) + break; + + TRAN b = p_tran(&ps, s, BACK); + s = t_valid(&ps, b) ? t_next(&ps, b) : ROOT; + next = p_tran(&ps, s, sym); + } + } +EXIT: + *statep = state; + return ret; +} + +void +acism_destroy(ac_trie_t *psp) +{ + if (!psp) return; + if (psp->flags & IS_MMAP) + munmap((char*)psp->tranv - sizeof(ac_trie_t), + sizeof(ac_trie_t) + p_size(psp)); + else free(psp->tranv); + free(psp); +} +//EOF diff --git a/contrib/aho-corasick/acism.h b/contrib/aho-corasick/acism.h new file mode 100644 index 000000000..3886b149e --- /dev/null +++ b/contrib/aho-corasick/acism.h @@ -0,0 +1,51 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** Copyright (C) 2015 Vsevolod Stakhov +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#ifndef ACISM_H +#define ACISM_H + +#include "config.h" +// "acism" uses MEMREF {ptr,len} bytevec structs for "string" args, +// rather than NUL-terminated "C" strings. + +typedef struct { char const *ptr; size_t len; } ac_trie_pat_t; + +typedef struct acism ac_trie_t; + +ac_trie_t* acism_create(ac_trie_pat_t const *strv, int nstrs); +void acism_destroy(ac_trie_t*); + +// For each match, acism_scan calls its ACISM_ACTION fn, +// giving it the strv[] index of the matched string, +// and the text[] offset of the byte PAST the end of the string. +// If ACISM_ACTION returns 0, search continues; otherwise, +// acism_more returns that nonzero value immediately. + +typedef int (ACISM_ACTION)(int strnum, int textpos, void *context); + +// If sequential blocks of (text) are passed to repeated acism_more calls, +// then search continues where the previous acism_more left off -- +// string matches can cross block boundaries. +// *state should initially be (0). + +int acism_lookup(ac_trie_t const *psp, const char *text, size_t len, + ACISM_ACTION *cb, void *context, int *statep); + +#endif//ACISM_H diff --git a/contrib/aho-corasick/acism_create.c b/contrib/aho-corasick/acism_create.c new file mode 100644 index 000000000..b1d9b51e3 --- /dev/null +++ b/contrib/aho-corasick/acism_create.c @@ -0,0 +1,396 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License Version as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ +#include "_acism.h" + +typedef struct tnode { + struct tnode *child, *next, *back; + union { unsigned nrefs; STATE state; } x; + STRNO match; + SYMBOL sym, is_suffix; +} TNODE; + +//--------------|--------------------------------------------- +// bitwid: 1+floor(log2(u)) +static inline int bitwid(unsigned u) +{ + int ret = !!u; + if (u & 0xFFFF0000) u >>= 16, ret += 16; + if (u & 0x0000FF00) u >>= 8, ret += 8; + if (u & 0x000000F0) u >>= 4, ret += 4; + if (u & 0x0000000C) u >>= 2, ret += 2; + if (u & 0x00000002) ret++; + return ret; +} +static void fill_symv(ac_trie_t*, ac_trie_pat_t const*, int ns); +static int create_tree(TNODE*, SYMBOL const*symv, ac_trie_pat_t const*strv, int nstrs); +static void add_backlinks(TNODE*, TNODE**, TNODE**); +static void prune_backlinks(TNODE*); +static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**); +static void fill_tranv(ac_trie_t*, TNODE const*); +static void fill_hashv(ac_trie_t*, TNODE const*, int nn); + +static TNODE* find_child(TNODE*, SYMBOL); + +// (ns) is either a STATE, or a (STRNO + tran_size) +static inline void +set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns) +{ + psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0) + | (suffix ? IS_SUFFIX : 0) + | (ns << SYM_BITS); +} + +// Track statistics for construction +#ifdef ACISM_STATS +typedef struct { long long val; const char *name; } PSSTAT; +extern PSSTAT psstat[]; +# define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n}) +# define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id) +#else +# define NOTE(n) (void)0 +# define HIT(id) (void)0 +#endif //ACISM_STATS + +//--------------|--------------------------------------------- +ac_trie_t* +acism_create(ac_trie_pat_t const* strv, int nstrs) +{ + TNODE *tp, **v1 = NULL, **v2 = NULL; + ac_trie_t *psp = calloc(1, sizeof*psp); + + fill_symv(psp, strv, nstrs); + TNODE *troot = calloc(psp->nchars + 1, sizeof*troot); + + int nnodes = create_tree(troot, psp->symv, strv, nstrs); + NOTE(nnodes); + + // v1, v2: breadth-first work vectors for add_backlink and interleave. + int nhash, i = (nstrs + 1) * sizeof*tp; + add_backlinks(troot, v1 = malloc(i), v2 = malloc(i)); + + for (tp = troot + nnodes, nhash = 0; --tp > troot;) { + prune_backlinks(tp); + nhash += tp->match && tp->child; + } + + // Calculate each node's offset in tranv[]: + psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2); + if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2) + goto FAIL; + + if (nhash) { + // Hash table is for match info of non-leaf nodes (only). + // Set hash_size for p_size(psp): + psp->hash_mod = nhash * 5 / 4 + 1; + // Initially oversize the table for overflows without wraparound. + psp->hash_size = psp->hash_mod + nhash; + } + + set_tranv(psp, calloc(p_size(psp), 1)); + if (!psp->tranv) goto FAIL; + fill_tranv(psp, troot); + // The root state (0) must not look like a valid backref. + // Any symbol value other than (0) in tranv[0] ensures that. + psp->tranv[0] = 1; + + if (nhash) { + fill_hashv(psp, troot, nnodes); + // Adjust hash_size to include trailing overflows + // but trim trailing empty slots. + psp->hash_size = psp->hash_mod; + while ( psp->hashv[psp->hash_size].state) ++psp->hash_size; + while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size; + set_tranv(psp, realloc(psp->tranv, p_size(psp))); + } + + // Diagnostics/statistics only: + psp->nstrs = nstrs; + for (i = psp->maxlen = 0; i < nstrs; ++i) + if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len; + + goto DONE; +FAIL: acism_destroy(psp), psp = NULL; +DONE: free(troot), free(v1), free(v2); + return psp; +} + +typedef struct { int freq, rank; } FRANK; +static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; } + +static void +fill_symv(ac_trie_t *psp, ac_trie_pat_t const *strv, int nstrs) +{ + int i, j; + FRANK frv[256]; + + for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i}; + for (i = 0; i < nstrs; ++i) + for (psp->nchars += j = strv[i].len; --j >= 0;) + frv[(uint8_t)strv[i].ptr[j]].freq++; + + qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp); + + for (i = 256; --i >= 0 && frv[i].freq;) + psp->symv[frv[i].rank] = ++psp->nsyms; + ++psp->nsyms; +#if ACISM_SIZE < 8 + psp->sym_bits = bitwid(psp->nsyms); + psp->sym_mask = ~(-1 << psp->sym_bits); +#endif +} + +static int +create_tree(TNODE *Tree, SYMBOL const *symv, ac_trie_pat_t const *strv, int nstrs) +{ + int i, j; + TNODE *nextp = Tree + 1; + + for (i = 0; i < nstrs; ++i) { + TNODE *tp = Tree; + + for (j = 0; tp->child && j < (int)strv[i].len; ++j) { + SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]]; + + if (sym < tp->child->sym) { + // Prep to insert new node before tp->child + nextp->next = tp->child; + break; + } + + tp = tp->child; + while (tp->next && sym >= tp->next->sym) + tp = tp->next; + + // Insert new sibling after tp + if (sym > tp->sym) { + nextp->next = tp->next; + tp = tp->next = nextp++; + tp->sym = sym; + tp->back = Tree; + } + } + + for (; j < (int) strv[i].len; ++j) { + tp = tp->child = nextp++; + tp->sym = symv[(uint8_t)strv[i].ptr[j]]; + tp->back = Tree; + } + + tp->match = i + 1; // Encode strno as nonzero + } + + return nextp - Tree; +} + +static void +add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2) +{ + TNODE *tp, **tmp; + + for (tp = troot->child, tmp = v1; tp; tp = tp->next) + *tmp++ = tp; + *tmp = NULL; + + while (*v1) { + TNODE **spp = v1, **dpp = v2, *srcp, *dstp; + while ((srcp = *spp++)) { + for (dstp = srcp->child; dstp; dstp = dstp->next) { + TNODE *bp = NULL; + *dpp++ = dstp; + for (tp = srcp->back; tp; tp = tp->back) + if ((bp = find_child(tp, dstp->sym))) break; + if (!bp) + bp = troot; + + dstp->back = dstp->child ? bp : tp ? tp : troot; + dstp->back->x.nrefs++; + dstp->is_suffix = bp->match || bp->is_suffix; + } + } + *dpp = 0; + tmp = v1; v1 = v2; v2 = tmp; + } +} + +static void +prune_backlinks(TNODE *tp) +{ + if (tp->x.nrefs || !tp->child) + return; + + TNODE *bp; + // (bp != bp->back IFF bp != troot) + while ((bp = tp->back) && !bp->match && bp != bp->back) { + HIT("backlinks"); + TNODE *cp = tp->child, *pp = bp->child; + + // Search for a child of bp that's not a child of tp + for (; cp && pp && pp->sym >= cp->sym; cp = cp->next) { + if (pp->sym == cp->sym) { + if (pp->match && cp->is_suffix) break; + pp = pp->next; + } + } + + if (pp) break; + + // So, target of back link is not a suffix match + // of this node, and its children are a subset + // of this node's children: prune it. + HIT("pruned"); + if ((tp->back = bp->back)) { + tp->back->x.nrefs++; + if (!--bp->x.nrefs) + prune_backlinks(bp); + } + } +} + +static int +interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2) +{ + unsigned usev_size = nnodes + nsyms; + char *usev = calloc(usev_size, sizeof*usev); + STATE last_trans = 0, startv[nsyms][2]; + TNODE *cp, **tmp; + + memset(startv, 0, nsyms * sizeof*startv); + + // Iterate through one level of the Tree at a time. + // That srsly improves locality (L1-cache use). + + v1[0] = troot, v1[1] = NULL; + for (; *v1; tmp = v1, v1 = v2, v2 = tmp) { + TNODE **srcp = v1, **dstp = v2, *tp; + while ((tp = *srcp++)) { + if (!tp->child) continue; + + HIT("nonleaf"); + if (tp->back == troot) tp->back = NULL; // simplify tests. + cp = tp->child; + + STATE pos, *startp = &startv[cp->sym][!!tp->back]; + while ((cp = cp->next)) { + STATE *newp = &startv[cp->sym][!!tp->back]; + if (*startp < *newp) startp = newp; + } + + // If (tp) has a backref, we need a slot at offset 0 + // that is free as a base AND to be used (filled in). + char need = tp->back ? BASE|USED : BASE; + for (pos = *startp;; ++pos) { + if (usev[pos] & need) { + HIT("inner loop"); + continue; + } + + for (cp = tp->child; cp; cp = cp->next) { + HIT("child loop"); + if (usev[pos + cp->sym] & USED) break; + } + + // No child needs an in-use slot? We're done. + if (!cp) break; + } + tp->x.state = pos; + + // Mark node's base and children as used: + usev[pos] |= need; + STATE last = 0; // Make compiler happy + int nkids = 0; + for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids) + usev[last = pos + cp->sym] |= USED; + + // This is a HEURISTIC for advancing search for other nodes + *startp += (pos - *startp) / nkids; + + if (last_trans < last) { + last_trans = last; + if (last + nsyms >= usev_size) { + usev = realloc(usev, usev_size << 1); + memset(usev + usev_size, 0, usev_size); + usev_size <<= 1; + } + } + } + + *dstp = NULL; + } + + free(usev); + return last_trans + 1; +} + +static void +fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes) +{ + STRASH *sv = malloc(psp->hash_mod * sizeof*sv), *sp = sv; + int i; + + // First pass: insert without resolving collisions. + for (i = 0; i < nnodes; ++i) { + STATE base = treev[i].x.state; + TNODE const *tp; + for (tp = treev[i].child; tp; tp = tp->next) { + if (tp->match && tp->child) { + STATE state = base + tp->sym; + STRASH *hp = &psp->hashv[p_hash(psp, state)]; + *(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1}; + } + } + } + + while (--sp >= sv) { + HIT("hash collisions"); + for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i) + HIT("hash displacements"); + psp->hashv[i] = *sp; + } + + free(sv); +} + +static void +fill_tranv(ac_trie_t *psp, TNODE const*tp) +{ + TNODE const *cp = tp->child; + + if (cp && tp->back) + set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state); + + for (; cp; cp = cp->next) { + //NOTE: cp->match is (strno+1) so that !cp->match means "no match". + set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix, + cp->child ? cp->x.state : cp->match - 1 + psp->tran_size); + if (cp->child) + fill_tranv(psp, cp); + } +} + +static TNODE * +find_child(TNODE *tp, SYMBOL sym) +{ + for (tp = tp->child; tp && tp->sym < sym; tp = tp->next); + return tp && tp->sym == sym ? tp : NULL; +} + +#ifdef ACISM_STATS +PSSTAT psstat[__LINE__] = {{__LINE__,0}}; +#endif//ACISM_STATS +//EOF