From 8eca74d2778a0b272778501e908128386c5dd8f0 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 6 Apr 2015 18:04:58 +0100 Subject: Corasic -> Corasick. --- contrib/aho-corasick/CMakeLists.txt | 8 + contrib/aho-corasick/LICENSE | 165 +++++++++++++++ contrib/aho-corasick/README.md | 67 ++++++ contrib/aho-corasick/_acism.h | 105 ++++++++++ contrib/aho-corasick/acism.c | 119 +++++++++++ contrib/aho-corasick/acism.h | 51 +++++ contrib/aho-corasick/acism_create.c | 396 ++++++++++++++++++++++++++++++++++++ 7 files changed, 911 insertions(+) create mode 100644 contrib/aho-corasick/CMakeLists.txt create mode 100644 contrib/aho-corasick/LICENSE create mode 100644 contrib/aho-corasick/README.md create mode 100644 contrib/aho-corasick/_acism.h create mode 100644 contrib/aho-corasick/acism.c create mode 100644 contrib/aho-corasick/acism.h create mode 100644 contrib/aho-corasick/acism_create.c (limited to 'contrib/aho-corasick') diff --git a/contrib/aho-corasick/CMakeLists.txt b/contrib/aho-corasick/CMakeLists.txt new file mode 100644 index 000000000..2240f9b64 --- /dev/null +++ b/contrib/aho-corasick/CMakeLists.txt @@ -0,0 +1,8 @@ +SET(AHOCORASICSRC acism_create.c + acism.c) +if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") +endif () +ADD_LIBRARY(rspamd-actrie SHARED ${AHOCORASICSRC}) +INSTALL(TARGETS rspamd-actrie + LIBRARY DESTINATION lib) \ No newline at end of file diff --git a/contrib/aho-corasick/LICENSE b/contrib/aho-corasick/LICENSE new file mode 100644 index 000000000..65c5ca88a --- /dev/null +++ b/contrib/aho-corasick/LICENSE @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/contrib/aho-corasick/README.md b/contrib/aho-corasick/README.md new file mode 100644 index 000000000..c245d6fc9 --- /dev/null +++ b/contrib/aho-corasick/README.md @@ -0,0 +1,67 @@ +aho-corasick +== + +Aho-Corasick parallel string search, using interleaved arrays. + +Mischa Sandberg mischasan@gmail.com + +ACISM is an implementation of Aho-Corasick parallel string search, +using an Interleaved State-transition Matrix. +It combines the fastest possible Aho-Corasick implementation, +with the smallest possible data structure (!). + +FEATURES +-------- + +* Fast. No hashing, no tree traversal; just a straight look-up equivalent to + matrix[state, input-byte] per input character. + +* Tiny. On average, the whole data structure (mostly the array) takes about 2-3 bytes per + input pattern byte. The original set of pattern strings can be reverse-generated from the machine. + +* Shareable. The state machine contains no pointers, so it can be compiled once, + then memory-mapped by many processes. + +* Searches byte vectors, not null-terminated strings. + Suitable for searching machine code as much as searching text. + +* DOS-proof. Well, that's an attribute of Aho-Corasick, + so no real points for that. + +* Stream-ready. The state can be saved between calls to search data. + +DOCUMENTATION +------------- + +The GoogleDocs description is at http://goo.gl/lE6zG +I originally called it "psearch", but found that name was overused by other authors. + +LICENSE +------- + +Though I've had strong suggestions to go with BSD license, I'm going with GPL2 until I figure out +how to keep in touch with people who download and use the code. Hence the "CONTACT ME IF..." line in the license. + +GETTING STARTED +--------------- + +Download the source, type "gmake". +"gmake install" exports lib/libacism.a, include/acism.h and bin/acism_x. +"acism_x.c" is a good example of calling acism_create and acism_scan/acism_more. + +(If you're interested in the GNUmakefile and rules.mk, + check my blog posts on non-recursive make, at mischasan.wordpress.com.) + +HISTORY +------- + +The interleaved-array approach was tried and discarded in the late 70's, because the compile time was O(n^2). +acism_create beats the problem with a "hint" array that tracks the restart points for searches. +That, plus discarding the original idea of how to get maximal density, resulted in the tiny-fast win-win. + +ACKNOWLEDGEMENTS +---------------- + +I'd like to thank Mike Shannon, who wanted to see a machine built to make best use of L1/L2 cache. +The change to do that doubled performance on hardware with a much larger cache than the matrix. +Go figure. diff --git a/contrib/aho-corasick/_acism.h b/contrib/aho-corasick/_acism.h new file mode 100644 index 000000000..b994bf073 --- /dev/null +++ b/contrib/aho-corasick/_acism.h @@ -0,0 +1,105 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License Version as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ +#ifndef _ACISM_H +#define _ACISM_H + +#include +#include // malloc +#include // memcpy + +typedef int (*qsort_cmp)(const void *, const void *); + +#ifndef ACISM_SIZE +# define ACISM_SIZE 4 +#endif + +#if ACISM_SIZE == 8 + typedef uint64_t TRAN, STATE, STRNO; +# define SYM_BITS 9U +# define SYM_MASK 511 +#else + typedef uint32_t TRAN, STATE, STRNO; +# define SYM_BITS psp->sym_bits +# define SYM_MASK psp->sym_mask +#endif + +typedef uint16_t SYMBOL; +typedef unsigned _SYMBOL; // An efficient stacklocal SYMBOL + +enum { + IS_MATCH = (TRAN)1 << (8*sizeof(TRAN) - 1), + IS_SUFFIX = (TRAN)1 << (8*sizeof(TRAN) - 2), + T_FLAGS = IS_MATCH | IS_SUFFIX +}; + +typedef struct { STATE state; STRNO strno; } STRASH; + +typedef enum { BASE=2, USED=1 } USES; + +struct acism { + TRAN* tranv; + STRASH* hashv; + unsigned flags; +# define IS_MMAP 1 + +#if ACISM_SIZE < 8 + TRAN sym_mask; + unsigned sym_bits; +#endif + unsigned hash_mod; // search hashv starting at (state + sym) % hash_mod. + unsigned hash_size; // #(hashv): hash_mod plus the overflows past [hash_mod-1] + unsigned tran_size; // #(tranv) + unsigned nsyms, nchars, nstrs, maxlen; + SYMBOL symv[256]; +}; + +#include "acism.h" + +// p_size: size of tranv + hashv +static inline unsigned p_size(ac_trie_t const *psp) +{ return psp->hash_size * sizeof*psp->hashv + + psp->tran_size * sizeof*psp->tranv; } + +static inline unsigned p_hash(ac_trie_t const *psp, STATE s) + { return s * 107 % psp->hash_mod; } + +static inline void set_tranv(ac_trie_t *psp, void *mem) + { psp->hashv = (STRASH*)&(psp->tranv = mem)[psp->tran_size]; } + +// TRAN accessors. For ACISM_SIZE=8, SYM_{BITS,MASK} do not use psp. + +static inline TRAN p_tran(ac_trie_t const *psp, STATE s, _SYMBOL sym) + { return psp->tranv[s + sym] ^ sym; } + +static inline _SYMBOL t_sym(ac_trie_t const *psp, TRAN t) + { (void)psp; return t & SYM_MASK; } + +static inline STATE t_next(ac_trie_t const *psp, TRAN t) + { (void)psp; return (t & ~T_FLAGS) >> SYM_BITS; } + +static inline int t_isleaf(ac_trie_t const *psp, TRAN t) + { return t_next(psp, t) >= psp->tran_size; } + +static inline int t_strno(ac_trie_t const *psp, TRAN t) + { return t_next(psp, t) - psp->tran_size; } + +static inline _SYMBOL t_valid(ac_trie_t const *psp, TRAN t) + { return !t_sym(psp, t); } + +#endif//_ACISM_H diff --git a/contrib/aho-corasick/acism.c b/contrib/aho-corasick/acism.c new file mode 100644 index 000000000..a4c678154 --- /dev/null +++ b/contrib/aho-corasick/acism.c @@ -0,0 +1,119 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License Version as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#include "_acism.h" + +#define BACK ((SYMBOL)0) +#define ROOT ((STATE) 0) + +int +acism_lookup(ac_trie_t const *psp, const char *text, size_t len, + ACISM_ACTION *cb, void *context, int *statep) +{ + ac_trie_t const ps = *psp; + char const *cp = text, *endp = cp + len; + STATE state = *statep; + int ret = 0; + + while (cp < endp) { + _SYMBOL sym = ps.symv[(uint8_t)*cp++]; + if (!sym) { + // Input byte is not in any pattern string. + state = ROOT; + continue; + } + + // Search for a valid transition from this (state, sym), + // following the backref chain. + + TRAN next; + while (!t_valid(&ps, next = p_tran(&ps, state, sym)) && state != ROOT) { + TRAN back = p_tran(&ps, state, BACK); + state = t_valid(&ps, back) ? t_next(&ps, back) : ROOT; + } + + if (!t_valid(&ps, next)) + continue; + + if (!(next & (IS_MATCH | IS_SUFFIX))) { + // No complete match yet; keep going. + state = t_next(&ps, next); + continue; + } + + // At this point, one or more patterns have matched. + // Find all matches by following the backref chain. + // A valid node for (sym) with no SUFFIX flag marks the + // end of the suffix chain. + // In the same backref traversal, find a new (state), + // if the original transition is to a leaf. + + STATE s = state; + + // Initially state is ROOT. The chain search saves the + // first state from which the next char has a transition. + state = t_isleaf(&ps, next) ? 0 : t_next(&ps, next); + + while (1) { + + if (t_valid(&ps, next)) { + + if (next & IS_MATCH) { + unsigned strno, ss = s + sym, i; + if (t_isleaf(&ps, ps.tranv[ss])) { + strno = t_strno(&ps, ps.tranv[ss]); + } else { + for (i = p_hash(&ps, ss); ps.hashv[i].state != ss; ++i); + strno = ps.hashv[i].strno; + } + + if ((ret = cb(strno, cp - text, context))) + goto EXIT; + } + + if (!state && !t_isleaf(&ps, next)) + state = t_next(&ps, next); + if ( state && !(next & IS_SUFFIX)) + break; + } + + if (s == ROOT) + break; + + TRAN b = p_tran(&ps, s, BACK); + s = t_valid(&ps, b) ? t_next(&ps, b) : ROOT; + next = p_tran(&ps, s, sym); + } + } +EXIT: + *statep = state; + return ret; +} + +void +acism_destroy(ac_trie_t *psp) +{ + if (!psp) return; + if (psp->flags & IS_MMAP) + munmap((char*)psp->tranv - sizeof(ac_trie_t), + sizeof(ac_trie_t) + p_size(psp)); + else free(psp->tranv); + free(psp); +} +//EOF diff --git a/contrib/aho-corasick/acism.h b/contrib/aho-corasick/acism.h new file mode 100644 index 000000000..3886b149e --- /dev/null +++ b/contrib/aho-corasick/acism.h @@ -0,0 +1,51 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** Copyright (C) 2015 Vsevolod Stakhov +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ + +#ifndef ACISM_H +#define ACISM_H + +#include "config.h" +// "acism" uses MEMREF {ptr,len} bytevec structs for "string" args, +// rather than NUL-terminated "C" strings. + +typedef struct { char const *ptr; size_t len; } ac_trie_pat_t; + +typedef struct acism ac_trie_t; + +ac_trie_t* acism_create(ac_trie_pat_t const *strv, int nstrs); +void acism_destroy(ac_trie_t*); + +// For each match, acism_scan calls its ACISM_ACTION fn, +// giving it the strv[] index of the matched string, +// and the text[] offset of the byte PAST the end of the string. +// If ACISM_ACTION returns 0, search continues; otherwise, +// acism_more returns that nonzero value immediately. + +typedef int (ACISM_ACTION)(int strnum, int textpos, void *context); + +// If sequential blocks of (text) are passed to repeated acism_more calls, +// then search continues where the previous acism_more left off -- +// string matches can cross block boundaries. +// *state should initially be (0). + +int acism_lookup(ac_trie_t const *psp, const char *text, size_t len, + ACISM_ACTION *cb, void *context, int *statep); + +#endif//ACISM_H diff --git a/contrib/aho-corasick/acism_create.c b/contrib/aho-corasick/acism_create.c new file mode 100644 index 000000000..b1d9b51e3 --- /dev/null +++ b/contrib/aho-corasick/acism_create.c @@ -0,0 +1,396 @@ +/* +** Copyright (C) 2009-2014 Mischa Sandberg +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License Version as +** published by the Free Software Foundation. You may not use, modify or +** distribute this program under any other version of the GNU Lesser General +** Public License. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +*/ +#include "_acism.h" + +typedef struct tnode { + struct tnode *child, *next, *back; + union { unsigned nrefs; STATE state; } x; + STRNO match; + SYMBOL sym, is_suffix; +} TNODE; + +//--------------|--------------------------------------------- +// bitwid: 1+floor(log2(u)) +static inline int bitwid(unsigned u) +{ + int ret = !!u; + if (u & 0xFFFF0000) u >>= 16, ret += 16; + if (u & 0x0000FF00) u >>= 8, ret += 8; + if (u & 0x000000F0) u >>= 4, ret += 4; + if (u & 0x0000000C) u >>= 2, ret += 2; + if (u & 0x00000002) ret++; + return ret; +} +static void fill_symv(ac_trie_t*, ac_trie_pat_t const*, int ns); +static int create_tree(TNODE*, SYMBOL const*symv, ac_trie_pat_t const*strv, int nstrs); +static void add_backlinks(TNODE*, TNODE**, TNODE**); +static void prune_backlinks(TNODE*); +static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**); +static void fill_tranv(ac_trie_t*, TNODE const*); +static void fill_hashv(ac_trie_t*, TNODE const*, int nn); + +static TNODE* find_child(TNODE*, SYMBOL); + +// (ns) is either a STATE, or a (STRNO + tran_size) +static inline void +set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns) +{ + psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0) + | (suffix ? IS_SUFFIX : 0) + | (ns << SYM_BITS); +} + +// Track statistics for construction +#ifdef ACISM_STATS +typedef struct { long long val; const char *name; } PSSTAT; +extern PSSTAT psstat[]; +# define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n}) +# define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id) +#else +# define NOTE(n) (void)0 +# define HIT(id) (void)0 +#endif //ACISM_STATS + +//--------------|--------------------------------------------- +ac_trie_t* +acism_create(ac_trie_pat_t const* strv, int nstrs) +{ + TNODE *tp, **v1 = NULL, **v2 = NULL; + ac_trie_t *psp = calloc(1, sizeof*psp); + + fill_symv(psp, strv, nstrs); + TNODE *troot = calloc(psp->nchars + 1, sizeof*troot); + + int nnodes = create_tree(troot, psp->symv, strv, nstrs); + NOTE(nnodes); + + // v1, v2: breadth-first work vectors for add_backlink and interleave. + int nhash, i = (nstrs + 1) * sizeof*tp; + add_backlinks(troot, v1 = malloc(i), v2 = malloc(i)); + + for (tp = troot + nnodes, nhash = 0; --tp > troot;) { + prune_backlinks(tp); + nhash += tp->match && tp->child; + } + + // Calculate each node's offset in tranv[]: + psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2); + if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2) + goto FAIL; + + if (nhash) { + // Hash table is for match info of non-leaf nodes (only). + // Set hash_size for p_size(psp): + psp->hash_mod = nhash * 5 / 4 + 1; + // Initially oversize the table for overflows without wraparound. + psp->hash_size = psp->hash_mod + nhash; + } + + set_tranv(psp, calloc(p_size(psp), 1)); + if (!psp->tranv) goto FAIL; + fill_tranv(psp, troot); + // The root state (0) must not look like a valid backref. + // Any symbol value other than (0) in tranv[0] ensures that. + psp->tranv[0] = 1; + + if (nhash) { + fill_hashv(psp, troot, nnodes); + // Adjust hash_size to include trailing overflows + // but trim trailing empty slots. + psp->hash_size = psp->hash_mod; + while ( psp->hashv[psp->hash_size].state) ++psp->hash_size; + while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size; + set_tranv(psp, realloc(psp->tranv, p_size(psp))); + } + + // Diagnostics/statistics only: + psp->nstrs = nstrs; + for (i = psp->maxlen = 0; i < nstrs; ++i) + if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len; + + goto DONE; +FAIL: acism_destroy(psp), psp = NULL; +DONE: free(troot), free(v1), free(v2); + return psp; +} + +typedef struct { int freq, rank; } FRANK; +static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; } + +static void +fill_symv(ac_trie_t *psp, ac_trie_pat_t const *strv, int nstrs) +{ + int i, j; + FRANK frv[256]; + + for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i}; + for (i = 0; i < nstrs; ++i) + for (psp->nchars += j = strv[i].len; --j >= 0;) + frv[(uint8_t)strv[i].ptr[j]].freq++; + + qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp); + + for (i = 256; --i >= 0 && frv[i].freq;) + psp->symv[frv[i].rank] = ++psp->nsyms; + ++psp->nsyms; +#if ACISM_SIZE < 8 + psp->sym_bits = bitwid(psp->nsyms); + psp->sym_mask = ~(-1 << psp->sym_bits); +#endif +} + +static int +create_tree(TNODE *Tree, SYMBOL const *symv, ac_trie_pat_t const *strv, int nstrs) +{ + int i, j; + TNODE *nextp = Tree + 1; + + for (i = 0; i < nstrs; ++i) { + TNODE *tp = Tree; + + for (j = 0; tp->child && j < (int)strv[i].len; ++j) { + SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]]; + + if (sym < tp->child->sym) { + // Prep to insert new node before tp->child + nextp->next = tp->child; + break; + } + + tp = tp->child; + while (tp->next && sym >= tp->next->sym) + tp = tp->next; + + // Insert new sibling after tp + if (sym > tp->sym) { + nextp->next = tp->next; + tp = tp->next = nextp++; + tp->sym = sym; + tp->back = Tree; + } + } + + for (; j < (int) strv[i].len; ++j) { + tp = tp->child = nextp++; + tp->sym = symv[(uint8_t)strv[i].ptr[j]]; + tp->back = Tree; + } + + tp->match = i + 1; // Encode strno as nonzero + } + + return nextp - Tree; +} + +static void +add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2) +{ + TNODE *tp, **tmp; + + for (tp = troot->child, tmp = v1; tp; tp = tp->next) + *tmp++ = tp; + *tmp = NULL; + + while (*v1) { + TNODE **spp = v1, **dpp = v2, *srcp, *dstp; + while ((srcp = *spp++)) { + for (dstp = srcp->child; dstp; dstp = dstp->next) { + TNODE *bp = NULL; + *dpp++ = dstp; + for (tp = srcp->back; tp; tp = tp->back) + if ((bp = find_child(tp, dstp->sym))) break; + if (!bp) + bp = troot; + + dstp->back = dstp->child ? bp : tp ? tp : troot; + dstp->back->x.nrefs++; + dstp->is_suffix = bp->match || bp->is_suffix; + } + } + *dpp = 0; + tmp = v1; v1 = v2; v2 = tmp; + } +} + +static void +prune_backlinks(TNODE *tp) +{ + if (tp->x.nrefs || !tp->child) + return; + + TNODE *bp; + // (bp != bp->back IFF bp != troot) + while ((bp = tp->back) && !bp->match && bp != bp->back) { + HIT("backlinks"); + TNODE *cp = tp->child, *pp = bp->child; + + // Search for a child of bp that's not a child of tp + for (; cp && pp && pp->sym >= cp->sym; cp = cp->next) { + if (pp->sym == cp->sym) { + if (pp->match && cp->is_suffix) break; + pp = pp->next; + } + } + + if (pp) break; + + // So, target of back link is not a suffix match + // of this node, and its children are a subset + // of this node's children: prune it. + HIT("pruned"); + if ((tp->back = bp->back)) { + tp->back->x.nrefs++; + if (!--bp->x.nrefs) + prune_backlinks(bp); + } + } +} + +static int +interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2) +{ + unsigned usev_size = nnodes + nsyms; + char *usev = calloc(usev_size, sizeof*usev); + STATE last_trans = 0, startv[nsyms][2]; + TNODE *cp, **tmp; + + memset(startv, 0, nsyms * sizeof*startv); + + // Iterate through one level of the Tree at a time. + // That srsly improves locality (L1-cache use). + + v1[0] = troot, v1[1] = NULL; + for (; *v1; tmp = v1, v1 = v2, v2 = tmp) { + TNODE **srcp = v1, **dstp = v2, *tp; + while ((tp = *srcp++)) { + if (!tp->child) continue; + + HIT("nonleaf"); + if (tp->back == troot) tp->back = NULL; // simplify tests. + cp = tp->child; + + STATE pos, *startp = &startv[cp->sym][!!tp->back]; + while ((cp = cp->next)) { + STATE *newp = &startv[cp->sym][!!tp->back]; + if (*startp < *newp) startp = newp; + } + + // If (tp) has a backref, we need a slot at offset 0 + // that is free as a base AND to be used (filled in). + char need = tp->back ? BASE|USED : BASE; + for (pos = *startp;; ++pos) { + if (usev[pos] & need) { + HIT("inner loop"); + continue; + } + + for (cp = tp->child; cp; cp = cp->next) { + HIT("child loop"); + if (usev[pos + cp->sym] & USED) break; + } + + // No child needs an in-use slot? We're done. + if (!cp) break; + } + tp->x.state = pos; + + // Mark node's base and children as used: + usev[pos] |= need; + STATE last = 0; // Make compiler happy + int nkids = 0; + for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids) + usev[last = pos + cp->sym] |= USED; + + // This is a HEURISTIC for advancing search for other nodes + *startp += (pos - *startp) / nkids; + + if (last_trans < last) { + last_trans = last; + if (last + nsyms >= usev_size) { + usev = realloc(usev, usev_size << 1); + memset(usev + usev_size, 0, usev_size); + usev_size <<= 1; + } + } + } + + *dstp = NULL; + } + + free(usev); + return last_trans + 1; +} + +static void +fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes) +{ + STRASH *sv = malloc(psp->hash_mod * sizeof*sv), *sp = sv; + int i; + + // First pass: insert without resolving collisions. + for (i = 0; i < nnodes; ++i) { + STATE base = treev[i].x.state; + TNODE const *tp; + for (tp = treev[i].child; tp; tp = tp->next) { + if (tp->match && tp->child) { + STATE state = base + tp->sym; + STRASH *hp = &psp->hashv[p_hash(psp, state)]; + *(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1}; + } + } + } + + while (--sp >= sv) { + HIT("hash collisions"); + for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i) + HIT("hash displacements"); + psp->hashv[i] = *sp; + } + + free(sv); +} + +static void +fill_tranv(ac_trie_t *psp, TNODE const*tp) +{ + TNODE const *cp = tp->child; + + if (cp && tp->back) + set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state); + + for (; cp; cp = cp->next) { + //NOTE: cp->match is (strno+1) so that !cp->match means "no match". + set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix, + cp->child ? cp->x.state : cp->match - 1 + psp->tran_size); + if (cp->child) + fill_tranv(psp, cp); + } +} + +static TNODE * +find_child(TNODE *tp, SYMBOL sym) +{ + for (tp = tp->child; tp && tp->sym < sym; tp = tp->next); + return tp && tp->sym == sym ? tp : NULL; +} + +#ifdef ACISM_STATS +PSSTAT psstat[__LINE__] = {{__LINE__,0}}; +#endif//ACISM_STATS +//EOF -- cgit v1.2.3