summaryrefslogtreecommitdiffstats
path: root/contrib/aho-corasick
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-04-06 18:04:58 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-04-06 18:04:58 +0100
commit8eca74d2778a0b272778501e908128386c5dd8f0 (patch)
tree626480c50fd9b6c55933a1e242728324aa546e25 /contrib/aho-corasick
parentd6724d926dcbc2fd204f989c1b71b6a0cc52e02a (diff)
downloadrspamd-8eca74d2778a0b272778501e908128386c5dd8f0.tar.gz
rspamd-8eca74d2778a0b272778501e908128386c5dd8f0.zip
Corasic -> Corasick.
Diffstat (limited to 'contrib/aho-corasick')
-rw-r--r--contrib/aho-corasick/CMakeLists.txt8
-rw-r--r--contrib/aho-corasick/LICENSE165
-rw-r--r--contrib/aho-corasick/README.md67
-rw-r--r--contrib/aho-corasick/_acism.h105
-rw-r--r--contrib/aho-corasick/acism.c119
-rw-r--r--contrib/aho-corasick/acism.h51
-rw-r--r--contrib/aho-corasick/acism_create.c396
7 files changed, 911 insertions, 0 deletions
diff --git a/contrib/aho-corasick/CMakeLists.txt b/contrib/aho-corasick/CMakeLists.txt
new file mode 100644
index 000000000..2240f9b64
--- /dev/null
+++ b/contrib/aho-corasick/CMakeLists.txt
@@ -0,0 +1,8 @@
+SET(AHOCORASICSRC acism_create.c
+ acism.c)
+if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU")
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3")
+endif ()
+ADD_LIBRARY(rspamd-actrie SHARED ${AHOCORASICSRC})
+INSTALL(TARGETS rspamd-actrie
+ LIBRARY DESTINATION lib) \ No newline at end of file
diff --git a/contrib/aho-corasick/LICENSE b/contrib/aho-corasick/LICENSE
new file mode 100644
index 000000000..65c5ca88a
--- /dev/null
+++ b/contrib/aho-corasick/LICENSE
@@ -0,0 +1,165 @@
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/contrib/aho-corasick/README.md b/contrib/aho-corasick/README.md
new file mode 100644
index 000000000..c245d6fc9
--- /dev/null
+++ b/contrib/aho-corasick/README.md
@@ -0,0 +1,67 @@
+aho-corasick
+==
+
+Aho-Corasick parallel string search, using interleaved arrays.
+
+Mischa Sandberg mischasan@gmail.com
+
+ACISM is an implementation of Aho-Corasick parallel string search,
+using an Interleaved State-transition Matrix.
+It combines the fastest possible Aho-Corasick implementation,
+with the smallest possible data structure (!).
+
+FEATURES
+--------
+
+* Fast. No hashing, no tree traversal; just a straight look-up equivalent to
+ matrix[state, input-byte] per input character.
+
+* Tiny. On average, the whole data structure (mostly the array) takes about 2-3 bytes per
+ input pattern byte. The original set of pattern strings can be reverse-generated from the machine.
+
+* Shareable. The state machine contains no pointers, so it can be compiled once,
+ then memory-mapped by many processes.
+
+* Searches byte vectors, not null-terminated strings.
+ Suitable for searching machine code as much as searching text.
+
+* DOS-proof. Well, that's an attribute of Aho-Corasick,
+ so no real points for that.
+
+* Stream-ready. The state can be saved between calls to search data.
+
+DOCUMENTATION
+-------------
+
+The GoogleDocs description is at http://goo.gl/lE6zG
+I originally called it "psearch", but found that name was overused by other authors.
+
+LICENSE
+-------
+
+Though I've had strong suggestions to go with BSD license, I'm going with GPL2 until I figure out
+how to keep in touch with people who download and use the code. Hence the "CONTACT ME IF..." line in the license.
+
+GETTING STARTED
+---------------
+
+Download the source, type "gmake".
+"gmake install" exports lib/libacism.a, include/acism.h and bin/acism_x.
+"acism_x.c" is a good example of calling acism_create and acism_scan/acism_more.
+
+(If you're interested in the GNUmakefile and rules.mk,
+ check my blog posts on non-recursive make, at mischasan.wordpress.com.)
+
+HISTORY
+-------
+
+The interleaved-array approach was tried and discarded in the late 70's, because the compile time was O(n^2).
+acism_create beats the problem with a "hint" array that tracks the restart points for searches.
+That, plus discarding the original idea of how to get maximal density, resulted in the tiny-fast win-win.
+
+ACKNOWLEDGEMENTS
+----------------
+
+I'd like to thank Mike Shannon, who wanted to see a machine built to make best use of L1/L2 cache.
+The change to do that doubled performance on hardware with a much larger cache than the matrix.
+Go figure.
diff --git a/contrib/aho-corasick/_acism.h b/contrib/aho-corasick/_acism.h
new file mode 100644
index 000000000..b994bf073
--- /dev/null
+++ b/contrib/aho-corasick/_acism.h
@@ -0,0 +1,105 @@
+/*
+** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU Lesser General Public License Version as
+** published by the Free Software Foundation. You may not use, modify or
+** distribute this program under any other version of the GNU Lesser General
+** Public License.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Lesser General Public License for more details.
+**
+** You should have received a copy of the GNU Lesser General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+#ifndef _ACISM_H
+#define _ACISM_H
+
+#include <stdint.h>
+#include <stdlib.h> // malloc
+#include <string.h> // memcpy
+
+typedef int (*qsort_cmp)(const void *, const void *);
+
+#ifndef ACISM_SIZE
+# define ACISM_SIZE 4
+#endif
+
+#if ACISM_SIZE == 8
+ typedef uint64_t TRAN, STATE, STRNO;
+# define SYM_BITS 9U
+# define SYM_MASK 511
+#else
+ typedef uint32_t TRAN, STATE, STRNO;
+# define SYM_BITS psp->sym_bits
+# define SYM_MASK psp->sym_mask
+#endif
+
+typedef uint16_t SYMBOL;
+typedef unsigned _SYMBOL; // An efficient stacklocal SYMBOL
+
+enum {
+ IS_MATCH = (TRAN)1 << (8*sizeof(TRAN) - 1),
+ IS_SUFFIX = (TRAN)1 << (8*sizeof(TRAN) - 2),
+ T_FLAGS = IS_MATCH | IS_SUFFIX
+};
+
+typedef struct { STATE state; STRNO strno; } STRASH;
+
+typedef enum { BASE=2, USED=1 } USES;
+
+struct acism {
+ TRAN* tranv;
+ STRASH* hashv;
+ unsigned flags;
+# define IS_MMAP 1
+
+#if ACISM_SIZE < 8
+ TRAN sym_mask;
+ unsigned sym_bits;
+#endif
+ unsigned hash_mod; // search hashv starting at (state + sym) % hash_mod.
+ unsigned hash_size; // #(hashv): hash_mod plus the overflows past [hash_mod-1]
+ unsigned tran_size; // #(tranv)
+ unsigned nsyms, nchars, nstrs, maxlen;
+ SYMBOL symv[256];
+};
+
+#include "acism.h"
+
+// p_size: size of tranv + hashv
+static inline unsigned p_size(ac_trie_t const *psp)
+{ return psp->hash_size * sizeof*psp->hashv
+ + psp->tran_size * sizeof*psp->tranv; }
+
+static inline unsigned p_hash(ac_trie_t const *psp, STATE s)
+ { return s * 107 % psp->hash_mod; }
+
+static inline void set_tranv(ac_trie_t *psp, void *mem)
+ { psp->hashv = (STRASH*)&(psp->tranv = mem)[psp->tran_size]; }
+
+// TRAN accessors. For ACISM_SIZE=8, SYM_{BITS,MASK} do not use psp.
+
+static inline TRAN p_tran(ac_trie_t const *psp, STATE s, _SYMBOL sym)
+ { return psp->tranv[s + sym] ^ sym; }
+
+static inline _SYMBOL t_sym(ac_trie_t const *psp, TRAN t)
+ { (void)psp; return t & SYM_MASK; }
+
+static inline STATE t_next(ac_trie_t const *psp, TRAN t)
+ { (void)psp; return (t & ~T_FLAGS) >> SYM_BITS; }
+
+static inline int t_isleaf(ac_trie_t const *psp, TRAN t)
+ { return t_next(psp, t) >= psp->tran_size; }
+
+static inline int t_strno(ac_trie_t const *psp, TRAN t)
+ { return t_next(psp, t) - psp->tran_size; }
+
+static inline _SYMBOL t_valid(ac_trie_t const *psp, TRAN t)
+ { return !t_sym(psp, t); }
+
+#endif//_ACISM_H
diff --git a/contrib/aho-corasick/acism.c b/contrib/aho-corasick/acism.c
new file mode 100644
index 000000000..a4c678154
--- /dev/null
+++ b/contrib/aho-corasick/acism.c
@@ -0,0 +1,119 @@
+/*
+** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU Lesser General Public License Version as
+** published by the Free Software Foundation. You may not use, modify or
+** distribute this program under any other version of the GNU Lesser General
+** Public License.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Lesser General Public License for more details.
+**
+** You should have received a copy of the GNU Lesser General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#include "_acism.h"
+
+#define BACK ((SYMBOL)0)
+#define ROOT ((STATE) 0)
+
+int
+acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
+ ACISM_ACTION *cb, void *context, int *statep)
+{
+ ac_trie_t const ps = *psp;
+ char const *cp = text, *endp = cp + len;
+ STATE state = *statep;
+ int ret = 0;
+
+ while (cp < endp) {
+ _SYMBOL sym = ps.symv[(uint8_t)*cp++];
+ if (!sym) {
+ // Input byte is not in any pattern string.
+ state = ROOT;
+ continue;
+ }
+
+ // Search for a valid transition from this (state, sym),
+ // following the backref chain.
+
+ TRAN next;
+ while (!t_valid(&ps, next = p_tran(&ps, state, sym)) && state != ROOT) {
+ TRAN back = p_tran(&ps, state, BACK);
+ state = t_valid(&ps, back) ? t_next(&ps, back) : ROOT;
+ }
+
+ if (!t_valid(&ps, next))
+ continue;
+
+ if (!(next & (IS_MATCH | IS_SUFFIX))) {
+ // No complete match yet; keep going.
+ state = t_next(&ps, next);
+ continue;
+ }
+
+ // At this point, one or more patterns have matched.
+ // Find all matches by following the backref chain.
+ // A valid node for (sym) with no SUFFIX flag marks the
+ // end of the suffix chain.
+ // In the same backref traversal, find a new (state),
+ // if the original transition is to a leaf.
+
+ STATE s = state;
+
+ // Initially state is ROOT. The chain search saves the
+ // first state from which the next char has a transition.
+ state = t_isleaf(&ps, next) ? 0 : t_next(&ps, next);
+
+ while (1) {
+
+ if (t_valid(&ps, next)) {
+
+ if (next & IS_MATCH) {
+ unsigned strno, ss = s + sym, i;
+ if (t_isleaf(&ps, ps.tranv[ss])) {
+ strno = t_strno(&ps, ps.tranv[ss]);
+ } else {
+ for (i = p_hash(&ps, ss); ps.hashv[i].state != ss; ++i);
+ strno = ps.hashv[i].strno;
+ }
+
+ if ((ret = cb(strno, cp - text, context)))
+ goto EXIT;
+ }
+
+ if (!state && !t_isleaf(&ps, next))
+ state = t_next(&ps, next);
+ if ( state && !(next & IS_SUFFIX))
+ break;
+ }
+
+ if (s == ROOT)
+ break;
+
+ TRAN b = p_tran(&ps, s, BACK);
+ s = t_valid(&ps, b) ? t_next(&ps, b) : ROOT;
+ next = p_tran(&ps, s, sym);
+ }
+ }
+EXIT:
+ *statep = state;
+ return ret;
+}
+
+void
+acism_destroy(ac_trie_t *psp)
+{
+ if (!psp) return;
+ if (psp->flags & IS_MMAP)
+ munmap((char*)psp->tranv - sizeof(ac_trie_t),
+ sizeof(ac_trie_t) + p_size(psp));
+ else free(psp->tranv);
+ free(psp);
+}
+//EOF
diff --git a/contrib/aho-corasick/acism.h b/contrib/aho-corasick/acism.h
new file mode 100644
index 000000000..3886b149e
--- /dev/null
+++ b/contrib/aho-corasick/acism.h
@@ -0,0 +1,51 @@
+/*
+** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
+** Copyright (C) 2015 Vsevolod Stakhov <vsevolod@highsecure.ru>
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU Lesser General Public License as
+** published by the Free Software Foundation. You may not use, modify or
+** distribute this program under any other version of the GNU Lesser General
+** Public License.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Lesser General Public License for more details.
+**
+** You should have received a copy of the GNU Lesser General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+
+#ifndef ACISM_H
+#define ACISM_H
+
+#include "config.h"
+// "acism" uses MEMREF {ptr,len} bytevec structs for "string" args,
+// rather than NUL-terminated "C" strings.
+
+typedef struct { char const *ptr; size_t len; } ac_trie_pat_t;
+
+typedef struct acism ac_trie_t;
+
+ac_trie_t* acism_create(ac_trie_pat_t const *strv, int nstrs);
+void acism_destroy(ac_trie_t*);
+
+// For each match, acism_scan calls its ACISM_ACTION fn,
+// giving it the strv[] index of the matched string,
+// and the text[] offset of the byte PAST the end of the string.
+// If ACISM_ACTION returns 0, search continues; otherwise,
+// acism_more returns that nonzero value immediately.
+
+typedef int (ACISM_ACTION)(int strnum, int textpos, void *context);
+
+// If sequential blocks of (text) are passed to repeated acism_more calls,
+// then search continues where the previous acism_more left off --
+// string matches can cross block boundaries.
+// *state should initially be (0).
+
+int acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
+ ACISM_ACTION *cb, void *context, int *statep);
+
+#endif//ACISM_H
diff --git a/contrib/aho-corasick/acism_create.c b/contrib/aho-corasick/acism_create.c
new file mode 100644
index 000000000..b1d9b51e3
--- /dev/null
+++ b/contrib/aho-corasick/acism_create.c
@@ -0,0 +1,396 @@
+/*
+** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU Lesser General Public License Version as
+** published by the Free Software Foundation. You may not use, modify or
+** distribute this program under any other version of the GNU Lesser General
+** Public License.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Lesser General Public License for more details.
+**
+** You should have received a copy of the GNU Lesser General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
+#include "_acism.h"
+
+typedef struct tnode {
+ struct tnode *child, *next, *back;
+ union { unsigned nrefs; STATE state; } x;
+ STRNO match;
+ SYMBOL sym, is_suffix;
+} TNODE;
+
+//--------------|---------------------------------------------
+// bitwid: 1+floor(log2(u))
+static inline int bitwid(unsigned u)
+{
+ int ret = !!u;
+ if (u & 0xFFFF0000) u >>= 16, ret += 16;
+ if (u & 0x0000FF00) u >>= 8, ret += 8;
+ if (u & 0x000000F0) u >>= 4, ret += 4;
+ if (u & 0x0000000C) u >>= 2, ret += 2;
+ if (u & 0x00000002) ret++;
+ return ret;
+}
+static void fill_symv(ac_trie_t*, ac_trie_pat_t const*, int ns);
+static int create_tree(TNODE*, SYMBOL const*symv, ac_trie_pat_t const*strv, int nstrs);
+static void add_backlinks(TNODE*, TNODE**, TNODE**);
+static void prune_backlinks(TNODE*);
+static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**);
+static void fill_tranv(ac_trie_t*, TNODE const*);
+static void fill_hashv(ac_trie_t*, TNODE const*, int nn);
+
+static TNODE* find_child(TNODE*, SYMBOL);
+
+// (ns) is either a STATE, or a (STRNO + tran_size)
+static inline void
+set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns)
+{
+ psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0)
+ | (suffix ? IS_SUFFIX : 0)
+ | (ns << SYM_BITS);
+}
+
+// Track statistics for construction
+#ifdef ACISM_STATS
+typedef struct { long long val; const char *name; } PSSTAT;
+extern PSSTAT psstat[];
+# define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n})
+# define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id)
+#else
+# define NOTE(n) (void)0
+# define HIT(id) (void)0
+#endif //ACISM_STATS
+
+//--------------|---------------------------------------------
+ac_trie_t*
+acism_create(ac_trie_pat_t const* strv, int nstrs)
+{
+ TNODE *tp, **v1 = NULL, **v2 = NULL;
+ ac_trie_t *psp = calloc(1, sizeof*psp);
+
+ fill_symv(psp, strv, nstrs);
+ TNODE *troot = calloc(psp->nchars + 1, sizeof*troot);
+
+ int nnodes = create_tree(troot, psp->symv, strv, nstrs);
+ NOTE(nnodes);
+
+ // v1, v2: breadth-first work vectors for add_backlink and interleave.
+ int nhash, i = (nstrs + 1) * sizeof*tp;
+ add_backlinks(troot, v1 = malloc(i), v2 = malloc(i));
+
+ for (tp = troot + nnodes, nhash = 0; --tp > troot;) {
+ prune_backlinks(tp);
+ nhash += tp->match && tp->child;
+ }
+
+ // Calculate each node's offset in tranv[]:
+ psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2);
+ if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2)
+ goto FAIL;
+
+ if (nhash) {
+ // Hash table is for match info of non-leaf nodes (only).
+ // Set hash_size for p_size(psp):
+ psp->hash_mod = nhash * 5 / 4 + 1;
+ // Initially oversize the table for overflows without wraparound.
+ psp->hash_size = psp->hash_mod + nhash;
+ }
+
+ set_tranv(psp, calloc(p_size(psp), 1));
+ if (!psp->tranv) goto FAIL;
+ fill_tranv(psp, troot);
+ // The root state (0) must not look like a valid backref.
+ // Any symbol value other than (0) in tranv[0] ensures that.
+ psp->tranv[0] = 1;
+
+ if (nhash) {
+ fill_hashv(psp, troot, nnodes);
+ // Adjust hash_size to include trailing overflows
+ // but trim trailing empty slots.
+ psp->hash_size = psp->hash_mod;
+ while ( psp->hashv[psp->hash_size].state) ++psp->hash_size;
+ while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size;
+ set_tranv(psp, realloc(psp->tranv, p_size(psp)));
+ }
+
+ // Diagnostics/statistics only:
+ psp->nstrs = nstrs;
+ for (i = psp->maxlen = 0; i < nstrs; ++i)
+ if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len;
+
+ goto DONE;
+FAIL: acism_destroy(psp), psp = NULL;
+DONE: free(troot), free(v1), free(v2);
+ return psp;
+}
+
+typedef struct { int freq, rank; } FRANK;
+static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; }
+
+static void
+fill_symv(ac_trie_t *psp, ac_trie_pat_t const *strv, int nstrs)
+{
+ int i, j;
+ FRANK frv[256];
+
+ for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i};
+ for (i = 0; i < nstrs; ++i)
+ for (psp->nchars += j = strv[i].len; --j >= 0;)
+ frv[(uint8_t)strv[i].ptr[j]].freq++;
+
+ qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp);
+
+ for (i = 256; --i >= 0 && frv[i].freq;)
+ psp->symv[frv[i].rank] = ++psp->nsyms;
+ ++psp->nsyms;
+#if ACISM_SIZE < 8
+ psp->sym_bits = bitwid(psp->nsyms);
+ psp->sym_mask = ~(-1 << psp->sym_bits);
+#endif
+}
+
+static int
+create_tree(TNODE *Tree, SYMBOL const *symv, ac_trie_pat_t const *strv, int nstrs)
+{
+ int i, j;
+ TNODE *nextp = Tree + 1;
+
+ for (i = 0; i < nstrs; ++i) {
+ TNODE *tp = Tree;
+
+ for (j = 0; tp->child && j < (int)strv[i].len; ++j) {
+ SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]];
+
+ if (sym < tp->child->sym) {
+ // Prep to insert new node before tp->child
+ nextp->next = tp->child;
+ break;
+ }
+
+ tp = tp->child;
+ while (tp->next && sym >= tp->next->sym)
+ tp = tp->next;
+
+ // Insert new sibling after tp
+ if (sym > tp->sym) {
+ nextp->next = tp->next;
+ tp = tp->next = nextp++;
+ tp->sym = sym;
+ tp->back = Tree;
+ }
+ }
+
+ for (; j < (int) strv[i].len; ++j) {
+ tp = tp->child = nextp++;
+ tp->sym = symv[(uint8_t)strv[i].ptr[j]];
+ tp->back = Tree;
+ }
+
+ tp->match = i + 1; // Encode strno as nonzero
+ }
+
+ return nextp - Tree;
+}
+
+static void
+add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2)
+{
+ TNODE *tp, **tmp;
+
+ for (tp = troot->child, tmp = v1; tp; tp = tp->next)
+ *tmp++ = tp;
+ *tmp = NULL;
+
+ while (*v1) {
+ TNODE **spp = v1, **dpp = v2, *srcp, *dstp;
+ while ((srcp = *spp++)) {
+ for (dstp = srcp->child; dstp; dstp = dstp->next) {
+ TNODE *bp = NULL;
+ *dpp++ = dstp;
+ for (tp = srcp->back; tp; tp = tp->back)
+ if ((bp = find_child(tp, dstp->sym))) break;
+ if (!bp)
+ bp = troot;
+
+ dstp->back = dstp->child ? bp : tp ? tp : troot;
+ dstp->back->x.nrefs++;
+ dstp->is_suffix = bp->match || bp->is_suffix;
+ }
+ }
+ *dpp = 0;
+ tmp = v1; v1 = v2; v2 = tmp;
+ }
+}
+
+static void
+prune_backlinks(TNODE *tp)
+{
+ if (tp->x.nrefs || !tp->child)
+ return;
+
+ TNODE *bp;
+ // (bp != bp->back IFF bp != troot)
+ while ((bp = tp->back) && !bp->match && bp != bp->back) {
+ HIT("backlinks");
+ TNODE *cp = tp->child, *pp = bp->child;
+
+ // Search for a child of bp that's not a child of tp
+ for (; cp && pp && pp->sym >= cp->sym; cp = cp->next) {
+ if (pp->sym == cp->sym) {
+ if (pp->match && cp->is_suffix) break;
+ pp = pp->next;
+ }
+ }
+
+ if (pp) break;
+
+ // So, target of back link is not a suffix match
+ // of this node, and its children are a subset
+ // of this node's children: prune it.
+ HIT("pruned");
+ if ((tp->back = bp->back)) {
+ tp->back->x.nrefs++;
+ if (!--bp->x.nrefs)
+ prune_backlinks(bp);
+ }
+ }
+}
+
+static int
+interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2)
+{
+ unsigned usev_size = nnodes + nsyms;
+ char *usev = calloc(usev_size, sizeof*usev);
+ STATE last_trans = 0, startv[nsyms][2];
+ TNODE *cp, **tmp;
+
+ memset(startv, 0, nsyms * sizeof*startv);
+
+ // Iterate through one level of the Tree at a time.
+ // That srsly improves locality (L1-cache use).
+
+ v1[0] = troot, v1[1] = NULL;
+ for (; *v1; tmp = v1, v1 = v2, v2 = tmp) {
+ TNODE **srcp = v1, **dstp = v2, *tp;
+ while ((tp = *srcp++)) {
+ if (!tp->child) continue;
+
+ HIT("nonleaf");
+ if (tp->back == troot) tp->back = NULL; // simplify tests.
+ cp = tp->child;
+
+ STATE pos, *startp = &startv[cp->sym][!!tp->back];
+ while ((cp = cp->next)) {
+ STATE *newp = &startv[cp->sym][!!tp->back];
+ if (*startp < *newp) startp = newp;
+ }
+
+ // If (tp) has a backref, we need a slot at offset 0
+ // that is free as a base AND to be used (filled in).
+ char need = tp->back ? BASE|USED : BASE;
+ for (pos = *startp;; ++pos) {
+ if (usev[pos] & need) {
+ HIT("inner loop");
+ continue;
+ }
+
+ for (cp = tp->child; cp; cp = cp->next) {
+ HIT("child loop");
+ if (usev[pos + cp->sym] & USED) break;
+ }
+
+ // No child needs an in-use slot? We're done.
+ if (!cp) break;
+ }
+ tp->x.state = pos;
+
+ // Mark node's base and children as used:
+ usev[pos] |= need;
+ STATE last = 0; // Make compiler happy
+ int nkids = 0;
+ for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids)
+ usev[last = pos + cp->sym] |= USED;
+
+ // This is a HEURISTIC for advancing search for other nodes
+ *startp += (pos - *startp) / nkids;
+
+ if (last_trans < last) {
+ last_trans = last;
+ if (last + nsyms >= usev_size) {
+ usev = realloc(usev, usev_size << 1);
+ memset(usev + usev_size, 0, usev_size);
+ usev_size <<= 1;
+ }
+ }
+ }
+
+ *dstp = NULL;
+ }
+
+ free(usev);
+ return last_trans + 1;
+}
+
+static void
+fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes)
+{
+ STRASH *sv = malloc(psp->hash_mod * sizeof*sv), *sp = sv;
+ int i;
+
+ // First pass: insert without resolving collisions.
+ for (i = 0; i < nnodes; ++i) {
+ STATE base = treev[i].x.state;
+ TNODE const *tp;
+ for (tp = treev[i].child; tp; tp = tp->next) {
+ if (tp->match && tp->child) {
+ STATE state = base + tp->sym;
+ STRASH *hp = &psp->hashv[p_hash(psp, state)];
+ *(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1};
+ }
+ }
+ }
+
+ while (--sp >= sv) {
+ HIT("hash collisions");
+ for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i)
+ HIT("hash displacements");
+ psp->hashv[i] = *sp;
+ }
+
+ free(sv);
+}
+
+static void
+fill_tranv(ac_trie_t *psp, TNODE const*tp)
+{
+ TNODE const *cp = tp->child;
+
+ if (cp && tp->back)
+ set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state);
+
+ for (; cp; cp = cp->next) {
+ //NOTE: cp->match is (strno+1) so that !cp->match means "no match".
+ set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix,
+ cp->child ? cp->x.state : cp->match - 1 + psp->tran_size);
+ if (cp->child)
+ fill_tranv(psp, cp);
+ }
+}
+
+static TNODE *
+find_child(TNODE *tp, SYMBOL sym)
+{
+ for (tp = tp->child; tp && tp->sym < sym; tp = tp->next);
+ return tp && tp->sym == sym ? tp : NULL;
+}
+
+#ifdef ACISM_STATS
+PSSTAT psstat[__LINE__] = {{__LINE__,0}};
+#endif//ACISM_STATS
+//EOF