diff options
Diffstat (limited to 'contrib/aho-corasick')
-rw-r--r-- | contrib/aho-corasick/acism.c | 191 |
1 files changed, 104 insertions, 87 deletions
diff --git a/contrib/aho-corasick/acism.c b/contrib/aho-corasick/acism.c index e2b48a590..b0cee0d66 100644 --- a/contrib/aho-corasick/acism.c +++ b/contrib/aho-corasick/acism.c @@ -1,4 +1,20 @@ /* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com> ** ** This program is free software; you can redistribute it and/or modify @@ -22,103 +38,104 @@ #include "_acism.h" #include "unix-std.h" -#define BACK ((SYMBOL)0) +#define BACK ((SYMBOL) 0) #define ROOT ((STATE) 0) -extern const guchar lc_map[256]; +extern const unsigned char lc_map[256]; -int -acism_lookup(ac_trie_t const *psp, const char *text, size_t len, - ACISM_ACTION *cb, void *context, int *statep, bool caseless) +int acism_lookup(ac_trie_t const *psp, const char *text, size_t len, + ACISM_ACTION *cb, void *context, int *statep, bool caseless) { - char const *cp = text, *endp = cp + len; - uint8_t s; - STATE state = *statep; - int ret = 0; - - while (cp < endp) { - s = caseless ? lc_map[(guint8)*cp++] : *cp++; - _SYMBOL sym = psp->symv[s]; - if (!sym) { - // Input byte is not in any pattern string. - state = ROOT; - continue; - } - - // Search for a valid transition from this (state, sym), - // following the backref chain. - - TRAN next; - while (!t_valid(psp, next = p_tran(psp, state, sym)) && state != ROOT) { - TRAN back = p_tran(psp, state, BACK); - state = t_valid(psp, back) ? t_next(psp, back) : ROOT; - } - - if (!t_valid(psp, next)) - continue; - - if (!(next & (IS_MATCH | IS_SUFFIX))) { - // No complete match yet; keep going. - state = t_next(psp, next); - continue; - } - - // At this point, one or more patterns have matched. - // Find all matches by following the backref chain. - // A valid node for (sym) with no SUFFIX flag marks the - // end of the suffix chain. - // In the same backref traversal, find a new (state), - // if the original transition is to a leaf. - - STATE s = state; - - // Initially state is ROOT. The chain search saves the - // first state from which the next char has a transition. - state = t_isleaf(psp, next) ? 0 : t_next(psp, next); - - while (1) { - - if (t_valid(psp, next)) { - - if (next & IS_MATCH) { - unsigned strno, ss = s + sym, i; - if (t_isleaf(psp, psp->tranv[ss])) { - strno = t_strno(psp, psp->tranv[ss]); - } else { - for (i = p_hash(psp, ss); psp->hashv[i].state != ss; ++i); - strno = psp->hashv[i].strno; - } - - if ((ret = cb(strno, cp - text, context))) - goto EXIT; - } - - if (!state && !t_isleaf(psp, next)) - state = t_next(psp, next); - if ( state && !(next & IS_SUFFIX)) - break; - } - - if (s == ROOT) - break; - - TRAN b = p_tran(psp, s, BACK); - s = t_valid(psp, b) ? t_next(psp, b) : ROOT; - next = p_tran(psp, s, sym); - } - } + char const *cp = text, *endp = cp + len; + uint8_t s; + STATE state = *statep; + int ret = 0; + + while (cp < endp) { + s = caseless ? lc_map[(uint8_t) *cp++] : *cp++; + _SYMBOL sym = psp->symv[s]; + if (!sym) { + // Input byte is not in any pattern string. + state = ROOT; + continue; + } + + // Search for a valid transition from this (state, sym), + // following the backref chain. + + TRAN next; + while (!t_valid(psp, next = p_tran(psp, state, sym)) && state != ROOT) { + TRAN back = p_tran(psp, state, BACK); + state = t_valid(psp, back) ? t_next(psp, back) : ROOT; + } + + if (!t_valid(psp, next)) + continue; + + if (!(next & (IS_MATCH | IS_SUFFIX))) { + // No complete match yet; keep going. + state = t_next(psp, next); + continue; + } + + // At this point, one or more patterns have matched. + // Find all matches by following the backref chain. + // A valid node for (sym) with no SUFFIX flag marks the + // end of the suffix chain. + // In the same backref traversal, find a new (state), + // if the original transition is to a leaf. + + STATE s = state; + + // Initially state is ROOT. The chain search saves the + // first state from which the next char has a transition. + state = t_isleaf(psp, next) ? 0 : t_next(psp, next); + + while (1) { + + if (t_valid(psp, next)) { + + if (next & IS_MATCH) { + unsigned strno, ss = s + sym, i; + if (t_isleaf(psp, psp->tranv[ss])) { + strno = t_strno(psp, psp->tranv[ss]); + } + else { + for (i = p_hash(psp, ss); psp->hashv[i].state != ss; ++i) + ; + strno = psp->hashv[i].strno; + } + + if ((ret = cb(strno, cp - text, context))) + goto EXIT; + } + + if (!state && !t_isleaf(psp, next)) + state = t_next(psp, next); + if (state && !(next & IS_SUFFIX)) + break; + } + + if (s == ROOT) + break; + + TRAN b = p_tran(psp, s, BACK); + s = t_valid(psp, b) ? t_next(psp, b) : ROOT; + next = p_tran(psp, s, sym); + } + } EXIT: *statep = state; - return ret; + return ret; } -void -acism_destroy(ac_trie_t *psp) +void acism_destroy(ac_trie_t *psp) { if (!psp) return; if (psp->flags & IS_MMAP) - munmap((char*)psp->tranv - sizeof(ac_trie_t), - sizeof(ac_trie_t) + p_size(psp)); - else g_free(psp->tranv); + munmap((char *) psp->tranv - sizeof(ac_trie_t), + sizeof(ac_trie_t) + p_size(psp)); + else + g_free(psp->tranv); g_free(psp); } //EOF |