123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382 |
- /*
- ** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
- **
- ** This program is free software; you can redistribute it and/or modify
- ** it under the terms of the GNU Lesser General Public License Version 3 as
- ** published by the Free Software Foundation. You may not use, modify or
- ** distribute this program under any other version of the GNU Lesser General
- ** Public License.
- **
- ** This program is distributed in the hope that it will be useful,
- ** but WITHOUT ANY WARRANTY; without even the implied warranty of
- ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ** GNU Lesser General Public License for more details.
- **
- ** You should have received a copy of the GNU Lesser General Public License
- ** along with this program; if not, write to the Free Software
- ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- */
- #include "_acism.h"
-
- typedef enum { BASE=2, USED=1 } USES;
-
- typedef struct tnode {
- struct tnode *child, *next, *back;
- // nrefs was used in "prune_backlinks".
- // It will be used again in "curtail".
- unsigned nrefs;
- STATE state;
- STRNO match;
- SYMBOL sym;
- char is_suffix; // "bool"
- } TNODE;
-
- //--------------|---------------------------------------------
- // bitwid: 1+floor(log2(u))
- static inline int bitwid(unsigned u)
- {
- int ret = !!u;
- if (u & 0xFFFF0000) u >>= 16, ret += 16;
- if (u & 0x0000FF00) u >>= 8, ret += 8;
- if (u & 0x000000F0) u >>= 4, ret += 4;
- if (u & 0x0000000C) u >>= 2, ret += 2;
- if (u & 0x00000002) ret++;
- return ret;
- }
-
- static void fill_symv(ACISM*, MEMREF const*, int ns);
- static int create_tree(TNODE*, SYMBOL const*symv, MEMREF const*strv, int nstrs);
- static void add_backlinks(TNODE*, TNODE**, TNODE**);
- static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**);
- static void fill_tranv(ACISM*, TNODE const*);
- static void fill_hashv(ACISM*, TNODE const*, int nn);
-
- static TNODE* find_child(TNODE*, SYMBOL);
-
- // (ns) is either a STATE, or a (STRNO + tran_size)
- static inline void
- set_tran(ACISM *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns)
- {
- psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0)
- | (suffix ? IS_SUFFIX : 0)
- | (ns << SYM_BITS);
- }
-
- // Track statistics for construction
- #ifdef ACISM_STATS
- typedef struct { long long val; const char *name; } PSSTAT;
- extern PSSTAT psstat[];
- # define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n})
- # define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id)
- #else
- # define NOTE(n) (void)0
- # define HIT(id) (void)0
- #endif //ACISM_STATS
-
- //--------------|---------------------------------------------
- ACISM*
- acism_create(MEMREF const* strv, int nstrs)
- {
- TNODE **v1 = NULL, **v2 = NULL;
- ACISM *psp = g_malloc0(sizeof*psp);
-
- fill_symv(psp, strv, nstrs);
- TNODE *troot = g_malloc0((psp->nchars + 1) * sizeof(*troot));
-
- int nnodes = create_tree(troot, psp->symv, strv, nstrs);
- NOTE(nnodes);
-
- // v1, v2: breadth-first work vectors for add_backlink and interleave.
- int i = (nstrs + 1) * sizeof(TNODE);
- add_backlinks(troot, v1 = g_malloc0(i), v2 = g_malloc0(i));
-
- int nhash = 0;
- TNODE* tp = troot + nnodes;
- while (--tp > troot)
- nhash += tp->match && tp->child;
-
- // Calculate each node's offset in tranv[]:
- psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2);
- if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2)
- goto FAIL;
-
- if (nhash) {
- // Hash table is for match info of non-leaf nodes (only).
- // Set hash_size for p_size(psp):
- psp->hash_mod = nhash * 5 / 4 + 1;
- // Initially oversize the table for overflows without wraparound.
- psp->hash_size = psp->hash_mod + nhash;
- }
-
- set_tranv(psp, g_malloc0(p_size(psp) + sizeof(TRAN)));
- if (!psp->tranv) goto FAIL;
- fill_tranv(psp, troot);
- // The root state (0) must not look like a valid backref.
- // Any symbol value other than (0) in tranv[0] ensures that.
- psp->tranv[0] = 1;
-
- if (nhash) {
- fill_hashv(psp, troot, nnodes);
- // Adjust hash_size to include trailing overflows
- // but trim trailing empty slots.
- psp->hash_size = psp->hash_mod;
- while ( psp->hashv[psp->hash_size].state) ++psp->hash_size;
- while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size;
- set_tranv(psp, g_realloc(psp->tranv, p_size(psp)));
- }
-
- // Diagnostics/statistics only:
- psp->nstrs = nstrs;
- for (i = psp->maxlen = 0; i < nstrs; ++i)
- if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len;
-
- goto DONE;
- FAIL: acism_destroy(psp), psp = NULL;
- DONE: free(troot), free(v1), free(v2);
- return psp;
- }
-
- typedef struct { int freq, rank; } FRANK;
- static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; }
-
- static void
- fill_symv(ACISM *psp, MEMREF const *strv, int nstrs)
- {
- int i, j;
- FRANK frv[256];
-
- for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i};
- for (i = 0; i < nstrs; ++i)
- for (psp->nchars += j = strv[i].len; --j >= 0;)
- frv[(uint8_t)strv[i].ptr[j]].freq++;
-
- qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp);
-
- for (i = 256; --i >= 0 && frv[i].freq;)
- psp->symv[frv[i].rank] = ++psp->nsyms;
- ++psp->nsyms;
-
- #if ACISM_SIZE < 8
- psp->sym_bits = bitwid(psp->nsyms);
- psp->sym_mask = ~((~0u) << psp->sym_bits);
- #endif
- }
-
- static int
- create_tree(TNODE *Tree, SYMBOL const *symv, MEMREF const *strv, int nstrs)
- {
- int i, j;
- TNODE *nextp = Tree + 1;
-
- for (i = 0; i < nstrs; ++i) {
- TNODE *tp = Tree;
-
- for (j = 0; tp->child && j < (int)strv[i].len; ++j) {
- SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]];
-
- if (sym < tp->child->sym) {
- // Prep to insert new node before tp->child
- nextp->next = tp->child;
- break;
- }
-
- tp = tp->child;
- while (tp->next && sym >= tp->next->sym)
- tp = tp->next;
-
- // Insert new sibling after tp
- if (sym > tp->sym) {
- nextp->next = tp->next;
- tp = tp->next = nextp++;
- tp->sym = sym;
- tp->back = Tree;
- }
- }
-
- for (; j < (int) strv[i].len; ++j) {
- tp = tp->child = nextp++;
- tp->sym = symv[(uint8_t)strv[i].ptr[j]];
- tp->back = Tree;
- }
-
- tp->match = i + 1; // Encode strno as nonzero
- }
-
- return nextp - Tree;
- }
-
- static void
- add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2)
- {
- TNODE *tp, **tmp;
-
- for (tp = troot->child, tmp = v1; tp; tp = tp->next)
- *tmp++ = tp;
- *tmp = NULL;
-
- while (*v1) {
- TNODE **spp = v1, **dpp = v2, *srcp, *dstp;
-
- while ((srcp = *spp++)) {
- for (dstp = srcp->child; dstp; dstp = dstp->next) {
- TNODE *bp = NULL;
- if (dstp->child)
- *dpp++ = dstp;
-
- // Go through the parent (srcp) node's backlink chain,
- // looking for a useful backlink for the child (dstp).
- // If the parent (srcp) has a backlink to (tp),
- // and (tp) has a child matching the transition sym
- // for (srcp -> dstp), then it is a useful backlink
- // for the child (dstp).
- // Note that backlinks do not point at the suffix match;
- // they point at the PARENT of that match.
-
- for (tp = srcp->back; tp; tp = tp->back)
- if ((bp = find_child(tp, dstp->sym)))
- break;
- if (!bp)
- bp = troot;
-
- dstp->back = dstp->child ? bp : tp ? tp : troot;
- dstp->back->nrefs++;
- dstp->is_suffix = bp->match || bp->is_suffix;
- }
- }
- *dpp = 0;
- tmp = v1; v1 = v2; v2 = tmp;
- }
- }
-
- static int
- interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2)
- {
- unsigned usev_size = nnodes + nsyms;
- char *usev = g_malloc0(usev_size * sizeof(*usev));
- STATE last_trans = 0, startv[257][2];
- TNODE *cp, **tmp;
-
- memset(startv, 0, nsyms * sizeof*startv);
-
- // Iterate through one level of the Tree at a time.
- // That srsly improves locality (L1-cache use).
-
- v1[0] = troot, v1[1] = NULL;
- for (; *v1; tmp = v1, v1 = v2, v2 = tmp) {
- TNODE **srcp = v1, **dstp = v2, *tp;
- while ((tp = *srcp++)) {
- if (!tp->child) continue;
-
- HIT("nonleaf");
- if (tp->back == troot) tp->back = NULL; // simplify tests.
- cp = tp->child;
-
- STATE pos, *startp = &startv[cp->sym][!!tp->back];
- while ((cp = cp->next)) {
- STATE *newp = &startv[cp->sym][!!tp->back];
- if (*startp < *newp) startp = newp;
- }
-
- // If (tp) has a backref, we need a slot at offset 0
- // that is free as a base AND to be used (filled in).
- char need = tp->back ? BASE|USED : BASE;
- for (pos = *startp;; ++pos) {
- if (usev[pos] & need) {
- HIT("inner loop");
- continue;
- }
-
- for (cp = tp->child; cp; cp = cp->next) {
- HIT("child loop");
- if (usev[pos + cp->sym] & USED) break;
- }
-
- // No child needs an in-use slot? We're done.
- if (!cp) break;
- }
- tp->state = pos;
-
- // Mark node's base and children as used:
- usev[pos] |= need;
- STATE last = 0; // Make compiler happy
- int nkids = 0;
- for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids)
- usev[last = pos + cp->sym] |= USED;
-
- // This is a HEURISTIC for advancing search for other nodes
- *startp += (pos - *startp) / nkids;
-
- if (last_trans < last) {
- last_trans = last;
- if (last + nsyms >= usev_size) {
- usev = g_realloc(usev, usev_size << 1);
- memset(usev + usev_size, 0, usev_size);
- usev_size <<= 1;
- }
- }
- }
-
- *dstp = NULL;
- }
-
- free(usev);
- return last_trans + 1;
- }
-
- static void
- fill_hashv(ACISM *psp, TNODE const treev[], int nnodes)
- {
- STRASH *sv = g_malloc0(psp->hash_mod * sizeof*sv), *sp = sv;
- int i;
-
- // First pass: insert without resolving collisions.
- for (i = 0; i < nnodes; ++i) {
- STATE base = treev[i].state;
- TNODE const *tp;
- for (tp = treev[i].child; tp; tp = tp->next) {
- if (tp->match && tp->child) {
- STATE state = base + tp->sym;
- STRASH *hp = &psp->hashv[p_hash(psp, state)];
- *(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1};
- }
- }
- }
-
- while (--sp >= sv) {
- HIT("hash collisions");
- for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i)
- HIT("hash displacements");
- psp->hashv[i] = *sp;
- }
-
- free(sv);
- }
-
- static void
- fill_tranv(ACISM *psp, TNODE const*tp)
- {
- TNODE const *cp = tp->child;
-
- if (cp && tp->back)
- set_tran(psp, tp->state, 0, 0, 0, tp->back->state);
-
- for (; cp; cp = cp->next) {
- //NOTE: cp->match is (strno+1) so that !cp->match means "no match".
- set_tran(psp, tp->state, cp->sym, cp->match, cp->is_suffix,
- cp->child ? cp->state : cp->match - 1 + psp->tran_size);
- if (cp->child)
- fill_tranv(psp, cp);
- }
- }
-
- static TNODE *
- find_child(TNODE *tp, SYMBOL sym)
- {
- for (tp = tp->child; tp && tp->sym < sym; tp = tp->next);
- return tp && tp->sym == sym ? tp : NULL;
- }
-
- #ifdef ACISM_STATS
- PSSTAT psstat[__LINE__] = {{__LINE__,0}};
- #endif//ACISM_STATS
- //EOF
|