You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

acism_create.c 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. ** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
  3. **
  4. ** This program is free software; you can redistribute it and/or modify
  5. ** it under the terms of the GNU Lesser General Public License Version as
  6. ** published by the Free Software Foundation. You may not use, modify or
  7. ** distribute this program under any other version of the GNU Lesser General
  8. ** Public License.
  9. **
  10. ** This program is distributed in the hope that it will be useful,
  11. ** but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. ** GNU Lesser General Public License for more details.
  14. **
  15. ** You should have received a copy of the GNU Lesser General Public License
  16. ** along with this program; if not, write to the Free Software
  17. ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. */
  19. #include "_acism.h"
  20. typedef struct tnode {
  21. struct tnode *child, *next, *back;
  22. union { unsigned nrefs; STATE state; } x;
  23. STRNO match;
  24. SYMBOL sym, is_suffix;
  25. } TNODE;
  26. //--------------|---------------------------------------------
  27. // bitwid: 1+floor(log2(u))
  28. static inline int bitwid(unsigned u)
  29. {
  30. int ret = !!u;
  31. if (u & 0xFFFF0000) u >>= 16, ret += 16;
  32. if (u & 0x0000FF00) u >>= 8, ret += 8;
  33. if (u & 0x000000F0) u >>= 4, ret += 4;
  34. if (u & 0x0000000C) u >>= 2, ret += 2;
  35. if (u & 0x00000002) ret++;
  36. return ret;
  37. }
  38. static void fill_symv(ac_trie_t*, ac_trie_pat_t const*, int ns);
  39. static int create_tree(TNODE*, SYMBOL const*symv, ac_trie_pat_t const*strv, int nstrs);
  40. static void add_backlinks(TNODE*, TNODE**, TNODE**);
  41. static int interleave(TNODE*, int nnodes, int nsyms, TNODE**, TNODE**);
  42. static void fill_tranv(ac_trie_t*, TNODE const*);
  43. static void fill_hashv(ac_trie_t*, TNODE const*, int nn);
  44. static TNODE* find_child(TNODE*, SYMBOL);
  45. // (ns) is either a STATE, or a (STRNO + tran_size)
  46. static inline void
  47. set_tran(ac_trie_t *psp, STATE s, SYMBOL sym, int match, int suffix, TRAN ns)
  48. {
  49. psp->tranv[s + sym] = sym | (match ? IS_MATCH : 0)
  50. | (suffix ? IS_SUFFIX : 0)
  51. | (ns << SYM_BITS);
  52. }
  53. // Track statistics for construction
  54. #ifdef ACISM_STATS
  55. typedef struct { long long val; const char *name; } PSSTAT;
  56. extern PSSTAT psstat[];
  57. # define NOTE(n) (psstat[__LINE__] = (PSSTAT) {n, #n})
  58. # define HIT(id) (psstat[__LINE__].val++, psstat[__LINE__].name = id)
  59. #else
  60. # define NOTE(n) (void)0
  61. # define HIT(id) (void)0
  62. #endif //ACISM_STATS
  63. //--------------|---------------------------------------------
  64. ac_trie_t*
  65. acism_create(ac_trie_pat_t const* strv, int nstrs)
  66. {
  67. TNODE *tp, **v1 = NULL, **v2 = NULL;
  68. ac_trie_t *psp = g_malloc0(sizeof(*psp));
  69. fill_symv(psp, strv, nstrs);
  70. TNODE *troot = g_new0(TNODE, psp->nchars + 1);
  71. int nnodes = create_tree(troot, psp->symv, strv, nstrs);
  72. NOTE(nnodes);
  73. // v1, v2: breadth-first work vectors for add_backlink and interleave.
  74. int nhash = 0, i = (nstrs + 1) * sizeof*tp;
  75. for (tp = troot + nnodes; --tp > troot;) {
  76. nhash += tp->match && tp->child;
  77. }
  78. v1 = g_malloc(i);
  79. v2 = g_malloc(i);
  80. add_backlinks(troot, v1, v2);
  81. // Calculate each node's offset in tranv[]:
  82. psp->tran_size = interleave(troot, nnodes, psp->nsyms, v1, v2);
  83. if (bitwid(psp->tran_size + nstrs - 1) + SYM_BITS > sizeof(TRAN)*8 - 2)
  84. goto FAIL;
  85. if (nhash) {
  86. // Hash table is for match info of non-leaf nodes (only).
  87. // Set hash_size for p_size(psp):
  88. psp->hash_mod = nhash * 5 / 4 + 1;
  89. // Initially oversize the table for overflows without wraparound.
  90. psp->hash_size = psp->hash_mod + nhash;
  91. }
  92. set_tranv(psp, g_malloc0(p_size(psp)));
  93. if (!psp->tranv) goto FAIL;
  94. fill_tranv(psp, troot);
  95. // The root state (0) must not look like a valid backref.
  96. // Any symbol value other than (0) in tranv[0] ensures that.
  97. psp->tranv[0] = 1;
  98. if (nhash) {
  99. fill_hashv(psp, troot, nnodes);
  100. // Adjust hash_size to include trailing overflows
  101. // but trim trailing empty slots.
  102. psp->hash_size = psp->hash_mod;
  103. while ( psp->hashv[psp->hash_size].state) ++psp->hash_size;
  104. while (!psp->hashv[psp->hash_size - 1].state) --psp->hash_size;
  105. set_tranv(psp, g_realloc(psp->tranv, p_size(psp)));
  106. }
  107. // Diagnostics/statistics only:
  108. psp->nstrs = nstrs;
  109. for (i = psp->maxlen = 0; i < nstrs; ++i)
  110. if (psp->maxlen < strv[i].len) psp->maxlen = strv[i].len;
  111. goto DONE;
  112. FAIL: acism_destroy(psp), psp = NULL;
  113. DONE: g_free(troot), g_free(v1), g_free(v2);
  114. return psp;
  115. }
  116. typedef struct { int freq, rank; } FRANK;
  117. static int frcmp(FRANK*a, FRANK*b) { return a->freq - b->freq; }
  118. static void
  119. fill_symv(ac_trie_t *psp, ac_trie_pat_t const *strv, int nstrs)
  120. {
  121. int i, j;
  122. FRANK frv[256];
  123. for (i = 0; i < 256; ++i) frv[i] = (FRANK){0,i};
  124. for (i = 0; i < nstrs; ++i)
  125. for (psp->nchars += j = strv[i].len; --j >= 0;)
  126. frv[(uint8_t)strv[i].ptr[j]].freq++;
  127. qsort(frv, 256, sizeof*frv, (qsort_cmp)frcmp);
  128. for (i = 256; --i >= 0 && frv[i].freq;)
  129. psp->symv[frv[i].rank] = ++psp->nsyms;
  130. ++psp->nsyms;
  131. #if ACISM_SIZE < 8
  132. psp->sym_bits = bitwid(psp->nsyms);
  133. psp->sym_mask = ~(((TRAN)-1) << psp->sym_bits);
  134. #endif
  135. }
  136. static int
  137. create_tree(TNODE *Tree, SYMBOL const *symv, ac_trie_pat_t const *strv, int nstrs)
  138. {
  139. int i, j;
  140. TNODE *nextp = Tree + 1;
  141. for (i = 0; i < nstrs; ++i) {
  142. TNODE *tp = Tree;
  143. for (j = 0; tp->child && j < (int)strv[i].len; ++j) {
  144. SYMBOL sym = symv[(uint8_t)strv[i].ptr[j]];
  145. if (sym < tp->child->sym) {
  146. // Prep to insert new node before tp->child
  147. nextp->next = tp->child;
  148. break;
  149. }
  150. tp = tp->child;
  151. while (tp->next && sym >= tp->next->sym)
  152. tp = tp->next;
  153. // Insert new sibling after tp
  154. if (sym > tp->sym) {
  155. nextp->next = tp->next;
  156. tp = tp->next = nextp++;
  157. tp->sym = sym;
  158. tp->back = Tree;
  159. }
  160. }
  161. for (; j < (int) strv[i].len; ++j) {
  162. tp = tp->child = nextp++;
  163. tp->sym = symv[(uint8_t)strv[i].ptr[j]];
  164. tp->back = Tree;
  165. }
  166. tp->match = i + 1; // Encode strno as nonzero
  167. }
  168. return nextp - Tree;
  169. }
  170. static void
  171. add_backlinks(TNODE *troot, TNODE **v1, TNODE **v2)
  172. {
  173. TNODE *tp, **tmp;
  174. for (tp = troot->child, tmp = v1; tp; tp = tp->next)
  175. *tmp++ = tp;
  176. *tmp = NULL;
  177. while (*v1) {
  178. TNODE **spp = v1, **dpp = v2, *srcp, *dstp;
  179. while ((srcp = *spp++)) {
  180. for (dstp = srcp->child; dstp; dstp = dstp->next) {
  181. TNODE *bp = NULL;
  182. *dpp++ = dstp;
  183. for (tp = srcp->back; tp; tp = tp->back)
  184. if ((bp = find_child(tp, dstp->sym)) && bp->child) break;
  185. if (!bp)
  186. bp = troot;
  187. dstp->back = dstp->child ? bp : tp ? tp : troot;
  188. dstp->back->x.nrefs++;
  189. dstp->is_suffix = bp->match || bp->is_suffix;
  190. }
  191. }
  192. *dpp = 0;
  193. tmp = v1; v1 = v2; v2 = tmp;
  194. }
  195. }
  196. static int
  197. interleave(TNODE *troot, int nnodes, int nsyms, TNODE **v1, TNODE **v2)
  198. {
  199. unsigned usev_size = nnodes + nsyms;
  200. char *usev = g_new0(char, usev_size);
  201. STATE last_trans = 0, startv[nsyms][2];
  202. TNODE *cp, **tmp;
  203. memset(startv, 0, nsyms * sizeof*startv);
  204. // Iterate through one level of the Tree at a time.
  205. // That srsly improves locality (L1-cache use).
  206. v1[0] = troot, v1[1] = NULL;
  207. for (; *v1; tmp = v1, v1 = v2, v2 = tmp) {
  208. TNODE **srcp = v1, **dstp = v2, *tp;
  209. while ((tp = *srcp++)) {
  210. if (!tp->child) continue;
  211. HIT("nonleaf");
  212. if (tp->back == troot) tp->back = NULL; // simplify tests.
  213. cp = tp->child;
  214. STATE pos, *startp = &startv[cp->sym][!!tp->back];
  215. while ((cp = cp->next)) {
  216. STATE *newp = &startv[cp->sym][!!tp->back];
  217. if (*startp < *newp) startp = newp;
  218. }
  219. // If (tp) has a backref, we need a slot at offset 0
  220. // that is free as a base AND to be used (filled in).
  221. char need = tp->back ? BASE|USED : BASE;
  222. for (pos = *startp;; ++pos) {
  223. if (usev[pos] & need) {
  224. HIT("inner loop");
  225. continue;
  226. }
  227. for (cp = tp->child; cp; cp = cp->next) {
  228. HIT("child loop");
  229. if (usev[pos + cp->sym] & USED) break;
  230. }
  231. // No child needs an in-use slot? We're done.
  232. if (!cp) break;
  233. }
  234. tp->x.state = pos;
  235. // Mark node's base and children as used:
  236. usev[pos] |= need;
  237. STATE last = 0; // Make compiler happy
  238. int nkids = 0;
  239. for (cp = tp->child; cp; *dstp++ = cp, cp = cp->next, ++nkids)
  240. usev[last = pos + cp->sym] |= USED;
  241. // This is a HEURISTIC for advancing search for other nodes
  242. *startp += (pos - *startp) / nkids;
  243. if (last_trans < last) {
  244. last_trans = last;
  245. if (last + nsyms >= usev_size) {
  246. char *tmp = g_realloc(usev, usev_size << 1);
  247. if (tmp != NULL) {
  248. usev = tmp;
  249. memset(usev + usev_size, 0, usev_size);
  250. usev_size <<= 1;
  251. } else {
  252. g_free(usev);
  253. /* And handle error */
  254. }
  255. }
  256. }
  257. }
  258. *dstp = NULL;
  259. }
  260. g_free(usev);
  261. return last_trans + 1;
  262. }
  263. static void
  264. fill_hashv(ac_trie_t *psp, TNODE const treev[], int nnodes)
  265. {
  266. STRASH *sv = g_malloc(psp->hash_mod * sizeof*sv), *sp = sv;
  267. int i;
  268. // First pass: insert without resolving collisions.
  269. for (i = 0; i < nnodes; ++i) {
  270. STATE base = treev[i].x.state;
  271. TNODE const *tp;
  272. for (tp = treev[i].child; tp; tp = tp->next) {
  273. if (tp->match && tp->child) {
  274. STATE state = base + tp->sym;
  275. STRASH *hp = &psp->hashv[p_hash(psp, state)];
  276. *(hp->state ? sp++ : hp) = (STRASH){state, tp->match - 1};
  277. }
  278. }
  279. }
  280. while (--sp >= sv) {
  281. HIT("hash collisions");
  282. for (i = p_hash(psp, sp->state); psp->hashv[i].state; ++i)
  283. HIT("hash displacements");
  284. psp->hashv[i] = *sp;
  285. }
  286. g_free(sv);
  287. }
  288. static void
  289. fill_tranv(ac_trie_t *psp, TNODE const*tp)
  290. {
  291. TNODE const *cp = tp->child;
  292. if (cp && tp->back)
  293. set_tran(psp, tp->x.state, 0, 0, 0, tp->back->x.state);
  294. for (; cp; cp = cp->next) {
  295. //NOTE: cp->match is (strno+1) so that !cp->match means "no match".
  296. set_tran(psp, tp->x.state, cp->sym, cp->match, cp->is_suffix,
  297. cp->child ? cp->x.state : cp->match - 1 + psp->tran_size);
  298. if (cp->child)
  299. fill_tranv(psp, cp);
  300. }
  301. }
  302. static TNODE *
  303. find_child(TNODE *tp, SYMBOL sym)
  304. {
  305. for (tp = tp->child; tp && tp->sym < sym; tp = tp->next);
  306. return tp && tp->sym == sym ? tp : NULL;
  307. }
  308. #ifdef ACISM_STATS
  309. PSSTAT psstat[__LINE__] = {{__LINE__,0}};
  310. #endif//ACISM_STATS
  311. //EOF