aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/aho-corasick/acism.c
blob: b0cee0d667c0ffe183616d1b4dfb875fefa8acc7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/*
 * Copyright 2024 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
** Copyright (C) 2009-2014 Mischa Sandberg <mischasan@gmail.com>
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU Lesser General Public License Version as
** published by the Free Software Foundation.  You may not use, modify or
** distribute this program under any other version of the GNU Lesser General
** Public License.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
*/

#include <glib.h>

#include "_acism.h"
#include "unix-std.h"

#define BACK ((SYMBOL) 0)
#define ROOT ((STATE) 0)
extern const unsigned char lc_map[256];

int acism_lookup(ac_trie_t const *psp, const char *text, size_t len,
				 ACISM_ACTION *cb, void *context, int *statep, bool caseless)
{
	char const *cp = text, *endp = cp + len;
	uint8_t s;
	STATE state = *statep;
	int ret = 0;

	while (cp < endp) {
		s = caseless ? lc_map[(uint8_t) *cp++] : *cp++;
		_SYMBOL sym = psp->symv[s];
		if (!sym) {
			// Input byte is not in any pattern string.
			state = ROOT;
			continue;
		}

		// Search for a valid transition from this (state, sym),
		//  following the backref chain.

		TRAN next;
		while (!t_valid(psp, next = p_tran(psp, state, sym)) && state != ROOT) {
			TRAN back = p_tran(psp, state, BACK);
			state = t_valid(psp, back) ? t_next(psp, back) : ROOT;
		}

		if (!t_valid(psp, next))
			continue;

		if (!(next & (IS_MATCH | IS_SUFFIX))) {
			// No complete match yet; keep going.
			state = t_next(psp, next);
			continue;
		}

		// At this point, one or more patterns have matched.
		// Find all matches by following the backref chain.
		// A valid node for (sym) with no SUFFIX flag marks the
		//  end of the suffix chain.
		// In the same backref traversal, find a new (state),
		//  if the original transition is to a leaf.

		STATE s = state;

		// Initially state is ROOT. The chain search saves the
		//  first state from which the next char has a transition.
		state = t_isleaf(psp, next) ? 0 : t_next(psp, next);

		while (1) {

			if (t_valid(psp, next)) {

				if (next & IS_MATCH) {
					unsigned strno, ss = s + sym, i;
					if (t_isleaf(psp, psp->tranv[ss])) {
						strno = t_strno(psp, psp->tranv[ss]);
					}
					else {
						for (i = p_hash(psp, ss); psp->hashv[i].state != ss; ++i)
							;
						strno = psp->hashv[i].strno;
					}

					if ((ret = cb(strno, cp - text, context)))
						goto EXIT;
				}

				if (!state && !t_isleaf(psp, next))
					state = t_next(psp, next);
				if (state && !(next & IS_SUFFIX))
					break;
			}

			if (s == ROOT)
				break;

			TRAN b = p_tran(psp, s, BACK);
			s = t_valid(psp, b) ? t_next(psp, b) : ROOT;
			next = p_tran(psp, s, sym);
		}
	}
EXIT:
	*statep = state;
	return ret;
}

void acism_destroy(ac_trie_t *psp)
{
	if (!psp) return;
	if (psp->flags & IS_MMAP)
		munmap((char *) psp->tranv - sizeof(ac_trie_t),
			   sizeof(ac_trie_t) + p_size(psp));
	else
		g_free(psp->tranv);
	g_free(psp);
}
//EOF