123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "shingles.h"
- #include "fstring.h"
- #include "cryptobox.h"
- #include "images.h"
- #include "libstat/stat_api.h"
-
- #define SHINGLES_WINDOW 3
- #define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
-
- static unsigned int
- rspamd_shingles_keys_hash(gconstpointer k)
- {
- return rspamd_cryptobox_fast_hash(k, SHINGLES_KEY_SIZE,
- rspamd_hash_seed());
- }
-
- static gboolean
- rspamd_shingles_keys_equal(gconstpointer k1, gconstpointer k2)
- {
- return (memcmp(k1, k2, SHINGLES_KEY_SIZE) == 0);
- }
-
- static void
- rspamd_shingles_keys_free(gpointer p)
- {
- unsigned char **k = p;
- unsigned int i;
-
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- g_free(k[i]);
- }
-
- g_free(k);
- }
-
- static unsigned char **
- rspamd_shingles_keys_new(void)
- {
- unsigned char **k;
- unsigned int i;
-
- k = g_malloc0(sizeof(unsigned char *) * RSPAMD_SHINGLE_SIZE);
-
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- k[i] = g_malloc0(sizeof(unsigned char) * SHINGLES_KEY_SIZE);
- }
-
- return k;
- }
-
- static unsigned char **
- rspamd_shingles_get_keys_cached(const unsigned char key[SHINGLES_KEY_SIZE])
- {
- static GHashTable *ht = NULL;
- unsigned char **keys = NULL, *key_cpy;
- rspamd_cryptobox_hash_state_t bs;
- const unsigned char *cur_key;
- unsigned char shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
- unsigned int i;
-
- if (ht == NULL) {
- ht = g_hash_table_new_full(rspamd_shingles_keys_hash,
- rspamd_shingles_keys_equal, g_free, rspamd_shingles_keys_free);
- }
- else {
- keys = g_hash_table_lookup(ht, key);
- }
-
- if (keys == NULL) {
- keys = rspamd_shingles_keys_new();
- key_cpy = g_malloc(SHINGLES_KEY_SIZE);
- memcpy(key_cpy, key, SHINGLES_KEY_SIZE);
-
- /* Generate keys */
- rspamd_cryptobox_hash_init(&bs, NULL, 0);
- cur_key = key;
-
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- /*
- * To generate a set of hashes we just apply sha256 to the
- * initial key as many times as many hashes are required and
- * xor left and right parts of sha256 to get a single 16 bytes SIP key.
- */
- out_key = keys[i];
- rspamd_cryptobox_hash_update(&bs, cur_key, 16);
- rspamd_cryptobox_hash_final(&bs, shabuf);
-
- memcpy(out_key, shabuf, 16);
- rspamd_cryptobox_hash_init(&bs, NULL, 0);
- cur_key = out_key;
- }
-
- g_hash_table_insert(ht, key_cpy, keys);
- }
-
- return keys;
- }
-
- struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
- rspamd_shingles_from_text(GArray *input,
- const unsigned char key[16],
- rspamd_mempool_t *pool,
- rspamd_shingles_filter filter,
- gpointer filterd,
- enum rspamd_shingle_alg alg)
- {
- struct rspamd_shingle *res;
- uint64_t **hashes;
- unsigned char **keys;
- rspamd_fstring_t *row;
- rspamd_stat_token_t *word;
- uint64_t val;
- int i, j, k;
- gsize hlen, ilen = 0, beg = 0, widx = 0;
- enum rspamd_cryptobox_fast_hash_type ht;
-
- if (pool != NULL) {
- res = rspamd_mempool_alloc(pool, sizeof(*res));
- }
- else {
- res = g_malloc(sizeof(*res));
- }
-
- row = rspamd_fstring_sized_new(256);
-
- for (i = 0; i < input->len; i++) {
- word = &g_array_index(input, rspamd_stat_token_t, i);
-
- if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
- ilen++;
- }
- }
-
- /* Init hashes pipes and keys */
- hashes = g_malloc(sizeof(*hashes) * RSPAMD_SHINGLE_SIZE);
- hlen = ilen > SHINGLES_WINDOW ? (ilen - SHINGLES_WINDOW + 1) : 1;
- keys = rspamd_shingles_get_keys_cached(key);
-
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- hashes[i] = g_malloc(hlen * sizeof(uint64_t));
- }
-
- /* Now parse input words into a vector of hashes using rolling window */
- if (alg == RSPAMD_SHINGLES_OLD) {
- for (i = 0; i <= (int) ilen; i++) {
- if (i - beg >= SHINGLES_WINDOW || i == (int) ilen) {
- for (j = beg; j < i; j++) {
-
- word = NULL;
- while (widx < input->len) {
- word = &g_array_index(input, rspamd_stat_token_t, widx);
-
- if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
- widx++;
- }
- else {
- break;
- }
- }
-
- if (word == NULL) {
- /* Nothing but exceptions */
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- g_free(hashes[i]);
- }
-
- g_free(hashes);
-
- if (pool == NULL) {
- g_free(res);
- }
-
- rspamd_fstring_free(row);
-
- return NULL;
- }
-
- row = rspamd_fstring_append(row, word->stemmed.begin,
- word->stemmed.len);
- }
-
- /* Now we need to create a new row here */
- for (j = 0; j < RSPAMD_SHINGLE_SIZE; j++) {
- rspamd_cryptobox_siphash((unsigned char *) &val, row->str, row->len,
- keys[j]);
- g_assert(hlen > beg);
- hashes[j][beg] = val;
- }
-
- beg++;
- widx++;
-
- row = rspamd_fstring_assign(row, "", 0);
- }
- }
- }
- else {
- uint64_t window[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
-
- switch (alg) {
- case RSPAMD_SHINGLES_XXHASH:
- ht = RSPAMD_CRYPTOBOX_XXHASH64;
- break;
- case RSPAMD_SHINGLES_MUMHASH:
- ht = RSPAMD_CRYPTOBOX_MUMHASH;
- break;
- default:
- ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
- break;
- }
-
- memset(window, 0, sizeof(window));
- for (i = 0; i <= ilen; i++) {
- if (i - beg >= SHINGLES_WINDOW || i == ilen) {
-
- for (j = 0; j < RSPAMD_SHINGLE_SIZE; j++) {
- /* Shift hashes window to right */
- for (k = 0; k < SHINGLES_WINDOW - 1; k++) {
- window[j * SHINGLES_WINDOW + k] =
- window[j * SHINGLES_WINDOW + k + 1];
- }
-
- word = NULL;
-
- while (widx < input->len) {
- word = &g_array_index(input, rspamd_stat_token_t, widx);
-
- if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
- widx++;
- }
- else {
- break;
- }
- }
-
- if (word == NULL) {
- /* Nothing but exceptions */
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- g_free(hashes[i]);
- }
-
- if (pool == NULL) {
- g_free(res);
- }
-
- g_free(hashes);
- rspamd_fstring_free(row);
-
- return NULL;
- }
-
- /* Insert the last element to the pipe */
- memcpy(&seed, keys[j], sizeof(seed));
- window[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
- rspamd_cryptobox_fast_hash_specific(ht,
- word->stemmed.begin, word->stemmed.len,
- seed);
- val = 0;
- for (k = 0; k < SHINGLES_WINDOW; k++) {
- val ^= window[j * SHINGLES_WINDOW + k] >>
- (8 * (SHINGLES_WINDOW - k - 1));
- }
-
- g_assert(hlen > beg);
- hashes[j][beg] = val;
- }
-
- beg++;
- widx++;
- }
- }
- }
-
- /* Now we need to filter all hashes and make a shingles result */
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- res->hashes[i] = filter(hashes[i], hlen,
- i, key, filterd);
- g_free(hashes[i]);
- }
-
- g_free(hashes);
-
- rspamd_fstring_free(row);
-
- return res;
- }
-
- struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
- rspamd_shingles_from_image(unsigned char *dct,
- const unsigned char key[16],
- rspamd_mempool_t *pool,
- rspamd_shingles_filter filter,
- gpointer filterd,
- enum rspamd_shingle_alg alg)
- {
- struct rspamd_shingle *shingle;
- uint64_t **hashes;
- unsigned char **keys;
- uint64_t d;
- uint64_t val;
- int i, j;
- gsize hlen, beg = 0;
- enum rspamd_cryptobox_fast_hash_type ht;
- uint64_t res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
-
- if (pool != NULL) {
- shingle = rspamd_mempool_alloc(pool, sizeof(*shingle));
- }
- else {
- shingle = g_malloc(sizeof(*shingle));
- }
-
- /* Init hashes pipes and keys */
- hashes = g_malloc(sizeof(*hashes) * RSPAMD_SHINGLE_SIZE);
- hlen = RSPAMD_DCT_LEN / NBBY + 1;
- keys = rspamd_shingles_get_keys_cached(key);
-
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- hashes[i] = g_malloc(hlen * sizeof(uint64_t));
- }
-
- switch (alg) {
- case RSPAMD_SHINGLES_OLD:
- ht = RSPAMD_CRYPTOBOX_MUMHASH;
- break;
- case RSPAMD_SHINGLES_XXHASH:
- ht = RSPAMD_CRYPTOBOX_XXHASH64;
- break;
- case RSPAMD_SHINGLES_MUMHASH:
- ht = RSPAMD_CRYPTOBOX_MUMHASH;
- break;
- default:
- ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
- break;
- }
-
- memset(res, 0, sizeof(res));
- #define INNER_CYCLE_SHINGLES(s, e) \
- for (j = (s); j < (e); j++) { \
- d = dct[beg]; \
- memcpy(&seed, keys[j], sizeof(seed)); \
- val = rspamd_cryptobox_fast_hash_specific(ht, \
- &d, sizeof(d), \
- seed); \
- hashes[j][beg] = val; \
- }
- for (i = 0; i < RSPAMD_DCT_LEN / NBBY; i++) {
- INNER_CYCLE_SHINGLES(0, RSPAMD_SHINGLE_SIZE / 4);
- INNER_CYCLE_SHINGLES(RSPAMD_SHINGLE_SIZE / 4, RSPAMD_SHINGLE_SIZE / 2);
- INNER_CYCLE_SHINGLES(RSPAMD_SHINGLE_SIZE / 2, 3 * RSPAMD_SHINGLE_SIZE / 4);
- INNER_CYCLE_SHINGLES(3 * RSPAMD_SHINGLE_SIZE / 4, RSPAMD_SHINGLE_SIZE);
-
- beg++;
- }
- #undef INNER_CYCLE_SHINGLES
- /* Now we need to filter all hashes and make a shingles result */
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- shingle->hashes[i] = filter(hashes[i], hlen,
- i, key, filterd);
- g_free(hashes[i]);
- }
-
- g_free(hashes);
-
- return shingle;
- }
-
- uint64_t
- rspamd_shingles_default_filter(uint64_t *input, gsize count,
- int shno, const unsigned char *key, gpointer ud)
- {
- uint64_t minimal = G_MAXUINT64;
- gsize i;
-
- for (i = 0; i < count; i++) {
- if (minimal > input[i]) {
- minimal = input[i];
- }
- }
-
- return minimal;
- }
-
-
- double rspamd_shingles_compare(const struct rspamd_shingle *a,
- const struct rspamd_shingle *b)
- {
- int i, common = 0;
-
- for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
- if (a->hashes[i] == b->hashes[i]) {
- common++;
- }
- }
-
- return (double) common / (double) RSPAMD_SHINGLE_SIZE;
- }
|