From a0f7afdc2c97c742a92fd77bd3927b0c8fa422c7 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 28 Sep 2020 11:05:12 +0100 Subject: [PATCH] [Fix] Fuzzy_check: Disable shingles for short texts (really) --- conf/modules.d/fuzzy_check.conf | 3 ++- lualib/lua_fuzzy.lua | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/conf/modules.d/fuzzy_check.conf b/conf/modules.d/fuzzy_check.conf index 5f02d864b..73e280f79 100644 --- a/conf/modules.d/fuzzy_check.conf +++ b/conf/modules.d/fuzzy_check.conf @@ -25,7 +25,8 @@ fuzzy_check { max_score = 20.0; read_only = yes; skip_unknown = yes; - short_text_direct_hash = true; + short_text_direct_hash = true; # If less than min_length then use direct hash + min_length = 64; # Minimum words count to consider shingles fuzzy_map = { FUZZY_DENIED { max_score = 20.0; diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua index d2733d5d6..0131ef8e2 100644 --- a/lualib/lua_fuzzy.lua +++ b/lualib/lua_fuzzy.lua @@ -157,14 +157,18 @@ local function check_text_part(task, part, rule, text) if rule.text_shingles then -- Check number of words - if rule.min_length > 0 and wcnt < rule.min_length then + local min_words = rule.min_length or 0 + if min_words < 32 then + min_words = 32 -- Minimum for shingles + end + if wcnt < min_words then lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles', rule.min_length, wcnt) allow_shingles = false else lua_util.debugm(N, task, 'allow shingles in text %s, %s words', id, wcnt) - allow_shingles = wcnt > 0 + allow_shingles = true end if not rule.short_text_direct_hash and not allow_shingles then @@ -191,7 +195,7 @@ end local function has_sane_text_parts(task) local text_parts = task:get_text_parts() or {} - return fun.any(function(tp) return tp:get_words_count() > 10 end, text_parts) + return fun.any(function(tp) return tp:get_words_count() > 32 end, text_parts) end local function check_image_part(task, part, rule, image) -- 2.39.5