12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969 |
- \input texinfo
- @settitle "Rspamd Spam Filtering System"
- @titlepage
-
- @title Rspamd Spam Filtering System
- @subtitle A User's Guide for Rspamd
-
- @author Vsevolod Stakhov
-
-
- @end titlepage
- @contents
-
- @chapter Rspamd purposes and features.
-
- @section Introduction.
- Rspamd filtering system is created as a replacement of popular
- @code{spamassassin}
- spamd and is designed to be fast, modular and easily extendable system. Rspamd
- core is written in @code{C} language using event driven paradigm. Plugins for rspamd
- can be written in @code{lua}. Rspamd is designed to process connections
- completely asynchronously and does not block anywhere in code. Spam filtering system
- consists of several processes, among them are:
- @itemize @bullet
- @item Main process
- @item Workers processes
- @item Controller process
- @item Other processes
- @end itemize
- Main process manages all other processes, accepting signals from OS (for example
- SIGHUP) and spawn all types of processes if any of them die. Workers processes
- do all tasks for filtering e-mail (or HTML messages in case of using rspamd as
- non-MIME filter). Controller process is designed to manage rspamd itself (for
- example get statistics or learning rspamd). Other processes can do different
- jobs among them now are implemented @code{LMTP} worker that implements
- @code{LMTP} protocol for filtering mail and fuzzy hashes storage server.
-
- @section Features.
- The main features of rspamd are:
- @itemize @bullet
- @item Completely asynchronous filtering that allows a big number of simultaneous
- connections.
- @item Easily extendable architecture that can be extended by plugins written in
- @code{lua} and by dynamically loaded plugins written in @code{c}.
- @item Ability to work in cluster: rspamd is able to perform statfiles
- synchronization, dynamic load of lists via HTTP, to use distributed fuzzy hashes
- storage.
- @item Advanced statistics: rspamd now is shipped with winnow-osb classifier that
- provides more accurate statistics than traditional bayesian algorithms based on
- single words.
- @item Internal optimizer: rspamd first of all tries to check rules that were met
- more often, so for huge spam storms it works very fast as it checks only
- those rules that @emph{can} happen and skips all others.
- @item Ability to manage the whole cluster by using controller process.
- @item Compatibility with existing @code{spamassassin} SPAMC protocol.
- @item Extended @code{RSPAMC} protocol that allows to pass many additional data
- from SMTP dialog to rspamd filter.
- @item Internal support of IMAP in rspamc client for automated learning.
- @item Internal support of many anti-spam technologies, among them are
- @code{SPF} and @code{SURBL}.
- @item Active support and development of new features.
- @end itemize
-
- @chapter Installation of rspamd.
-
- @section Obtaining of rspamd.
-
- The main rspamd site is @url{http://rspamd.sourceforge.net/, sourceforge}. Here
- you can obtain source code package as well as pre-packed packages for different
- operating systems and architectures. Also, you can use SCM
- @url{http://mercurial.selenic.com, mercurial} for accessing rspamd development
- repository that can be found here:
- @url{http://rspamd.hg.sourceforge.net:8000/hgroot/rspamd/rspamd}. Rspamd is
- shipped with all modules and sample config by default. But there are some
- requirements for building and running rspamd.
-
- @section Requirements.
-
- For building rspamd from sources you need @code{CMake} system. CMake is very
- nice source building system and I decided to use it instead of GNU autotools.
- CMake can be obtained here: @url{http://cmake.org}. Also rspamd uses gmime and
- glib for MIME parsing and many other purposes (note that you are NOT required
- to install any GUI libraries - neither glib nor gmime is a GUI library). Gmime
- and glib can be obtained from gnome site: @url{http://ftp.gnome.org/}. For
- plugins and configuration system you also need lua language interpreter and
- libraries. They can be easily obtained from @url{http://lua.org, official lua
- site}. Also for rspamc client you need @code{perl} interpreter that could be
- installed from @url{http://www.perl.org}.
-
- @section Building and Installation.
-
- Build process of rspamd is rather simple:
- @itemize @bullet
- @item Configure rspamd build environment, using cmake:
- @example
- $ cmake .
- ...
- -- Configuring done
- -- Generating done
- -- Build files have been written to: /home/cebka/rspamd
- @end example
- @noindent
- For special configuring options you can use
- @example
- $ ccmake .
- CMAKE_BUILD_TYPE
- CMAKE_INSTALL_PREFIX /usr/local
- DEBUG_MODE ON
- ENABLE_GPERF_TOOLS OFF
- ENABLE_OPTIMIZATION OFF
- ENABLE_PERL OFF
- ENABLE_PROFILING OFF
- ENABLE_REDIRECTOR OFF
- ENABLE_STATIC OFF
- @end example
- @noindent
- Options allow building rspamd as static module (note that in this case
- dynamically loaded plugins are @strong{NOT} supported), linking rspamd with
- google performance tools for benchmarking and including some other flags while
- building.
- @item Build rspamd sources:
- @example
- $ make
- [ 6%] Built target rspamd_lua
- [ 11%] Built target rspamd_json
- [ 12%] Built target rspamd_evdns
- [ 12%] Built target perlmodule
- [ 58%] Built target rspamd
- [ 76%] Built target test/rspamd-test
- [ 85%] Built target utils/expression-parser
- [ 94%] Built target utils/url-extracter
- [ 97%] Built target rspamd_ipmark
- [100%] Built target rspamd_regmark
- @end example
- @noindent
- @item Install rspamd (as superuser):
- @example
- # make install
- Install the project...
- ...
- @end example
- @noindent
- @end itemize
-
- After installation you would have several new files installed:
- @itemize @bullet
-
- @item Binaries:
- @itemize @bullet
- @item PREFIX/bin/rspamd - main rspamd executable
- @item PREFIX/bin/rspamc - rspamd client program
- @end itemize
- @item Sample configuration files and rules:
- @itemize @bullet
- @item PREFIX/etc/rspamd.xml.sample - sample main config file
- @item PREFIX/etc/rspamd/lua/*.lua - rspamd rules
- @end itemize
- @item Lua plugins:
- @itemize @bullet
- @item PREFIX/etc/rspamd/plugins/lua/*.lua - lua plugins
- @end itemize
-
- @end itemize
- For @code{FreeBSD} system there also would be start script for running rspamd in
- @emph{PREFIX/etc/rc.d/rspamd.sh}.
-
- @section Running rspamd.
-
- Rspamd can be started by running main rspamd executable -
- @code{PREFIX/bin/rspamd}. There are several command-line options that can be
- passed to rspamd. All of them can be displayed by passing --help argument:
- @example
- $ rspamd --help
- Usage:
- rspamd [OPTION...] - run rspamd daemon
-
- Summary:
- Rspamd daemon version 0.3.0
-
- Help Options:
- -?, --help Show help options
-
- Application Options:
- -t, --config-test Do config test and exit
- -f, --no-fork Do not daemonize main process
- -c, --config Specify config file
- -u, --user User to run rspamd as
- -g, --group Group to run rspamd as
- -p, --pid Path to pidfile
- -V, --dump-vars Print all rspamd variables and exit
- -C, --dump-cache Dump symbols cache stats and exit
- -X, --convert-config Convert old style of config to xml one
- @end example
- @noindent
-
- All options are optional: by default rspamd would try to read
- @code{PREFIX/etc/rspamd.xml} config file and run as daemon. Also there is test
- mode that can be turned on by passing @option{-t} argument. In test mode rspamd
- would read config file and check its syntax: if config file is OK, then exit
- code is zero and non zero otherwise. Test mode is useful for testing new config
- file without restarting of rspamd. With @option{-C} and @option{-V} arguments it is
- possible to dump variables or symbols cache data. The last ability can be used
- for determining which symbols are the most frequent, which are the slowest, and to see
- the real order of rules inside rspamd. @option{-X} option can be used to convert
- old style (pre 0.3.0) config to xml one:
- @example
- $ rspamd -c ./rspamd.conf -X ./rspamd.xml
- @end example
- @noindent
- After this command new xml config would be dumped to rspamd.xml file.
-
- @section Managing rspamd with signals.
- First of all it is important to note that all user's signals should be sent to
- rspamd main process and not to its children (as for child processes these
- signals may have other meanings). To determine which process is main you can use
- two ways:
- @itemize @bullet
- @item by reading pidfile:
- @example
- $ cat pidfile
- @end example
- @noindent
- @item by getting process info:
- @example
- $ ps auxwww | grep rspamd
- nobody 28378 0.0 0.2 49744 9424 rspamd: main process (rspamd)
- nobody 64082 0.0 0.2 50784 9520 rspamd: worker process (rspamd)
- nobody 64083 0.0 0.3 51792 11036 rspamd: worker process (rspamd)
- nobody 64084 0.0 2.7 158288 114200 rspamd: controller process (rspamd)
- nobody 64085 0.0 1.8 116304 75228 rspamd: fuzzy storage (rspamd)
-
- $ ps auxwww | grep rspamd | grep main
- nobody 28378 0.0 0.2 49744 9424 rspamd: main process (rspamd)
- @end example
- @noindent
- @end itemize
-
- After getting pid of main process it is possible to manage rspamd with signals:
- @itemize @bullet
- @item SIGHUP - restart rspamd: reread config file, start new workers (as well as
- controller and other processes), stop accepting connections by old workers,
- reopen all log files. Note that old workers would be terminated after one minute
- that should allow to process all pending requests. All new requests to rspamd
- would be processed by newly started workers.
- @item SIGTERM - terminate rspamd system.
- @end itemize
-
- These signals may be used in start scripts as it is done in @code{FreeBSD} start
- script. Restarting of rspamd is done rather softly: no connections would be
- dropped and if new config is syntactically incorrect, old config would be used.
-
- @chapter Configuring of rspamd.
-
- @section Principles of work.
-
- We need to define several terms to explain configuration of rspamd. Rspamd
- operates with @strong{rules}, each rule defines some actions that should be done with
- message to obtain result. Result is called @strong{symbol} - a symbolic
- representation of rule. For example, if we have a rule to check DNS record for
- a url that is contained in message we may insert resulting symbol if this DNS record
- is found. Each symbol has several attributes:
- @itemize @bullet
- @item name - symbolic name of symbol (usually uppercase, e.g. MIME_HTML_ONLY)
- @item weight - numeric weight of this symbol (this means how important this rule is), may
- be negative
- @item options - list of symbolic options that defines additional information about
- processing this rule
- @end itemize
-
- Weights of symbols are called @strong{factors}. Also when symbol is inserted it
- is possible to define additional multiplier to factor. This can be used for
- rules that have dynamic weights, for example statistical rules (when probability
- is higher weight must be higher as well).
-
- All symbols and corresponding rules are combined in @strong{metrics}. Metric
- defines a group of symbols that are designed for common purposes. Each metric
- has maximum weight: if sum of all rules' results (symbols) is bigger than this
- limit then this message is considered as spam in this metric. The default metric
- is called @emph{default} and rules that have not explicitly specified metric
- would insert their results to this default metric.
-
- Let's illustrate how this technique works:
- @enumerate 1
- @item First of all when rspamd is running each module (lua, internal or external
- dynamic module) can register symbols in any defined metric. After this process
- rspamd has a cache of symbols for each metric. This cache can be saved to file
- for speeding up process of optimizing order of calling of symbols.
- @item Rspamd gets a message from client, parses it with mime parser and does
- other parsing jobs like extracting text parts, urls, and stripping html tags.
- @item For each metric rspamd looks into metric's cache and selects rules to
- check according to their order (this order depends on frequency of symbol, its
- weight and execution time).
- @item Rspamd calls rules of metric as long as the sum weight of symbols in metric is
- less than its limit.
- @item If sum weight of symbols is more than limit the processing of rules is
- stopped and message is counted as spam in this metric.
- @end enumerate
-
- After processing rules rspamd also does statistic check of message. Rspamd
- statistic module is presented as a set of @strong{classifiers}. Each classifier
- defines algorithm of statistic checks of messages. Also classifier definition
- contains definition of @strong{statistic files} (or @strong{statfiles} shortly).
- Each statfile consists of a number of patterns that are extracted from messages.
- These patterns are put into statfiles during learning process. A short example:
- you define classifier that contains two statfiles: @emph{ham} and @emph{spam}.
- Then you find 10000 messages that are spam and 10000 messages that are ham.
- Then you learn rspamd with these messages. After this process @emph{ham}
- statfile contains patterns from ham messages and @emph{spam} statfile contains
- patterns from spam messages. Then when you are checking message via this
- statfiles messages that are like spam would have more probability/weight in
- @emph{spam} statfile than in @emph{ham} statfile and classifier would insert
- symbol of @emph{spam} statfile and would calculate how this message is like
- patterns that are contained in @emph{spam} statfile. But rspamd is not limiting
- you to define one classifier or two statfiles. It is possible to define a number
- of classifiers and a number of statfiles inside a classifier. It can be useful
- for personal statistic or for specific spam patterns. Note that each classifier
- can insert only one symbol - a symbol of statfile with max weight/probability.
- Also note that statfiles check is always done after all rules. So statistic can
- @strong{correct} result of rules.
-
- Now some words about @strong{modules}. All rspamd rules are contained in
- modules. Modules can be internal (like SURBL, SPF, fuzzy check, email and
- others) and external written in @code{lua} language. In fact there is no difference
- in the way rules of these modules are called:
- @enumerate 1
- @item Rspamd loads config and loads specified modules.
- @item Rspamd calls init function for each module passing configurations
- arguments.
- @item Each module examines configuration arguments and register its rules (or
- not register depending on configuration) in rspamd metrics (or in a single
- metric).
- @item During metrics process rspamd calls registered callbacks for module's
- rules.
- @item These rules may insert results to metric.
- @end enumerate
-
- So there is no actual difference between lua and internal modules, each are just
- providing callbacks for processing messages. Also inside callback it is possible
- to change state of message's processing. For example this can be done when it is
- required to make DNS or other network request and to wait result. So modules can
- pause message's processing while waiting for some event. This is true for lua
- modules as well.
-
- @section Rspamd config file structure.
-
- Rspamd config file is placed in PREFIX/etc/rspamd.xml by default. You can
- specify other location by passing @option{-c} option to rspamd. Rspamd config file
- contains configuration parameters in XML format. XML was selected for rather
- simple manual editing config file and for simple automatic generation as well as
- for dynamic configuration. I've decided to move rules logic from XML file to
- keep it small and simple. So rules are defined in @code{lua} language and rspamd
- parameters are defined in xml file (rspamd.xml). Configuration rules are
- included by @strong{<lua>} tag that have @strong{src} attribute that defines
- relative path to lua file (relative to placement of rspamd.xml):
- @example
- <lua src="rspamd/lua/rspamd.lua">fake</lua>
- @end example
- @noindent
- Note that it is not currently possible to have empty tags. I hope this
- restriction would be fixed in future. Rspamd xml config consists of several
- sections:
- @itemize @bullet
- @item Main section - section where main config parameters are placed.
- @item Workers section - section where workers are described.
- @item Classifiers section - section where you define your classify logic
- @item Modules section - a set of sections that describes module's rules (in fact
- these rules should be in lua code)
- @item Metrics section - a section where you can set weights of symbols in metrics and metrics settings
- @item Logging section - a section that describes rspamd logging
- @item Views section - a section that defines rspamd views
- @end itemize
-
- So common structure of rspamd.xml can be described this way:
- @example
- <?xml version="1.0" encoding="utf-8"?>
- <rspamd>
- <!-- Main section directives -->
- ...
- <!-- Workers directives -->
- <worker>
- ...
- </worker>
- ...
- <!-- Classifiers directives -->
- <classifier>
- ...
- </classifier>
- ...
- <!-- Logging section -->
- <logging>
- <type>console</type>
- <level>info</level>
- ...
- </logging>
- <!-- Views section -->
- <view>
- ...
- </view>
- ...
- <!-- Modules settings -->
- <module name="regexp">
- <option name="test">test</option>
- ...
- </module>
- ...
- </rspamd>
- @end example
-
- Each of these sections would be described further in details.
-
- @section Rspamd configuration atoms.
-
- There are several primitive types of rspamd configuration parameters:
- @itemize @bullet
- @item String - common string that defines option.
- @item Number - integer or fractional number (e.g.: 10 or -1.5).
- @item Time - amount of time in milliseconds, may have suffixes:
- @itemize @bullet
- @item @emph{s} - for seconds (e.g. @emph{10s});
- @item @emph{m} - for minutes (e.g. @emph{10m});
- @item @emph{h} - for hours (e.g. @emph{10h});
- @item @emph{d} - for days (e.g. @emph{10d});
- @end itemize
- @item Size - like number, a numeric representation of size, but may have a suffix:
- @itemize @bullet
- @item @emph{k} - 'kilo' - number * 1024 (e.g. @emph{10k});
- @item @emph{m} - 'mega' - number * 1024 * 1024 (e.g. @emph{10m});
- @item @emph{g} - 'giga' - number * 1024 * 1024 * 1024 (e.g. @emph{1g});
- @end itemize
- @noindent
- Size atoms are used for memory limits for example.
- @item Lists - path to dynamic rspamd list (e.g. @emph{http://some.host/some/path}).
- @end itemize
-
- While practically all atoms are rather trivial to understand rspamd lists may
- cause some confusion. Lists are widely used in rspamd for getting data that can
- be often changed for example white or black lists, lists of ip addresses, lists
- of domains. So for such purposes it is possible to use files that can be obtained
- either from local filesystem (e.g. @code{file:///var/run/rspamd/whitelist}) or
- by HTTP (e.g. @code{http://some.host/some/path/list.txt}). Rspamd constantly
- looks for changes in these files; if using HTTP it also sets
- @emph{If-Modified-Since} header and checks for @emph{Not modified} reply. So it
- causes no overhead when lists are not modified and may allow to store huge lists
- and to distribute them over HTTP. Monitoring of lists is done with some random
- delay (jitter), so if you have many rspamd servers in cluster that are
- monitoring a single list they would come to check or download it in slightly different
- time. The two most common list formats are @emph{IP list} and @emph{domains
- list}. IP list consists of ip addresses in dot notation (e.g.
- @code{192.168.1.1}) or ip/network pairs in CIDR notation (e.g.
- @code{172.16.0.0/16}). Items in lists are separated by newline symbol. Lines
- that begin with @emph{#} symbol are considered as comments and are ignored while
- parsing. Domains list is very like ip list with difference that it contains
- domain names.
-
- @section Main rspamd configuration section.
-
- Main rspamd configuration section contains several definitions that determine
- main parameters of rspamd, for example path to pidfile, temporary directory, lua
- includes, several limits etc. Here is the list of these directives explained:
-
- @multitable @columnfractions .2 .8
- @headitem Tag @tab Meaning
-
- @item @var{<tempdir>}
- @tab Defines temporary directory for rspamd. Default is to use @env{TEMP}
- environment variable or @code{/tmp}.
-
- @item @var{<pidfile>}
- @tab Path to rspamd pidfile. Here would be stored a pid of main process.
- Pidfile is used to manage rspamd from start scripts.
-
- @item @var{<statfile_pool_size>}
- @tab Limit of statfile pool size: a total number of bytes that can be used for
- mapping statistic files. Rspamd is using LRU system and would unmap the most
- unused statfile when this limit would be reached. The common sense is to set
- this variable equal to total size of all statfiles, but it can be less than this
- in case of dynamic statfiles (for per-user statistic).
-
- @item @var{<filters>}
- @tab List of enabled internal filters. Items in this list can be separated by
- spaces, semicolons or commas. If internal filter is not specified in this line
- it would not be loaded or enabled.
-
- @item @var{<raw_mode>}
- @tab Boolean flag that specifies whether rspamd should try to convert all
- messages to UTF8 or not. If @var{raw_mode} is enabled all messages are
- processed @emph{as is} and are not converted. Raw mode is faster than utf mode
- but it may confuse statistics and regular expressions.
-
- @item @var{<lua>}
- @tab Defines path to lua file that should be loaded for configuration. Path to
- this file is defined in @strong{src} attribute. Text inside tag is required but
- is not parsed (this is stupid limitation of parser's design).
- @end multitable
-
- @section Rspamd logging configuration.
-
- Rspamd has a number of logging variants. First of all there are three types of
- logs that are supported by rspamd: console logging (just output log messages to
- console), file logging (output log messages to file) and logging via syslog.
- Also it is possible to filter logging to specific level:
- @itemize @bullet
- @item error - log only critical errors
- @item warning - log errors and warnings
- @item info - log all non-debug messages
- @item debug - log all including debug messages (huge amount of logging)
- @end itemize
- Also it is possible to turn on debug messages for specific ip addresses. This
- ability is useful for testing.
-
- For each logging type there are special mandatory parameters: log facility for
- syslog (read @emph{syslog (3)} manual page for details about facilities), log
- file for file logging. Also file logging may be buffered for speeding up. For
- reducing logging noise rspamd detects sequential identical log messages and
- replaces them with total number of repeats:
- @example
- #81123(fuzzy): May 11 19:41:54 rspamd file_log_function: Last message repeated 155 times
- #81123(fuzzy): May 11 19:41:54 rspamd process_write_command: fuzzy hash was successfully added
- @end example
-
- Here is summary of logging parameters:
-
-
- @multitable @columnfractions .2 .8
- @headitem Tag @tab Meaning
- @item @var{<type>}
- @tab Defines logging type (file, console or syslog). For each type mandatory
- attribute must be present:
- @itemize @bullet
- @item @emph{filename} - path to log file for file logging type;
- @item @emph{facility} - syslog logging facility.
- @end itemize
-
- @item @var{<level>}
- @tab Defines logging level (error, warning, info or debug).
-
- @item @var{<log_buffer>}
- @tab For file and console logging defines buffer in bytes (kilo, mega or giga
- bytes) that would be used for logging output.
-
- @item @var{<log_urls>}
- @tab Flag that defines whether all urls in message would be logged. Useful for
- testing.
-
- @item @var{<debug_ip>}
- @tab List that contains ip addresses for which debugging would be turned on. For
- more information about ip lists look at config atoms section.
- @end multitable
-
- @section Metrics configuration.
-
- Setting of rspamd metrics is the main way to change rules' weights. You can set
- up weights for all rules: for those that have static weights (for example simple
- regexp rules) and for those that have dynamic weights (for example statistic
- rules). In all cases the base weight of rule is multiplied by metric's weight value.
- For static rules base weight is usually 1.0. So we have:
- @itemize @bullet
- @item @math{w_{symbol} = w_{static} * factor} - for static rules
- @item @math{w_{symbol} = w_{dynamic} * factor} - for dynamic rules
- @end itemize
- Also there is an ability to add so called "grow factor" - additional multiplier
- that would be used when we have more than one symbol in metric. So for each
- added symbol this factor would increment its power. This can be written as:
- @math{w_{total} = w_1 * gf ^ 0 + w_2 * gf ^ 1 + ... + w_n * gf ^ {n - 1}}
- Grow multiplier is used to increment weight of rules when message got many
- symbols (likely spammy). Note that only rules with positive weights would
- increase grow factor, those with negative weights would just be added. Also note
- that grow factor can be less than 1 but it is uncommon use (in this case we
- would have weight lowering when we have many symbols for this message). Metrics
- can be set up with config section(s) @emph{metric}:
- @example
- <metric>
- <name>test_metric</name>
- <action>reject</action>
- <symbol weight="0.1">MIME_HTML_ONLY</symbol>
- <grow_factor>1.1</grow_factor>
- </metric>
- @end example
-
- Note that you basically need to add symbols to metric when you add additional rules.
- The decision of weight of newly added rule basically depends on its importance. For
- example you are absolutely sure that some rule would add a symbol on only spam
- messages, so you can increase weight of such rule so it would filter such spam.
- But if you increase weight of rules you should be more or less sure that it
- would not increase false positive errors rate to unacceptable level (false
- positive errors are errors when good mail is treated as spam). Rspamd comes with
- a set of default rules and default weights of those rules are placed in
- rspamd.xml.sample. In most cases it is reasonable to change them for your mail
- system, for example increase weights of some rules or decrease for others. Also
- note that default grow factor is 1.0 that means that weights of rules do not
- depend on count of added symbols. For some situations it is useful to set grow
- factor to value more than 1.0. Also by modifying weights it is possible to
- manage static multiplier for dynamic rules.
-
- @section Workers configuration.
-
- Workers are rspamd processes that do specific jobs. Currently 4 types of
- workers are supported:
- @enumerate 1
- @item Normal worker - a typical worker that processes messages.
- @item Controller worker - a worker that manages rspamd, gets statistics and does
- learning tasks.
- @item Fuzzy storage worker - a worker that contains a collection of fuzzy
- hashes.
- @item LMTP worker - experimental worker that acts as LMTP server.
- @end enumerate
-
- These types of workers have some common parameters:
- @multitable @columnfractions .2 .8
- @headitem Parameter @tab Mean
- @item @emph{<type>}
- @tab Type of worker (normal, controller, lmtp or fuzzy)
- @item @emph{<bind_socket>}
- @tab Socket address to bind this worker to. Inet and unix sockets are supported:
- @example
- <bind_socket>localhost:11333</bind_socket>
- <bind_socket>/var/run/rspamd.sock</bind_socket>
- @end example
- @noindent
- Also for inet sockets you may specify @code{*} as address to bind to all
- available inet interfaces:
- @example
- <bind_socket>*:11333</bind_socket>
- @end example
- @noindent
- @item @emph{<count>}
- @tab Number of worker processes of this type. By default this number is
- equivalent to the number of logical processors in the system.
- @item @emph{<maxfiles>}
- @tab Maximum number of file descriptors available to this worker process.
- @item @emph{<maxcore>}
- @tab Maximum size of core file that would be dumped in case of critical errors
- (in mega/kilo/giga bytes).
- @end multitable
-
- Also each of workers types can have specific parameters:
- @itemize @bullet
- @item Normal worker:
- @itemize @bullet
- @item @var{<custom_filters>} - path to dynamically loaded plugins that would do real
- check of incoming messages. These modules are described further.
- @item @var{<mime>} - if this parameter is "no" then this worker assumes that incoming
- messages are in non-mime format (e.g. forum's messages) and standard mime
- headers are added to them.
- @end itemize
- @item Controller worker:
- @itemize @bullet
- @item @var{<password>} - a password that would be used to access controller's
- privileged commands.
- @end itemize
- @item Fuzzy worker:
- @itemize @bullet
- @item @var{<hashfile>} - a path to file where fuzzy hashes would be permanently stored.
- @item @var{<use_judy>} - if libJudy is present in system use it for faster storage.
- @item @var{<frequent_score>} - if judy is not turned on use this score to place hashes
- with score that is more than this value to special faster list (this is designed
- to increase lookup speed for frequent hashes).
- @item @var{<expire>} - time to expire of fuzzy hashes after their placement in storage.
- @end itemize
- @end itemize
-
- These parameters can be set inside worker's definition:
- @example
- <worker>
- <type>fuzzy</type>
- <bind_socket>*:11335</bind_socket>
- <count>1</count>
- <maxfiles>2048</maxfiles>
- <maxcore>0</maxcore>
- <!-- Other params -->
- <param name="use_judy">yes</param>
- <param name="hashfile">/spool/rspamd/fuzzy.db</param>
- <param name="expire">10d</param>
- </worker>
- @end example
- @noindent
-
- The purpose of each worker's type would be described later. The main parameters
- that could be defined are bind sockets for workers, their count, password for
- controller's commands and parameters for fuzzy storage. Default config provides
- reasonable values of these parameters (except password of course), so for basic
- configuration you may just replace controller's password with a more secure one.
-
- @section Classifiers configuration.
-
- @subsection Common classifiers options.
-
- Each classifier has mandatory option @var{type} that defines internal algorithm
- that is used for classifying. Currently only @code{winnow} is supported. You can
- read theoretical description of algorithm used here:
- @url{http://www.siefkes.net/papers/winnow-spam.pdf}
-
- The common classifier configuration consists of base classifier parameters and
- definitions of two (or more than two) statfiles. During the classify process rspamd
- checks each statfile in the classifier and selects the one that has more
- probability/weight than the others. If all statfiles have zero weight this classifier
- does not add any symbols. Among common classifiers options are:
- @multitable @columnfractions .2 .8
- @headitem Tag @tab Mean
- @item @var{<tokenizer>}
- @tab Tokenizer to extract tokens from messages. Currently only @emph{osb}
- tokenizer is supported
- @item @var{<metric>}
- @tab Metric to which this classifier would insert symbol.
- @end multitable
-
- Also option @var{min_tokens} is supported to specify minimum number of tokens to
- work with (this is useful to avoid classifying of short messages as statistic
- is practically useless for small amount of tokens). Here is example of base
- classifier config:
- @example
- <classifier type="winnow">
- <tokenizer>osb-text</tokenizer>
- <metric>default</metric>
- <option name="min_tokens">20</option>
- <statfile>
- ...
- </statfile>
- </classifier>
- @end example
-
- @subsection Statfiles options.
-
- The most common statfile options are @var{symbol} and @var{size}. The first one defines
- which symbol would be inserted if this statfile would have maximal weight inside
- classifier and size defines statfile size on disk and in memory. Note that
- statfiles are mapped directly to memory and you should practically note
- parameter @var{statfile_pool_size} of main section which defines maximum amount
- of memory for mapping statistic files. Also note that statistic files are
- of constant size: if you define a 100 megabytes statfile it would occupy 100
- megabytes of disc space and 100 megabytes of memory when it is used (mapped).
- Each statfile is indexed by tokens and contains so called "token chains". This
- mechanism would be described further but note that each statfile has parameter
- "free tokens" that defines how much space is available for new tokens. If
- statfile has no free space the most unused tokens would be removed from
- statfile.
-
- Here is list of common options of statfiles:
- @multitable @columnfractions .2 .8
- @headitem Tag @tab Mean
- @item @var{<symbol>}
- @tab Defines symbol to insert for this statfile.
- @item @var{<size>}
- @tab Size of this statfile in bytes (kilo/mega/giga bytes).
- @item @var{<path>}
- @tab Filesystem path to statistic file.
- @item @var{<normalizer>}
- @tab Defines weight normalization structure. Can be lua function name or
- internal normalizer. Internal normalizer is defined in format:
- "internal:<max_weight>" where max_weight is fractional number that limits the
- maximum weight of this statfile's symbol (this is so called dynamic weight).
- @item @var{<binlog>}
- @tab Defines binlog affinity: master or slave. This option is used for statfiles
- binary sync that would be described further.
- @item @var{<binlog_master>}
- @tab Defines address of binlog master for this statfile.
- @item @var{<binlog_rotate>}
- @tab Defines rotate time for binlog.
- @end multitable
-
- Internal normalization of statfile weight works in this way:
- @itemize @bullet
- @item @math{R_{score} = 1} when @math{W_{statfile} < 1}
- @item @math{R_{score} = W_{statfile} ^ 2} when @math{1 < W_{statfile} < max / 2}
- @item @math{R_{score} = W_{statfile}} when @math{max / 2 < W_{statfile} < max}
- @item @math{R_{score} = max} when @math{W_{statfile} > max}
- @end itemize
-
- The final result weight would be: @math{weight = R_{score} * W_{weight}}.
- Here is sample classifier configuration with two statfiles that can be used for
- spam/ham classifying:
-
- @example
- <symbol weight="-1.00">WINNOW_HAM</symbol>
- <symbol weight="1.00">WINNOW_SPAM</symbol>
- ...
-
- <!-- Classifiers section -->
- <classifier type="winnow">
- <tokenizer>osb-text</tokenizer>
- <metric>default</metric>
- <option name="min_tokens">20</option>
- <statfile>
- <symbol>WINNOW_HAM</symbol>
- <size>100M</size>
- <path>/var/run/rspamd/data.ham</path>
- <normalizer>internal:3</normalizer>
- </statfile>
- <statfile>
- <symbol>WINNOW_SPAM</symbol>
- <size>100M</size>
- <path>/var/run/rspamd/data.spam</path>
- <normalizer>internal:3</normalizer>
- </statfile>
- </classifier>
- <!-- End of classifiers section -->
- @end example
- @noindent
- In this sample we define classifier that contains two statfiles:
- @emph{WINNOW_SPAM} and @emph{WINNOW_HAM}. Each statfile has 100 megabytes size
- (so they would occupy 200Mb while classifying). Also each statfile has maximum
- weight of 3 so with such weights (-1 for WINNOW_HAM and 1 for WINNOW_SPAM) the
- result weight of symbols would be 0..3 for @emph{WINNOW_SPAM} and 0..-3 for
- @emph{WINNOW_HAM}.
-
- @section Composites config.
-
- Composite symbols are rules that allow combining of several other symbols by
- using logical expressions. For example you can add composite symbol COMP1 that
- would be added if SYMBOL1 and SYMBOL2 are presented after message checks. When
- composite symbol is added the symbols that are in that composite are removed. So
- if message has symbols SYMBOL1 and SYMBOL2 the composite symbol COMP1 would be
- inserted in place of these two symbols. Note that if composite symbol is not
- inserted the symbols that are inside it are not touched. So SYMBOL1 and SYMBOL2
- can be presented separately, but when COMP1 is added SYMBOL1 and SYMBOL2 would
- be removed. Composite symbols can be defined in main configuration section. Here
- is example of composite rules definition:
-
- @example
- <composite name="ONCE_RECEIVED_PBL">ONCE_RECEIVED &amp; RECEIVED_PBL</composite>
- <composite name="SPF_TRUSTED">R_SPF_TRUSTED &amp; R_SPF_ALLOW</composite>
- <composite name="TRUSTED_FROM">R_TRUSTED_FROM &amp; R_SPF_ALLOW</composite>
- @end example
-
- Note that you need to insert xml entity (@emph{&amp;}) instead of '&' symbol.
-
- @section Modules config.
-
- @subsection Lua modules loading.
- For loading custom lua modules you should use @emph{<modules>} section:
- @example
- <modules>
- <module>/usr/local/etc/rspamd/plugins/lua</module>
- </modules>
- @end example
- @noindent
- Each @emph{<module>} directive defines path to lua modules. If this is a
- directory so all @code{*.lua} files inside that directory would be loaded. If
- this is a file it would be loaded directly.
-
- @subsection Modules configuration.
- Each module can have its own config section (this is true not only for internal
- module but also for lua modules). Such section is called @emph{<module>} with
- mandatory attribute @emph{"name"}. Each module can be configured by
- @emph{<option>} directives. These directives must also have @emph{"name"}
- attribute. So module configuration is done in @code{param = value} style:
- @example
- <module name="fuzzy_check">
- <option name="servers">localhost:11335</option>
- <option name="symbol">R_FUZZY</option>
- <option name="min_length">300</option>
- <option name="max_score">10</option>
- </module>
- @end example
- @noindent
- The common parameters are:
- @itemize @bullet
- @item symbol - symbol that this module should insert.
- @end itemize
- But each module can have its own unique parameters. So it would be discussed
- further in detailed modules description. Also note that for internal modules you
- should edit @emph{<filters>} parameter in main section: this parameter defines
- which internal modules would be turned on in this configuration.
-
- @section Views config.
- It is possible to make different rules for different
- networks/senders/recipients. For this purposes you can use rspamd views: maps of
- conditions (ip, sender, recipients) and actions, associated with them. For
- example you can turn rspamd off for specific conditions by using
- @emph{skip_check} action or check only specific rules. Views are defined inside
- @emph{<view>} xml section. Here is list of available tags inside section:
- @multitable @columnfractions .2 .8
- @headitem Tag @tab Mean
- @item @var{<skip_check>}
- @tab Boolean flag (yes or no) that specifies whether rspamd checks should be
- turned off for this ip
- @item @var{<symbols>}
- @tab Defines comma-separated list of symbols that should be checked for this
- view
- @item @var{<ip>}
- @tab Map argument that defines path to list of ip addresses (may be with CIDR
- masks) to which this view should be applied.
- @item @var{<client_ip>}
- @tab Map argument that defines path to list of ip addresses of rspamd clients
- to which this view should be applied. Note that this is ip of rspamd client not
- ip of message's sender.
- @item @var{<from>}
- @tab Map argument that defines path to list of senders to which this view should
- be applied.
- @end multitable
- Here is an example view definition
- @example
- <view>
- <skip_check>yes</skip_check>
- <ip>file:///usr/local/etc/rspamd/whitelist</ip>
- </view>
- @end example
-
- @chapter Rspamd clients interaction.
-
- @section Introduction.
- After you have basic config file you may test rspamd functionality by using
- either a telnet-like utility or the @emph{rspamc} client. For testing newly installed
- config it is possible to run config file test:
- @example
- $ rspamd -t
- syntax OK
- @end example
-
- Rspamc utility is written in @code{perl} language and uses perl modules that are
- shipped with rspamd: @emph{Mail::Rspamd::Client} for client's protocol and
- @emph{Mail::Rspamd::Config} for reading and writing configuration. The
- documentation for these modules can be found by commands:
- @example
- $ perldoc Mail::Rspamd::Client
- $ perldoc Mail::Rspamd::Config
- @end example
-
- So other way to access rspamd is to use perl client API:
- @example
- use Mail::Rspamd::Client;
- my $config = @{
- hosts => ['localhost:11333'],
- @};
-
- my $client = new Mail::Rspamd::Client(%config);
-
- if (! $client->ping()) @{
- die "Cannot ping rspamd: $client->@{error@}";
- @}
-
- my $result = $client->check($testmsg);
-
- if ($result->@{'default'@}->@{isspam@} eq 'True') @{
- # do something with spam message here
- @}
- @end example
-
- @section Rspamc protocol.
- Rspamc protocol is an extension over traditional spamc protocol that is used by
- spamassassin. This protocol looks like traditional HTTP session: first line is
- method with version, headers can be passed by next lines and the message itself
- is waited after empty line:
- @example
- <REQUEST>
- SYMBOLS RSPAMC/1.1
- Content-Length: 2200
-
- <message octets>
-
- <REPLY>
- RSPAMD/1.1 0 OK
- Metric: default; True; 10.40 / 10.00 / 0.00
- Symbol: R_UNDISC_RCPT
- Symbol: ONCE_RECEIVED
- Symbol: R_MISSING_CHARSET
- Urls:
- @end example
- @noindent
- The format of method line can be presented as:
- @example
- <COMMAND> RSPAMC/<version>
- @end example
- @noindent
- Version can be 1.0 or 1.1. The main difference is that in 1.1 metrics output also
- has @emph{reject score} - hard limit of score for metric. This would be
- discussed while describing user's options. Commands are:
- @multitable @columnfractions .2 .8
- @headitem Command @tab Mean
- @item CHECK
- @tab Check a message and output results for each metric. But do not output
- symbols.
- @item SYMBOLS
- @tab Same as @emph{CHECK} but output symbols.
- @item PROCESS
- @tab Same as @emph{SYMBOLS} but output also original message with inserted
- X-Spam headers.
- @item PING
- @tab Do not do any processing, just check rspamd state:
- @example
- $ telnet localhost 11333
- Trying 127.0.0.1...
- Connected to localhost.
- Escape character is '^]'.
- PING RSPAMC/1.1
-
- RSPAMD/1.1 0 PONG
- Connection closed by foreign host.
- @end example
- @noindent
- @end multitable
-
- After command there should be one mandatory header: @strong{Content-Length} that
- defines message's length in bytes and optional headers:
- @multitable @columnfractions .2 .8
- @headitem Header @tab Mean
- @item @var{Deliver-To:}
- @tab Defines actual delivery recipient of message. Can be used for personalized
- statistic and for user specific options.
- @item @var{IP:}
- @tab Defines IP from which this message is received.
- @item @var{Helo:}
- @tab Defines SMTP helo.
- @item @var{From:}
- @tab Defines SMTP mail from command data.
- @item @var{Queue-Id:}
- @tab Defines SMTP queue id for message (can be used instead of message id in
- logging).
- @item @var{Rcpt:}
- @tab Defines SMTP recipient (it may be several @emph{Rcpt:} headers).
- @item @var{Pass:}
- @tab If this header has @emph{"all"} value, all filters would be checked for
- this message.
- @item @var{Subject:}
- @tab Defines subject of message (is used for non-mime messages).
- @item @var{User:}
- @tab Defines SMTP user (this is currently unused in rspamd however).
- @end multitable
- So rspamc protocol allows passing a lot of data from MTA to rspamd. This is used to
- increase speed of processing and for building filters (like SPF filter). Also
- note that rspamd supports spamassassin spamc protocol and you can even pass
- rspamc headers in spamc mode, but reply of rspamd in spamc mode would be much
- shorter: it would only use "default" metric and won't show additional options
- for symbols. Rspamc reply looks like this:
- @example
- RSPAMD/1.1 0 OK
- Metric: default; True; 10.40 / 10.00 / 0.00
- Symbol: R_UNDISC_RCPT
- Symbol: ONCE_RECEIVED
- Symbol: R_MISSING_CHARSET
- Urls:
- @end example
- @noindent
- First line is method reply: @code{<PROTOCOL>/<VERSION> <ERROR_CODE> <ERROR_REPLY>}.
- Error code is 0 when no error occurred. After first reply line there are metrics
- output. For @emph{SYMBOLS} and @emph{PROCESS} commands there are symbols lines
- after each metric. And for @emph{PROCESS} command there would be original
- message after all metrics results. Metric result line looks like this:
- @example
- Metric: <name>; <result>; <score> / <required_score> / <reject_score>
- @end example
- @noindent
- For 1.0 version of rspamc protocol @emph{reject_score} parameter is not printed.
- Symbol line looks like this:
- @example
- Symbol: <Name>[; param1[, param2...]]
- @end example
- @noindent
- Some symbols can have parameters attached. It is useful for example for RBL
- checks (you can insert additional data after symbol name), for statistic and
- fuzzy checks. Also rspamd inserts @emph{Urls} line in which all urls that are
- contained in message are printed in comma-separated list.
- Note that this protocol is used for normal workers. Controller, fuzzy storage
- and lmtp/smtp workers are using other protocols. For example controller's
- protocol is oriented on interactive sessions: you can pass many commands to
- controller before disconnecting. Fuzzy storage is using UDP for making
- interaction with storage faster. LMTP/SMTP workers are using lmtp and smtp
- protocols. All of these protocols would be described in further chapters about
- rspamd workers.
-
- @section Controller protocol.
-
- Rspamd controller can also be accessed by telnet, by rspamc client or by using
- perl module Mail::Rspamd::Client. Controller protocol accepts commands and it is
- possible to send several commands during a single session. Here is an example
- telnet session:
- @example
- >telnet localhost 11334
- Trying 127.0.0.1...
- Connected to localhost.
- Escape character is '^]'.
- Rspamd version 0.3.0 is running on spam1.rambler.ru
- stat
- Messages scanned: 1526901
- Messages treated as spam: 238171, 15.60%
- Messages treated as ham: 1288730, 84.40%
- Messages learned: 0
- Connections count: 1529758
- Control connections count: 15
- Pools allocated: 3059589
- Pools freed: 3056134
- Bytes allocated: 98545852799
- Memory chunks allocated: 8745374
- Shared chunks allocated: 7
- Chunks freed: 8737507
- Oversized chunks: 768784
- Fuzzy hashes stored: 0
- Fuzzy hashes expired: 0
- Statfile: WINNOW_SPAM (version 186); length: 100.0 MB; free blocks: 748504; total blocks: 6553581; free: 11.42%
- Statfile: WINNOW_HAM (version 186); length: 100.0 MB; free blocks: 748504; total blocks: 6553581; free: 11.42%
- END
- @end example
- @noindent
-
- So you can see that reply from controller is ended with line that contains word
- @strong{END}. It is also possible to get summary help for controller's commands:
- @example
- help
- Rspamd CLI commands (* - privilleged command):
- help - this help message
- (*) learn <statfile> <size> [-r recipient] [-m multiplier] [-f from] [-n] - learn message to specified statfile
- quit - quit CLI session
- (*) reload - reload rspamd
- (*) shutdown - shutdown rspamd
- stat - show different rspamd stat
- counters - show rspamd counters
- uptime - rspamd uptime
- END
- @end example
- @noindent
-
- Note that some commands are privileged ones - you are required to enter a
- password for them:
- @example
- >telnet localhost 11334
- Trying 127.0.0.1...
- Connected to localhost.
- Escape character is '^]'.
- Rspamd version 0.3.0 is running on spam1.rambler.ru
- reload
- not authorized
- END
-
- password q1
- password accepted
- END
-
- reload
- reload request sent
- END
- Connection closed by foreign host.
- @end example
- @noindent
-
- This password is configured in rspamd.xml in worker section where you are
- describing controller:
- @example
- <worker>
- <type>controller</type>
- ...
- <!-- Other params -->
- <param name="password">q1</param>
- </worker>
- @end example
-
- In many cases it is easier to use rspamc to access controller. Here is
- example of learning statfiles using rspamc CLI:
- @example
- % rspamc -h localhost:11334 -P q1 -s WINNOW_HAM learn < /tmp/exim.eml
- Results for host localhost:11334:
-
- Learn succeed. Sum weight: 1.51
-
- % rspamc -h localhost:11334 -P q1 -s WINNOW_SPAM learn < /tmp/bad.eml
- Results for host localhost:11334:
-
- Learn succeed. Sum weight: 1.51
- @end example
-
- Note that rspamc handles password issues and other things like timeouts and
- error handling inside and makes these tasks rather easy.
-
- @section More about rspamc client.
-
- Rspamc is a small and simple client that simplifies common tasks of rspamd
- management. Rspamc is written in perl language and requires some modules for
- its work:
- @itemize @bullet
- @item Mail::Rspamd::Client - a module that contains common function for
- accessing rspamd, shipped with rspamd and installed automatically
- @item Term::Cap - a module that allows basic interaction with terminal, can be
- obtained via @url{http://www.cpan.org, cpan}.
- @end itemize
- Rspamc accepts several command line options:
-
- @example
- % rspamc --help
- Usage: rspamc.pl [-h host] [-H hosts_list] [-P password] [-c conf_file] [-s statfile] [-d user@@domain] [command] [path]
- -h host to connect (in format host:port) or unix socket path
- -H path to file that contains list of hosts
- -P define control password
- -c config file to parse
- -s statfile to use for learn commands
-
- Additional options:
- -d define deliver-to header
- -w define weight for fuzzy operations
- -S define search string for IMAP operations
- -i emulate that message was send from specified IP
- -p pass message throught all filters
-
- Notes:
- imap format: imap:user:<username>:password:[<password>]:host:<hostname>:mbox:<mboxname>
- Password may be omitted and then it would be asked in terminal
- imaps requires IO::Socket::SSL
-
- IMAP search strings samples:
- ALL - All messages in the mailbox;
- FROM <string> - Messages that contain the specified string in the envelope structure's FROM field;
- HEADER <field-name> <string> - Messages that have a header with the specified field-name and that
- contains the specified string in the text of the header (what comes after the colon);
- NEW - Messages that have the Recent flag set but not the Seen flag.
- This is functionally equivalent to "(RECENT UNSEEN)".
- OLD - Messages that do not have the Recent flag set.
- SEEN - Messages that have the Seen flag set.
- SENTBEFORE <date> - Messages whose [RFC-2822] Date: header (disregarding time and timezone)
- is earlier than the specified date.
- TO <string> - Messages that contain the specified string in the envelope structure's TO field.
- TEXT <string> - Messages that contain the specified string in the header or body of the message.
- OR <search-key1> <search-key2> - Messages that match either search key (same for AND and NOT operations).
-
- Version: 0.3.0
- @end example
- @noindent
-
- After options you should specify command to execute, for example:
- @example
- % rspamc symbols < /tmp/exim.eml
- @end example
- @noindent
- After command name you may specify objects to apply to: files, directories or
- even imap folders:
- @itemize @bullet
- @item A single file:
- @example
- % rspamc symbols /tmp/exim.eml
- @end example
- @noindent
- @item A list of files:
- @example
- % rspamc symbols /tmp/*.eml
- @end example
- @noindent
- @item Directories:
- @example
- % rspamc symbols /tmp/*.eml /tmp/to_scan/
- @end example
- @noindent
- @item IMAP folder:
- @example
- % rspamc symbols imap:user:username:password::host:localhost:mbox:INBOX
- Enter IMAP password:
- @end example
- @noindent
- Note that it is possible to specify empty password and be prompted for a
- password during execution (you also need perl module Term::ReadKey for turning
- on noecho input of password).
- @end itemize
- For fetching imap messages you may also use search string by specifying -S
- option. Some examples of IMAP search strings can be found in a help message. For
- more complex things you may read rfc3501 about imap4 search strings. This may be
- found for example here: @url{http://www.faqs.org/rfcs/rfc3501.html}. IMAP access
- may be useful for setting up automatic learning scripts. Also it is possible to
- use SSL version of imap by specifying @strong{imaps} instead of @strong{imap} as
- first component. Note that for SSL access you need @emph{IO::Socket::SSL} perl
- module.
-
- @chapter Statistics and hashes storage.
-
- @section Introduction.
- First of all we need to strictly define purposes of hashes and statistic. Hashes
- are used to find very close messages (for example messages where there are only
- several words changed), while statistic can find @strong{probability} of
- belonging message to specified class of messages. So when you learn rspamd with
- message's hash you just add this hash to storage and when you learn rspamd
- statistic you add tokens from message to specified class. So statistic is
- probabilistic method to filter message, while fuzzy hashes can detect specific
- patterns in messages and filter them.
-
- @section Classifiers and statistic.
- @subsection Tokenization.
- Now rspamd supports OSB-Winnow statistic algorithm. Let's describe it in
- details. First of all message is separated into a set of tokens. The algorithm
- of extracting tokens is rather simple now:
- @enumerate 1
- @item Extract graph symbols till first non-graph symbol (whitespace, punctuation
- etc), the group of graph symbols forms a token, non-graphs are separators.
- @item Fill an array with token till @strong{window size} is reached (currently
- this size is 5 tokens).
- @item Get pairs of tokens from array and extract their hashes:
- @itemize @bullet
- @item * . . . * -> token1 (h1, h5);
- @item . * . . * -> token2 (h2, h5);
- @item . . * . * -> token3 (h3, h5);
- @item . . . * * -> token4 (h4, h5);
- @end itemize
- @noindent
- @item Insert these tokens to statfile (indexed by first hash).
- @item Shift window on next word.
- @end enumerate
- So after tokenizing process we would have tokens each of which contains 2 hashes of 2
- words from message. This mechanism allows counting not only the words themselves but
- also their combinations in a message, thus providing more accurate statistics.
-
- @subsection Classifying.
- For classifying process @strong{winnow} algorithm is used. In this statistic
- algorithm we operate not with probabilities but with weights. Each token has
- its own weight and when we learn some statfile with tokens rspamd does several
- things:
- @enumerate 1
- @item Try to find token inside statfile.
- @item If a token found multiply its weight by so called @strong{promotion
- factor} (that is now 1.23).
- @item If token not found insert it into statfile with weight 1.
- @end enumerate
-
- If it is necessary to lower a token's weight, its weight is multiplied by
- @strong{demotion factor} (currently 0.83). Classify process is even simpler:
- @enumerate 1
- @item Extract tokens from a message.
- @item For each statfile check weight of obtained tokens and store summary
- weight.
- @item Compare sums for each statfile and select statfile with the largest sum.
- @item Do weight normalization and insert symbol of selected statfile.
- @end enumerate
-
- @subsection Statfiles synchronization.
- Rspamd allows to make master/slave statfiles synchronization. This is done by
- writing changes to statfiles to special @emph{binary log}. Binary log is a file
- on filesystem named like statfile but with @emph{.binlog} suffix. Binary log
- consist of two level indexes and binary changes to each statfile. So after each
- learning process the version of affected statfiles is increased by 1 and a
- record is written to binary log. Binary logs have fixed size limit and may have
- time limit (rotate time). The process of synchronization may be described as:
- @enumerate 1
- @item Slave rspamd periodically asks master for version of statfiles monitored.
- @item If master has version that is larger than slave's one the synchronization
- process starts.
- @item During synchronization process master looks at version reported by client
- in binary log.
- @item If version is found all records that are @strong{after} client's version
- are sent to client.
- @item Client accepts changes and apply binary patches one-by-one incrementing
- statfile's version.
- @item If version that client reports is not found in binary log the complete
- statfile is sent to client (slow way, but practically that would take place only
- once for fresh slaves).
- @end enumerate
-
- Here is example configuration for master statfile:
- @example
- <statfile>
- <symbol>WINNOW_HAM</symbol>
- <size>100M</size>
- <path>/spool/rspamd/data.ham</path>
- <normalizer>internal:3</normalizer>
- <binlog>master</binlog>
- <binlog_rotate>1d</binlog_rotate>
- </statfile>
- @end example
- @noindent
- Here we define binlog affinity (master) that automatically creates binlog file
- @file{/spool/rspamd/data.ham.binlog} and set up time limit for it (1 day).
- For slaves you should first of all set up controller worker to accept network
- connections (statfile synchronization is done via controller workers). The
- second task is to define affinity for slave and master's address:
- @example
- <statfile>
- <symbol>WINNOW_HAM</symbol>
- <size>100M</size>
- <path>/spool/rspamd/data.ham</path>
- <normalizer>internal:3</normalizer>
- <binlog>slave</binlog>
- <binlog_master>spam10:11334</binlog_master>
- </statfile>
- @end example
-
- @subsection Conclusion.
- Statfiles synchronization allows to set up rspamd cluster that uses the common
- statfiles and easily learn the whole cluster without unnecessary overhead.
-
- @section Hashes and hash storage.
- @subsection Fuzzy hashes.
- Hashes that are used in rspamd for messages are not cryptographic. Instead of
- them fuzzy hashes are used. Fuzzy hashing is a technique that allows to obtain
- similar hashes for similar messages (for cryptographic hashes you usually get very
- different hashes even if input messages are very similar but not identical). The
- main principle of fuzzy hashing is to break up text parts of message into small
- pieces (blocks) and calculate hash for each block using so called @emph{rolling
- hash}. After this process the final hash is formed by setting bytes in it from
- blocks. So if we have 2 messages each of which contains 100 blocks and 99 of them
- are identical we would have 2 hashes that differ only in one byte. So we can
- consider that one message is 99% like other message.
-
- @subsection Fuzzy storage.
- In rspamd hashes can be stored in fuzzy storage. Fuzzy storage is a special
- worker that can store hashes and reply about score of hashes. Inside fuzzy
- storage each hash has its own weight and list number. List number is integer
- that specify to which list this hash is related. This number can be used in
- fuzzy_check plugin inside rspamd to add custom symbol. There are two ways of
- storing fuzzy hashes: store them in a set of linear linked lists and storing
- hashes in very fast judy tree. First way is good for a relatively small number
- of fuzzy hashes. Also in this case @emph{fuzzy match} is used, so you can find
- not only identical hashes but also common hashes. But for large number of hashes
- this method is very slow. The second way requires libJudy in system (can be
- found at @url{http://judy.sourceforge.net}) and turns off @emph{fuzzy matching}
- - only identical hashes would be found. On the other hand you may store millions
- of hashes in judy tree without wasting either memory or CPU.
-
- @subsection Conclusion.
- Fuzzy hashes are an efficient way to make up different black or white lists. Fuzzy
- storage can be distributed over several machines (if you specify several storage
- servers rspamd would select upstream by hash of fuzzy hash). Also storage can
- contain several lists identified by number. Each hash has its own weight that
- allows to set up dynamic rules that add different score from different hashes.
-
- @chapter Rspamd modules.
-
- @section Introduction.
-
- This chapter describes modules that are shipped with rspamd. Here you can find
- details about modules configuration, principles of working, tricks to make spam
- filtering effective. First sections describe internal modules written in C:
- regexp (regular expressions), surbl (black list for URLs), fuzzy_check (checks
- for fuzzy hashes), chartable (check for character sets in messages) and emails
- (check for blacklisted email addresses in messages). Modules configuration can
- be done in lua or in config file itself.
-
- @subsection Lua configuration.
- You may use lua for setting configuration options for modules. With lua you can
- write rather complex rules that can contain not only text lines, but also some
- lua functions that would be called while processing messages. For loading lua
- configuration you should add line to rspamd.xml:
- @example
- <lua src="/usr/local/etc/rspamd/lua/my.lua">fake</lua>
- @end example
- @noindent
- It is possible to load several scripts this way. Inside lua file there would be
- defined global table with name @var{config}. This table should contain
- configuration options for modules indexed by module. This can be written this
- way:
- @example
- config['module_name'] = @{@}
- local mconfig = config['module_name']
-
- mconfig['option_name'] = 'option value'
-
- local a = 'aa'
- local b = 'bb'
-
- mconfig['other_option'] = string.format('%s, %s', a, b)
- @end example
- @noindent
- In this simple example we define new element of table that is associated with
- module named 'module_name'. Then we assign to it an empty table (@code{@{@}})
- and associate local variable mconfig. Then we set some elements of this table,
- that is equivalent to setting module options like that:
- @example
- option_name = option_value
- other_option = aa, bb
- @end example
- @noindent
- Also you may assign to elements of modules tables some functions. Those functions
- should accept one argument - worker task object and return result specific for
- that option: number, string, boolean. This can be shown on this simple example:
- @example
-
- local function test (task)
- if task:get_ip() == '127.0.0.1' then
- return 1
- else
- return 0
- end
- end
-
- mconfig['some_option'] = test
- @end example
- In this example we assign to module option 'some_option' a function that checks
- for message's ip and returns 1 if that ip is '127.0.0.1'.
-
- So using lua for configuration can help for making complex rules and for
- structuring rules - you can place options for specific modules to specific files
- and use lua function @code{dofile} for loading them (or add other @code{<lua>}
- tag to rspamd.xml).
-
- @subsection XML configuration.
-
- Options for rspamd modules can be set up from xml file too. This can be used for
- simple and/or temporary rules and should not be used for complex rules as this
- would make xml file too hard to read and edit. Though it is certainly possible, it
- is not recommended because it makes the config file harder to understand. Here is a simple
- example of module config options:
- @example
- <module name="module_name">
- <option name="option_name">option_value</option>
- <option name="other_option">aa, bb</option>
- </module>
- @end example
- @noindent
- Note that you need to encode xml entities like @code{&} - @code{&amp;} and so
- on. Also only utf8 encoding is allowed. In sample rspamd configuration all
- modules except regexp module are configured via xml as they have only settings
- and regexp module has rules that are sometimes rather complex.
-
- @section Regexp module.
-
- @subsection Introduction.
- Regexp module is one of the most important rspamd modules. Regexp module can
- load regular expressions and filter messages according to them. Also it is
- possible to use logical expressions of regexps to create complex rules of
- filtering. It is allowed to use logical operators:
- @itemize @bullet
- @item & - logical @strong{AND} function
- @item | - logical @strong{OR} function
- @item ! - logical @strong{NOT} function
- @end itemize
- Also it is possible to use brackets for making priorities in expressions. Regexp
- module operates with @emph{regexp items} that can be combined with logical
- operators into logical @emph{regexp expressions}. Each expression is associated
- with its symbol and if it evaluates to true with this message the symbol would
- be inserted. Note that rspamd uses internal optimization of logical expressions
- (for example if we have expression 'rule1 & rule2' rule2 would not be evaluated
- if rule1 is false) and internal regexp cache (so if rule1 and rule2 have common
- items they would be evaluated only once). So if you need speed optimization of
- your rules you should take this fact into consideration.
-
- @subsection Regular expressions.
- Rspamd uses perl compatible regular expressions. You may read about perl regular
- expression syntax here: @url{http://perldoc.perl.org/perlre.html}. In rspamd
- regular expressions must be enclosed in slashes:
- @example
- /^\\d+$/
- @end example
- @noindent
- If '/' symbol must be placed into regular expression it should be escaped:
- @example
- /^\\/\\w+$/
- @end example
- @noindent
- After last slash it is possible to place regular expression modificators:
- @multitable @columnfractions 0.1 0.9
- @headitem Modificator @tab Mean
- @item @strong{i} @tab Ignore case for this expression.
- @item @strong{m} @tab Assume this expression as multiline.
- @item @strong{s} @tab Assume @emph{.} as all characters including newline
- characters (should be used with @strong{m} flag).
- @item @strong{x} @tab Assume this expression as extended regexp.
- @item @strong{u} @tab Performs ungreedy matches.
- @item @strong{o} @tab Optimize regular expression.
- @item @strong{r} @tab Assume this expression as @emph{raw} (this is actual for
- utf8 mode of rspamd).
- @item @strong{H} @tab Search expression in message's headers.
- @item @strong{X} @tab Search expression in raw message's headers (without mime
- decoding).
- @item @strong{M} @tab Search expression in the whole message (must be used
- carefully as @strong{the whole message} would be checked with this expression).
- @item @strong{P} @tab Search expression in all text parts.
- @item @strong{U} @tab Search expression in all urls.
- @end multitable
-
- You can combine flags with each other:
- @example
- /^some text$/iP
- @end example
- @noindent
- All regexp must be with type: H, X, M, P or U as rspamd should know where to
- search for specified pattern. Header regexps (H and X) have special syntax if
- you need to check specific header, for example @emph{From} header:
- @example
- From=/^evil.*$/Hi
- @end example
- @noindent
- If header name is not specified all headers would be matched. Raw header
- matching is useful for searching for mime specific headers like MIME-Version.
- The problem is that gmime that is used for mime parsing adds some headers
- implicitly, for example @emph{MIME-Version} and you should match them using raw
- headers. Also if header's value is encoded (base64 or quoted-printable encoding)
- you can search for decoded version using H modificator and for raw using X
- modificator. This is useful for finding bad encoding types or unnecessary
- encoding.
-
- @subsection Internal function.
- Rspamd provides several internal functions for simplifying message processing.
- You can use internal function as items in logical expressions as they like
- regular expressions return logical value (true or false). Here is list of
- internal functions with their arguments:
- @multitable @columnfractions 0.3 0.2 0.5
- @headitem Function @tab Arguments @tab Description
- @item header_exists
- @tab header name
- @tab Returns true if specified header exists.
-
- @item compare_parts_distance
- @tab number
- @tab If message has two parts (text/plain and text/html) compare how much they
- differ (html messages are compared with stripped tags). The difference is
- a number in percents (0 means identical parts and 100 means totally different parts).
- So if difference is more than number this function returns true.
-
- @item compare_transfer_encoding
- @tab string
- @tab Compares header Content-Transfer-Encoding with specified string.
-
- @item content_type_compare_param
- @tab param_name, param_value
- @tab Compares specified parameter of Content-Type header with regexp or certain
- string:
- @example
- content_type_compare_param(Charset, /windows-\d+/)
- content_type_compare_param(Charset, ascii)
- @end example
- @noindent
-
- @item content_type_has_param
- @tab param_name
- @tab Returns true if content-type has specified parameter.
-
- @item content_type_is_subtype
- @tab subtype_name
- @tab Return true if content-type is of specified subtype (for example for
- text/plain subtype is 'plain').
-
- @item content_type_is_type
- @tab type_name
- @tab Return true if content-type is of specified type (for example for
- text/plain type is 'text'):
- @example
- content_type_is_type(text)
- content_type_is_subtype(/?.html/)
- @end example
- @noindent
-
- @item regexp_match_number
- @tab number,[regexps list]
- @tab Returns true if specified number of regexps matches for this message. This
- can be used for making rules when you do not know which regexps should match but
- if 2 of them match the symbol should be inserted. For example:
- @example
- regexp_match_number(2, /^some evil text.*$/Pi, From=/^hacker.*$/H, header_exists(Subject))
- @end example
- @noindent
-
- @item has_only_html_part
- @tab nothing
- @tab Returns true when message has only HTML part
-
- @item compare_recipients_distance
- @tab number
- @tab Like compare_parts_distance calculate difference between recipients. Number
- is used as minimum percent of difference. Note that this function would check
- distance only when there are more than 5 recipients in message.
-
- @item is_recipients_sorted
- @tab nothing
- @tab Returns true if recipients list is sorted. This function would also work
- for more than 5 recipients.
-
- @item is_html_balanced
- @tab nothing
- @tab Returns true when all HTML tags in message are balanced.
-
- @item has_html_tag
- @tab tag_name
- @tab Returns true if tag 'tag_name' exists in message.
-
- @item check_smtp_data
- @tab item, regexp
- @tab Returns true if specified part of smtp dialog matches specified regexp. Can
- check HELO, FROM and RCPT items.
-
- @end multitable
-
- These internal functions can be easily implemented in lua but I've decided to
- make them built-in as they are widely used in our rules. In fact this list may
- be extended in future.
-
- @subsection Dynamic rules.
- Rspamd regexp module can use dynamic rules that can be written in json syntax.
- Dynamic rules are loaded at runtime and can be modified while rspamd is working.
- Also it is possible to turn on dynamic rules for specific networks only and add rules
- that do not contain any regexp (this can be useful for dynamic lists for example).
- Dynamic rules can be obtained like any other dynamic map via file monitoring or via
- http. Here are examples of dynamic rules definitions:
- @example
- <module name="regexp">
- <option name="dynamic_rules">file:///tmp/rules.json</option>
- </module>
- @end example
- @noindent
- or for http map:
- @example
- <module name="regexp">
- <option name="dynamic_rules">http://somehost/rules.json</option>
- </module>
- @end example
- @noindent
- Rules are presented as json array (in brackets @emph{'[]'}). Each rule is json object.
- This object can have several properties (properties with @strong{*} are required):
- @multitable @columnfractions 0.3 0.7
- @headitem Property @tab Mean
- @item symbol(*)
- @tab Symbol for rule.
- @item factor(*)
- @tab Factor for rule.
- @item rule
- @tab Rule itself (regexp expression).
- @item enabled
- @tab Boolean flag that define whether this rule is enabled (rule is enabled if
- this flag is not present by default).
- @item networks
- @tab Json array of networks (in CIDR format, also it is possible to add negation
- by prepending @emph{!} symbol before item).
- @end multitable
- Here is an example of dynamic rule:
- @example
- [
- @{
- "rule": "/test/rP",
- "symbol": "R_TMP_1",
- "factor": 1.1,
- "networks": ["!192.168.1.0/24", "172.16.0.0/16"],
- "enabled": false
- @}
- ]
- @end example
- Note that dynamic rules are constantly monitored for changes and are reloaded
- completely when modification is detected. If you change dynamic rules they
- would be reloaded in a minute and would be applied for new messages.
-
- @subsection Conclusion.
- Rspamd regexp module is a powerful tool for matching different patterns in
- messages. You may use logical expressions of regexps and internal rspamd
- functions to make rules. Rspamd is shipped with many rules for regexp module
- (most of them are taken from spamassassin rules as rspamd originally was a
- replacement of spamassassin) so you can look at them in ETCDIR/rspamd/lua/regexp
- directory. There are many built-in rules with detailed comments. Also note that
- if you add logical rule into XML file you need to escape all XML entities (like
- @emph{&} operators). When you make complex rules from many parts do not forget
- to add brackets for parts inside expression as you would not predict order of
- checks otherwise. Rspamd regexp module has internal logical optimization and
- regexp cache, so you may use identical regexp many times - they would be matched
- only once. And in logical expression you may optimize performance by putting
- likely TRUE regexp first in @emph{OR} expression and likely FALSE expression
- first in @emph{AND} expression. A number of internal functions can simplify
- complex expressions and help in making common filters. Lua functions can be added in
- rules as well (they should return boolean value).
-
- @section SURBL module.
-
- Surbl module is designed for checking urls via blacklists. You may read about
- surbls at @url{http://www.surbl.org}. Here is the sequence of operations that is
- done by surbl module:
- @enumerate 1
- @item Extract all urls in message and get domains for each url.
- @item Check to special list called '2tld' and extract 3 components for domains
- from that list and 2 components for domains that are not listed:
- @example
- http://virtual.somehost.domain.com/some_path
- -> somehost.domain.com if domain.com is in 2tld list
- -> domain.com if not in 2tld
- @end example
- @noindent
- @item Remove duplicates from domain lists
- @item For each registered surbl do dns request in form @emph{domain.surbl_name}
- @item Get result and insert symbol if that name resolves
- @item It is possible to examine bits in returned IP address and insert different
- symbol for each bit that is turned on in result.
- @end enumerate
- All DNS requests are done asynchronously so you may not bother about blocking.
- SURBL module has several configuration options:
- @itemize @bullet
- @item @emph{metric} - metric to insert symbol to.
- @item @emph{2tld} - list argument of domains for which 3 components of domain name
- would be extracted.
- @item @emph{max_urls} - maximum number of urls to check.
- @item @emph{whitelist} - map of domains for which surbl checks would not be performed.
- @item @emph{suffix} - a name of surbl. It is possible to add several suffixes:
- @example
- suffix_RAMBLER_URIBL = insecure-bl.rambler.ru
- or in xml:
- <param name="suffix_RAMBLER_URIBL">insecure-bl.rambler.ru</param>
- @end example
- @noindent
- It is possible to add %b to symbol name for checking specific bits:
- @example
- suffix_%b_SURBL_MULTI = multi.surbl.org
- then you may define replaces for %b in symbol name for each bit in result:
- bit_2 = SC -> sc.surbl.org
- bit_4 = WS -> ws.surbl.org
- bit_8 = PH -> ph.surbl.org
- bit_16 = OB -> ob.surbl.org
- bit_32 = AB -> ab.surbl.org
- bit_64 = JP -> jp.surbl.org
- @end example
- @noindent
- So we make one DNS request and check for specific list by checking bits in
- result ip. This is described in surbl page:
- @url{http://www.surbl.org/lists.html#multi}. Note that result symbol would NOT
- contain %b as it would be replaced by bit name. Also if several bits are set
- several corresponding symbols would be added.
- @end itemize
-
- Also surbl module can use redirector - a special daemon that can check for
- redirects. It uses HTTP/1.0 for requests and accepts a url and returns resolved
- result. Redirector is shipped with rspamd but not enabled by default. You may
- enable it on stage of configuring but note that it requires many perl modules
- for its work. Rspamd redirector is described in details further. Here are surbl
- options for working with redirector:
- @itemize @bullet
- @item @emph{redirector}: address of redirector (in format host:port)
- @item @emph{redirector_connect_timeout} (seconds): redirector connect timeout (default: 1s)
- @item @emph{redirector_read_timeout} (seconds): timeout for reading data (default: 5s)
- @item @emph{redirector_hosts_map} (map string): map that contains domains to check with redirector
- @end itemize
-
- So surbl module is an easy to use way to check message's urls and it may be used
- in every configuration as it filters rather big amount of email spam and scam.
-
- @section SPF module.
-
- SPF module is designed to make checks of spf records of sender's domains. SPF
- records are placed in TXT DNS items for domains that have enabled spf. You may
- read about SPF at @url{http://en.wikipedia.org/wiki/Sender_Policy_Framework}.
- There are 3 results of spf check for domain:
- @itemize @bullet
- @item ALLOW - this ip is allowed to send messages for this domain
- @item FAIL - this ip is @strong{not} allowed to send messages for this domain
- @item SOFTFAIL - it is unknown whether this ip is allowed to send mail for this
- domain
- @end itemize
- SPF supports different mechanisms for checking: dns subrequests, macros,
- includes, blacklists. Rspamd supports most of them. Also for security
- reasons there are internal limits for DNS subrequests and inclusions recursion.
- SPF module supports a very small number of options:
- @itemize @bullet
- @item @emph{metric} (string): metric to insert symbol (default: 'default')
- @item @emph{symbol_allow} (string): symbol to insert (default: 'R_SPF_ALLOW')
- @item @emph{symbol_fail} (string): symbol to insert (default: 'R_SPF_FAIL')
- @item @emph{symbol_softfail} (string): symbol to insert (default: 'R_SPF_SOFTFAIL')
- @end itemize
-
- @section Chartable module.
-
- Chartable is a simple module that detects different charsets in a message. This
- module is aimed to protect from emails that contain symbols from different
- character sets that look like each other. Chartable module works differently
- for raw and utf modes: in utf modes it detects different characters from unicode
- tables and in raw modes only ASCII and non-ASCII symbols. Configuration of this
- module is very simple:
- @itemize @bullet
- @item @emph{metric} (string): metric to insert symbol (default: 'default')
- @item @emph{symbol} (string): symbol to insert (default: 'R_BAD_CHARSET')
- @item @emph{threshold} (double): value that would be used as threshold in expression
- @math{N_{charset-changes} / N_{chars}}
- (e.g. if threshold is 0.1 then charset change should occur more often than in 10 symbols),
- default: 0.1
- @end itemize
-
- @section Fuzzy check module.
-
- Fuzzy check module provides a client for rspamd fuzzy storage. Fuzzy check can
- work with a cluster of rspamd fuzzy storages and the specific storage is
- selected by value of hash of message's hash. The available configuration options
- are:
- @itemize @bullet
- @item @emph{metric} (string): metric to insert symbol (default: 'default')
- @item @emph{symbol} (string): symbol to insert (default: 'R_FUZZY')
- @item @emph{max_score} (double): maximum score to that weights of hashes would be
- normalized (default: 0 - no normalization)
- @item @emph{fuzzy_map} (string): a string that contains map in format @{ fuzzy_key => [
- symbol, weight ] @} where fuzzy_key is number of fuzzy list. This string itself
- should be in format 1:R_FUZZY_SAMPLE1:10,2:R_FUZZY_SAMPLE2:1 etc, where first
- number is fuzzy key, second is symbol to insert and third - weight for
- normalization
- @item @emph{min_length} (integer): minimum length (in characters) for text part to be
- checked for fuzzy hash (default: 0 - no limit)
- @item @emph{whitelist} (map string): map of ip addresses that should not be checked
- with this module
- @item @emph{servers} (string): list of fuzzy servers in format
- "server1:port,server2:port" - these servers would be used for checking and
- storing fuzzy hashes
- @end itemize
-
- @section Forged recipients.
-
- Forged recipients is a lua module that compares recipients provided by smtp
- dialog and recipients from @emph{To:} header. Also it is possible to compare
- @emph{From:} header with SMTP from. So you may set @strong{symbol_rcpt} option
- to set up symbol that would be inserted when recipients differ and
- @strong{symbol_sender} when senders differ.
-
- @section Maillist.
-
- Maillist is a module that detects whether this message is sent by using one of
- popular mailing list systems (among supported are ezmlm, mailman and
- subscribe.ru systems). The module has only one option @strong{symbol} that defines a
- symbol that would be inserted if this message is sent via mailing list.
-
- @section Once received.
-
- This lua module checks received headers of message and inserts symbol if only one
- received header is present in message (that usually signals that this mail is
- sent directly to our MTA). Also it is possible to insert @emph{strict} symbol
- that indicates that host from which we receive this message is either
- unresolvable or has bad patterns (like 'dynamic', 'broadband' etc) that
- indicate widely used botnets. Configuration options are:
- @itemize @bullet
- @item @emph{symbol}: symbol to insert for messages with one received header.
- @item @emph{symbol_strict}: symbol to insert for messages with one received
- header and containing bad patterns or unresolvable sender.
- @item @emph{bad_host}: defines pattern that would be counted as "bad".
- @item @emph{good_host}: defines pattern that would be counted as "good" (no strict
- symbol would be inserted), note that "good" has a priority over "bad" pattern.
- @end itemize
- You can define several "good" and "bad" patterns for this module.
-
- @section Received rbl.
-
- Received rbl module checks all received headers and makes dns requests to IP
- black lists. This can be used for checking whether this email was transferred by
- some blacklisted gateway. Here are options available:
- @itemize @bullet
- @item @emph{symbol}: symbol to insert if message contains blacklisted received
- headers
- @item @emph{rbl}: a name of rbl to check, it is possible to define specific
- symbol for this rbl by adding symbol name after colon:
- @example
- rbl = pbl.spamhaus.org:RECEIVED_PBL
- @end example
- @end itemize
-
- @section Multimap.
-
- Multimap is lua module that provides functionality to operate with different types
- of lists. Now it can work with maps of strings for extracting MIME headers and match
- them using lists. Also it is possible to create ip (or ipnetwork) maps for
- checking ip address from which we receive that mail. DNS black lists are also
- supported.
- Multimap module works with a set of rules. Each rule can be one of three types:
- @enumerate 1
- @item @emph{ip}: is used for lists of ip addresses
- @item @emph{header}: is used to match headers
- @item @emph{dnsbl}: is used for dns lists of ip addresses
- @end enumerate
- Basically each rule is a line separated by commas containing rule parameters.
- Each parameter has name and value, separated by equal sign. Here is list of
- parameters (mandatory parameters are marked with @strong{*}):
- @itemize @bullet
- @item @strong{*} @emph{type}: rule type
- @item @strong{*} @emph{map}: path to map (in uri format file:// or http://) or
- name of dns blacklist
- @item @strong{*} @emph{symbol}: symbol to insert
- @item @emph{header}: header to use for header rules
- @item @emph{pattern}: pattern that would be used to extract specific part of
- header
- @end itemize
- @noindent
- Here is an example of multimap rules:
- @example
- <module name="multimap">
- <option name="rule">type = header, header = To, pattern = @@(.+)>?$, map = file:///var/db/rspamd/rcpt_test, symbol = R_RCPT_WHITELIST</option>
- <option name="rule">type = ip, map = file:///var/db/rspamd/ip_test, symbol = R_IP_WHITELIST</option>
- <option name="rule">type = dnsbl, map = pbl.spamhaus.org, symbol = R_IP_PBL</option>
- </module>
- @end example
-
- @section Conclusion.
-
- Rspamd is shipped with a number of modules that provide basic functionality
- for checking emails. You are allowed to add custom rules for regexp module and
- to set up available parameters for other modules. Also you may write your own
- modules (in C or Lua) but this would be described further in this documentation.
- You may set configuration options for modules from lua or from xml depending on
- its complexity. Internal modules are enabled and disabled by @strong{filters}
- configuration option. Lua modules are loaded and usually can be disabled by
- removing their configuration section from xml file or by removing corresponding
- line from @strong{modules} section.
-
- @bye
|