You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

compact_enc_det.cc 226KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719
  1. // Copyright 2016 Google Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. ////////////////////////////////////////////////////////////////////////////////
  16. #include "compact_enc_det.h"
  17. #include <math.h> // for sqrt
  18. #include <stddef.h> // for size_t
  19. #include <stdio.h> // for printf, fprintf, NULL, etc
  20. #include <stdlib.h> // for qsort
  21. #include <string.h> // for memset, memcpy, memcmp, etc
  22. #include <memory>
  23. #include <string> // for string, operator==, etc
  24. #include "compact_enc_det_hint_code.h"
  25. #include "util/string_util.h"
  26. #include "util/basictypes.h"
  27. #include "util/commandlineflags.h"
  28. #include "util/logging.h"
  29. using std::string;
  30. // TODO as of 2007.10.09:
  31. //
  32. // Consider font=TT-BHxxx as user-defined => binary
  33. // Demote GB18030 if no 8x3x pair
  34. // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
  35. // Consider removing/ignoring bytes 01-1F to avoid crap pollution
  36. // Possibly boost declared encoding in robust scan
  37. // googlebot tiny files
  38. // look for ranges of encodings
  39. // consider tags just as > < within aligned block of 32
  40. // flag too few characters in postproc (Latin 6 problem)
  41. // Remove slow scan beyond 16KB
  42. // Consider removing kMostLikelyEncoding or cut it in half
  43. // A note on mixed encodings
  44. //
  45. // The most common encoding error on the web is a page containing a mixture of
  46. // CP-1252 and UTF-8. A less common encoding error is a third-party feed that
  47. // has been converted from CP-1252 to UTF-8 and then those bytes converted a
  48. // second time to UTF-8. CED originally attempted to detect these error cases
  49. // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
  50. // implementation was to start these just below CP1252 and UTF8 respectively in
  51. // overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
  52. // found.
  53. //
  54. // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
  55. // UTF8CP1252 internal encoding was added late and not put into encodings.proto,
  56. // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
  57. // is removed in this November 2011 CL.
  58. //
  59. // Mixed encoding detection never worked out as well as envisioned, so the
  60. // ced_allow_utf8utf8 flag normally disables all this.
  61. //
  62. // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
  63. // UTF8, and the inputconverter code for UTF8 normally will convert bare
  64. // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
  65. // and double-UTF-8 mixtures will be detected as UTF-8, and the double
  66. // conversion will stand.
  67. //
  68. // However, it is occasionally useful to use CED to detect double-converted
  69. // UTF-8 coming from third-party data feeds, so they can be fixed at the source.
  70. // For this purpose, the UTF8UTF8 encoding remains available under the
  71. // ced_allow_utf8utf8 flag.
  72. //
  73. // When UTF8UTF8 is detected, the inputconverter code will undo the double
  74. // conversion, giving good text.
  75. // Norbert Runge has noted these words in CP1252 that are mistakenly identified
  76. // as UTF-8 because of the last pair of characters:
  77. // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
  78. // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
  79. // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
  80. // Schoß\u201c 0xDF 0x93 U+00DF U+201C
  81. // weiß\u201c 0xDF 0x93 U+00DF U+201C
  82. // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
  83. // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
  84. // These four byte combinations now explicitly boost Latin1/CP1252.
  85. // And for reference, here are a couple of Portuguese spellings
  86. // that may be mistaken as double-byte encodings.
  87. // informações 0xE7 0xF5
  88. // traição 0xE7 0xE3
  89. static const char* kVersion = "2.2";
  90. DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
  91. "to handle mixtures of CP1252 "
  92. "converted to UTF-8 zero, one, "
  93. "or two times");
  94. DEFINE_int32(enc_detect_slow_max_kb, 16,
  95. "Maximum number of Kbytes to examine for "
  96. "7-bit-only (2022, Hz, UTF7) encoding detect. "
  97. "You are unlikely to want to change this.");
  98. DEFINE_int32(enc_detect_fast_max_kb, 256,
  99. "Maximum number of Kbytes to examine for encoding detect. "
  100. "You are unlikely to want to change this.");
  101. DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
  102. "difference 1st - 2nd to be considered reliable \n"
  103. " 2 corresponds to min 4x difference\n"
  104. " 4 corresponds to min 16x difference\n"
  105. " 8 corresponds to min 256x difference\n"
  106. " 10 corresponds to min 1024x difference\n"
  107. " 20 corresponds to min 1Mx difference.");
  108. // Text debug output options
  109. DEFINE_bool(enc_detect_summary, false,
  110. "Print first 16 interesting pairs at exit.");
  111. DEFINE_bool(counts, false, "Count major-section usage");
  112. // PostScript debug output options
  113. DEFINE_bool(enc_detect_detail, false,
  114. "Print PostScript of every update, to stderr.");
  115. DEFINE_bool(enc_detect_detail2, false,
  116. "More PostScript detail of every update, to stderr.");
  117. DEFINE_bool(enc_detect_source, false, "Include source text in detail");
  118. // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
  119. // lang_enc.cc
  120. // Following flags are not in use. Replace them with constants to
  121. // avoid static initialization.
  122. //DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
  123. //DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");
  124. static const char* const FLAGS_enc_detect_watch1 = "";
  125. static const char* const FLAGS_enc_detect_watch2 = "";
  126. // Only for experiments. Delete soon.
  127. DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");
  128. // Demo-mode/debugging experiment
  129. DEFINE_bool(demo_nodefault, false,
  130. "Default to all equal; no boost for declared encoding.");
  131. DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
  132. DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
  133. static const int XDECILOG2 = 3; // Multiplier for log base 2 ** n/10
  134. static const int XLOG2 = 30; // Multiplier for log base 2 ** n
  135. static const int kFinalPruneDifference = 10 * XLOG2;
  136. // Final bits of minimum
  137. // probability difference 1st-nth
  138. // to be pruned
  139. static const int kInititalPruneDifference = kFinalPruneDifference * 4;
  140. // Initial bits of minimum
  141. // probability difference 1st-nth
  142. // to be pruned
  143. //
  144. static const int kPruneDiffDecrement = kFinalPruneDifference;
  145. // Decrements bits of minimum
  146. // probability difference 1st-nth
  147. // to be pruned
  148. static const int kSmallInitDiff = 2 * XLOG2; // bits of minimum
  149. // probability difference, base to
  150. // superset encodings
  151. static const int kBoostInitial = 20 * XLOG2; // bits of boost for
  152. // initial byte patterns (BOM, 00)
  153. static const int kBadPairWhack = 20 * XLOG2; // bits of whack for
  154. // one bad pair
// Scoring and pruning tunables. Per their trailing comments, amounts scaled
// by XLOG2 are in whole bits and amounts scaled by XDECILOG2 are in tenths
// of a bit.
static const int kBoostOnePair = 20 * XLOG2;        // bits of boost for
                                                    // one good pair in Hz, etc.
static const int kGentleOnePair = 4 * XLOG2;        // bits of boost for
                                                    // one good sequence
                                                    //
static const int kGentlePairWhack = 2 * XLOG2;      // bits of whack
                                                    // for ill-formed sequence
static const int kGentlePairBoost = 2 * XLOG2;      // bits of boost
                                                    // for well-formed sequence
static const int kDeclaredEncBoost = 5 * XDECILOG2; // bits/10 of boost for
                                                    // best declared encoding per bigram
static const int kBestEncBoost = 5 * XDECILOG2;     // bits/10 of boost for
                                                    // best encoding per bigram
static const int kTrigramBoost = 2 * XLOG2;         // bits of boost for Latin127 tri
static const int kMaxPairs = 48;                    // Max interesting pairs to look at
                                                    // If you change this,
                                                    // adjust *PruneDiff*
                                                    // (also sizes the per-set
                                                    // interesting_* arrays in
                                                    // DetectEncodingState)
static const int kPruneMask = 0x07;                 // Prune every 8 interesting pairs
static const int kBestPairsCount = 16;              // For first N pairs, do extra boost
                                                    // based on most likely encoding
                                                    // of pair over entire web
static const int kDerateHintsBelow = 12;            // If we have fewer than N bigrams,
                                                    // weaken the hints enough that
                                                    // unhinted encodings have a hope of
                                                    // rising to the top
static const int kMinRescanLength = 800;            // Don't bother rescanning for
                                                    // unreliable encoding if fewer
                                                    // than this many bytes unscanned.
                                                    // We will rescan at most last half
                                                    // of this.
static const int kStrongBinary = 12;                // Make F_BINARY the only encoding
static const int kWeakerBinary = 4;                 // Make F_BINARY likely encoding
// These are byte counts from front of file
static const int kBinaryHardAsciiLimit = 6 * 1024;  // Not binary if all ASCII
static const int kBinarySoftAsciiLimit = 8 * 1024;  // Not binary if mostly ASCII
// We try here to avoid having title text dominate the encoding detection,
// for the not-infrequent error case of title in encoding1, body in encoding2:
// we want to bias toward encoding2 winning.
//
// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
// rarely cut off mid-character in the original (not-yet-detected) encoding.
// This matters most for UTF-8 two- and three-byte codes and for
// Shift-JIS three-byte codes.
static const int kMaxBigramsTagTitleText = 12;      // Keep only some tag text
static const int kWeightshiftForTagTitleText = 4;   // Give text in tags, etc.
                                                    // 1/16 normal weight
                                                    // (a shift of 4 == 1/16)
static const int kStrongPairs = 6;                  // Let reliable enc with this many
                                                    // pairs overcome missing hint
  203. enum CEDInternalFlags {
  204. kCEDNone = 0, // The empty flag
  205. kCEDRescanning = 1, // Do not further recurse
  206. kCEDSlowscore = 2, // Do extra scoring
  207. kCEDForceTags = 4, // Always examine text inside tags
  208. };
// Forward declaration
// Declared here so code before the definition can call it; flags is a
// CEDInternalFlags value (kCEDRescanning is documented as "Do not further
// recurse").
//   text, text_length     -- input bytes and their count
//   url_hint, http_charset_hint, meta_charset_hint, encoding_hint,
//   language_hint, corpus_type -- caller-supplied detection hints
//   bytes_consumed, is_reliable, second_best_enc
//                         -- pointer parameters; presumably outputs
//                            (NOTE(review): confirm against the definition)
Encoding InternalDetectEncoding(
    CEDInternalFlags flags, const char* text, int text_length,
    const char* url_hint, const char* http_charset_hint,
    const char* meta_charset_hint, const int encoding_hint,
    const Language language_hint,  // User interface lang
    const CompactEncDet::TextCorpusType corpus_type,
    bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
    Encoding* second_best_enc);
// Per-encoding unigram statistics record. The comment block further down
// lists unigram_table among the postproc-generated tables; presumably that
// table is an array of these, so member order must not change
// (NOTE(review): confirm against compact_enc_det_generated_tables.h).
typedef struct {
  const uint8* hires[4];  // Pointers to possible high-resolution bigram deltas
  uint8 x_bar;            // Average byte2 value
  uint8 y_bar;            // Average byte1 value
  uint8 x_stddev;         // Standard deviation of byte2 value
  uint8 y_stddev;         // Standard deviation of byte1 value
  int so;                 // Scaling offset -- add to probabilities below
  uint8 b1[256];          // Unigram probability for first byte of aligned bigram
  uint8 b2[256];          // Unigram probability for second byte of aligned bigram
  uint8 b12[256];         // Unigram probability for cross bytes of aligned bigram
} UnigramEntry;
  229. //typedef struct {
  230. // uint8 b12[256*256]; // Bigram probability for aligned bigram
  231. //} FullBigramEntry;
  232. // Include all the postproc-generated tables here:
  233. // RankedEncoding
  234. // kMapToEncoding
  235. // unigram_table
// kMostLikelyEncoding
  237. // kTLDHintProbs
  238. // kCharsetHintProbs
  239. // HintEntry, kMaxTldKey kMaxTldVector, etc.
  240. // =============================================================================
  241. #include "compact_enc_det_generated_tables.h"
// Local aliases: each name on the left expands to the F_* name on the right.
// The right-hand names are not defined in this part of the file; presumably
// they come from the generated tables included just above.
#define F_ASCII F_Latin1          // "ASCII" is a misnomer, so this code uses "Latin1"
#define F_BINARY F_X_BINARYENC    // We are mid-update for name change
#define F_UTF8UTF8 F_X_UTF8UTF8   // We are mid-update for name change
#define F_BIG5_CP950 F_BIG5       // We are mid-update for name change
#define F_Unicode F_UTF_16LE      // We are mid-update for name change
// =============================================================================
// 7-bit encodings have at least one "interesting" byte value < 0x80
// (00 0E 1B + ~)
// JIS 2022-cn 2022-kr hz utf7
// Unicode UTF-16 UTF-32
// 8-bit encodings have no interesting byte values < 0x80
//
// Single-bit masks describing special handling an encoding needs. They are
// combined per encoding in kSpecialMask[]; DetectEncodingState::active_special
// is documented as holding "bits showing which special cases are active".
static const uint32 kSevenBitActive = 0x00000001;   // needs <80 to detect
static const uint32 kUTF7Active = 0x00000002;       // <80 and +
static const uint32 kHzActive = 0x00000004;         // <80 and ~
static const uint32 kIso2022Active = 0x00000008;    // <80 and 1B 0E 0F
static const uint32 kUTF8Active = 0x00000010;
static const uint32 kUTF8UTF8Active = 0x00000020;
static const uint32 kUTF1632Active = 0x00000040;    // <80 and 00
static const uint32 kBinaryActive = 0x00000080;     // <80 and 00
static const uint32 kTwobyteCode = 0x00000100;      // Needs 8xxx
static const uint32 kIsIndicCode = 0x00000200;      //
static const uint32 kHighAlphaCode = 0x00000400;    // full alphabet in 8x-Fx
static const uint32 kHighAccentCode = 0x00000800;   // accents in 8x-Fx
static const uint32 kEUCJPActive = 0x00001000;      // Have to mess with phase
// Debug only. not thread safe
// File-scope counters, all starting at zero. They are plain ints with no
// synchronization, hence the thread-safety warning above.
static int encdet_used = 0;
static int rescore_used = 0;
static int rescan_used = 0;
static int robust_used = 0;
static int looking_used = 0;
static int doing_used = 0;
// For debugging only -- about 256B/entry times about 500 = 128KB
// TODO: only allocate this if being used
// One record of the debug trace. Records are written through
// DetectEncodingState::debug_data by the SetDetails* helpers.
typedef struct {
  int offset;       // Offset recorded for this event (caller-supplied, or
                    // copied from the previous record)
  int best_enc;     // Best ranked encoding for this bigram, or
                    // -1 for overhead entries
  string label;     // Text label for the event
  int detail_enc_prob[NUM_RANKEDENCODING];  // Copy of the per-encoding
                                            // probabilities for this record
} DetailEntry;
// Both start at -1. NOTE(review): these look like ranked-encoding indices to
// trace while debugging, with -1 meaning none -- confirm at the use sites.
static int watch1_rankedenc = -1;  // Debug. not threadsafe
static int watch2_rankedenc = -1;  // Debug. not threadsafe
////static int next_detail_entry = 0; // Debug. not threadsafe
////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram
// End For debugging only
  287. // Must match kTestPrintableAsciiTildePlus exit codes, minus one
  288. enum PairSet {AsciiPair = 0, OtherPair = 1, NUM_PAIR_SETS = 2};
  289. // The reasons for pruning
  290. enum PruneReason {PRUNE_NORMAL, PRUNE_SLOWEND, PRUNE_FINAL};
  291. static const char* kWhatSetName[] = {"Ascii", "Other"};
  292. // State for encodings that do shift-out/shift-in between one- and two-byte
  293. // regions (ISO-2022-xx, HZ)
  294. enum StateSoSi {SOSI_NONE, SOSI_ERROR, SOSI_ONEBYTE, SOSI_TWOBYTE};
// All working state for one detection: source pointers, debug-trace
// bookkeeping, mini state machines for the special-case encodings, hints,
// the list of active encodings with their probabilities, and the two sets
// of buffered "interesting" byte pairs.
typedef struct {
  const uint8* initial_src;     // For calculating byte offsets
  const uint8* limit_src;       // Range of input source
  const uint8* prior_src;       // Source consumed by prior call to BoostPrune
  const uint8* last_pair;       // Last pair inserted into interesting_pairs

  DetailEntry* debug_data;      // Normally NULL. Ptr to debug data for
                                // FLAGS_enc_detect_detail PostScript data
  int next_detail_entry;        // Debug: index of the next debug_data slot
                                // written by the SetDetails* helpers

  bool done;
  bool reliable;
  bool hints_derated;           // NOTE(review): presumably set once hints
                                // have been weakened (see kDerateHintsBelow)
  int declared_enc_1;           // From http/meta hint
  int declared_enc_2;           // from http/meta hint
  int prune_count;              // Number of times we have pruned

  int trigram_highwater_mark;   // Byte offset of last trigram processing
  bool looking_for_latin_trigrams;  // True if we should test for doing
                                    // Latin1/2/7 trigram processing
  bool do_latin_trigrams;       // True if we actually are scoring trigrams

  // Miscellaneous state variables for difficult encodings
  int binary_quadrants_count;   // Number of four bigram quadrants seen:
                                // 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxxx
                                // 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxxx
  int binary_8x4_count;         // Number of 8x4 buckets seen:
  uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen
  uint32 binary_8x4_seen;       // Bit[i] set if bigram iii.....ii...... seen
  int utf7_starts;              // Count of possible UTF-7 beginnings seen
  int prior_utf7_offset;        // Source consumed by prior UTF-7 string
  int next_utf8_ministate;      // Mini state for UTF-8 sequences
  int utf8_minicount[6];        // Number of correct 2- 3- 4-byte seq, errors
  int next_utf8utf8_ministate;  // Mini state for UTF8UTF8 sequences
  int utf8utf8_odd_byte;        // UTF8UTF8 seq has odd number of bytes
  int utf8utf8_minicount[6];    // Number of correct 2- 3- 4-byte seq, errors
  StateSoSi next_2022_state;    // Mini state for 2022 sequences
  StateSoSi next_hz_state;      // Mini state for HZ sequences
  bool next_eucjp_oddphase;     // Mini state for EUC-JP sequences
  int byte32_count[8];          // Count of top 3 bits of byte1 of bigram
                                // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx
  uint32 active_special;        // Bits showing which special cases are active

  // tld_hint is the top TLD encoding or UNKNOWN. http_hint and meta_hint are
  // what the document says about itself, or UNKNOWN_ENCODING. bom_hint comes
  // from the BOM, the initial byte order mark for UTF-xx.
  Encoding tld_hint;            // Top TLD encoding or UNKNOWN
  Encoding http_hint;           // What the document says about itself
  Encoding meta_hint;           // (http / meta), or UNKNOWN_ENCODING
  Encoding bom_hint;            // From initial byte order mark for UTF-xx

  // small cache of previous interesting bigrams
  int next_prior_bigram;
  int prior_bigram[4];
  int prior_binary[1];

  int top_rankedencoding;       // Top two probabilities and families
  int second_top_rankedencoding;
  int top_prob;
  int second_top_prob;
  int prune_difference;         // Prune things this much below the top prob
  int rankedencoding_list_len;  // Number of active encodings
  int rankedencoding_list[NUM_RANKEDENCODING];  // List of active encodings
  //
  int enc_prob[NUM_RANKEDENCODING];     // Cumulative probability per enc
                                        // This is where all the action is
  int hint_prob[NUM_RANKEDENCODING];    // Initial hint probabilities
  int hint_weight[NUM_RANKEDENCODING];  // Number of hints for this enc

  // Two sets -- one for printable ASCII, one for the rest
  // (first index is a PairSet value)
  int prior_interesting_pair[NUM_PAIR_SETS];  // Pairs consumed by prior call
  int next_interesting_pair[NUM_PAIR_SETS];   // Next pair to write
  char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2];   // Two bytes per pair
  int interesting_offsets[NUM_PAIR_SETS][kMaxPairs];      // Src offset of pair
  int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs];  // weightshift of pair
} DetectEncodingState;
  360. // Record a debug event that changes probabilities
  361. void SetDetailsEncProb(DetectEncodingState* destatep,
  362. int offset, int best_enc, const char* label) {
  363. int next = destatep->next_detail_entry;
  364. destatep->debug_data[next].offset = offset;
  365. destatep->debug_data[next].best_enc = best_enc;
  366. destatep->debug_data[next].label = label;
  367. memcpy(&destatep->debug_data[next].detail_enc_prob,
  368. &destatep->enc_prob,
  369. sizeof(destatep->enc_prob));
  370. ++destatep->next_detail_entry;
  371. }
  372. // Record a debug event that changes probabilities, copy offset
  373. void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,
  374. int best_enc, const char* label) {
  375. int next = destatep->next_detail_entry;
  376. destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
  377. destatep->debug_data[next].best_enc = best_enc;
  378. destatep->debug_data[next].label = label;
  379. memcpy(&destatep->debug_data[next].detail_enc_prob,
  380. &destatep->enc_prob,
  381. sizeof(destatep->enc_prob));
  382. ++destatep->next_detail_entry;
  383. }
  384. // Record a debug event that changes probs and has simple text label
  385. void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) {
  386. int next = destatep->next_detail_entry;
  387. destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
  388. destatep->debug_data[next].best_enc = -1;
  389. destatep->debug_data[next].label = label;
  390. memcpy(&destatep->debug_data[next].detail_enc_prob,
  391. &destatep->enc_prob,
  392. sizeof(destatep->enc_prob));
  393. ++destatep->next_detail_entry;
  394. }
  395. // Record a debug event that is just a text label, no change in probs
  396. void SetDetailsLabel(DetectEncodingState* destatep, const char* label) {
  397. int next = destatep->next_detail_entry;
  398. destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
  399. destatep->debug_data[next].best_enc = -1;
  400. destatep->debug_data[next].label = label;
  401. memcpy(&destatep->debug_data[next].detail_enc_prob,
  402. &destatep->debug_data[next - 1].detail_enc_prob,
  403. sizeof(destatep->enc_prob));
  404. ++destatep->next_detail_entry;
  405. }
// Maps superset encodings to base, to see if 2 encodings are compatible
// (Non-identity mappings are marked "-->" below.)
// One entry per Encoding enum value, in enum order; the leading number in
// each comment is that index. The COMPILE_ASSERT after the table checks that
// the entry count equals NUM_ENCODINGS.
static const Encoding kMapEncToBaseEncoding[] = {
  ISO_8859_1,            // 0: Teragram ASCII
  ISO_8859_2,            // 1: Teragram Latin2
  ISO_8859_3,            // 2: in BasisTech but not in Teragram
  ISO_8859_4,            // 3: Teragram Latin4
  ISO_8859_5,            // 4: Teragram ISO-8859-5
  ISO_8859_6,            // 5: Teragram Arabic
  ISO_8859_7,            // 6: Teragram Greek
  MSFT_CP1255,           // 7: Teragram Hebrew --> 36
  ISO_8859_9,            // 8: in BasisTech but not in Teragram
  ISO_8859_10,           // 9: in BasisTech but not in Teragram
  JAPANESE_EUC_JP,       // 10: Teragram EUC_JP
  JAPANESE_SHIFT_JIS,    // 11: Teragram SJS
  JAPANESE_JIS,          // 12: Teragram JIS
  CHINESE_BIG5,          // 13: Teragram BIG5
  CHINESE_GB,            // 14: Teragram GB
  CHINESE_EUC_CN,        // 15: Teragram EUC-CN
  KOREAN_EUC_KR,         // 16: Teragram KSC
  UNICODE,               // 17: Teragram Unicode
  CHINESE_EUC_CN,        // 18: Teragram EUC --> 15
  CHINESE_EUC_CN,        // 19: Teragram CNS --> 15
  CHINESE_BIG5,          // 20: Teragram BIG5_CP950 --> 13
  JAPANESE_SHIFT_JIS,    // 21: Teragram CP932 --> 11
  UTF8,                  // 22
  UNKNOWN_ENCODING,      // 23
  ISO_8859_1,            // 24: ISO_8859_1 with all characters <= 127 --> 0
  RUSSIAN_KOI8_R,        // 25: Teragram KOI8R
  RUSSIAN_CP1251,        // 26: Teragram CP1251
  ISO_8859_1,            // 27: CP1252 aka MSFT euro ascii --> 0
  RUSSIAN_KOI8_RU,       // 28: CP21866 aka KOI8_RU, used for Ukrainian
  MSFT_CP1250,           // 29: CP1250 aka MSFT eastern european
  ISO_8859_1,            // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
  ISO_8859_9,            // 31: used for Turkish
  ISO_8859_13,           // 32: used in Baltic countries --> 43
  ISO_8859_11,           // 33: aka TIS-620, used for Thai
  ISO_8859_11,           // 34: used for Thai --> 33
  MSFT_CP1256,           // 35: used for Arabic
  MSFT_CP1255,           // 36: Logical Hebrew Microsoft
  MSFT_CP1255,           // 37: Iso Hebrew Logical --> 36
  MSFT_CP1255,           // 38: Iso Hebrew Visual --> 36
  CZECH_CP852,           // 39
  ISO_8859_2,            // 40: aka ISO_IR_139 aka KOI8_CS --> 1
  MSFT_CP1253,           // 41: used for Greek, but NOT a superset of 8859-7
  RUSSIAN_CP866,         // 42
  ISO_8859_13,           // 43
  ISO_2022_KR,           // 44
  CHINESE_GB,            // 45 GBK --> 14
  CHINESE_GB,            // 46 GB18030 --> 14
  CHINESE_BIG5,          // 47 BIG5_HKSCS --> 13
  ISO_2022_KR,           // 48 ISO_2022_CN --> 44
  TSCII,                 // 49 Indic encoding
  TAMIL_MONO,            // 50 Indic encoding - Tamil
  TAMIL_BI,              // 51 Indic encoding - Tamil
  JAGRAN,                // 52 Indic encoding - Devanagari
  MACINTOSH_ROMAN,       // 53
  UTF7,                  // 54
  BHASKAR,               // 55 Indic encoding - Devanagari
  HTCHANAKYA,            // 56 Indic encoding - Devanagari
  UTF16BE,               // 57
  UTF16LE,               // 58
  UTF32BE,               // 59
  UTF32LE,               // 60
  BINARYENC,             // 61
  HZ_GB_2312,            // 62
  UTF8UTF8,              // 63
  TAM_ELANGO,            // 64 Elango - Tamil
  TAM_LTTMBARANI,        // 65 Barani - Tamil
  TAM_SHREE,             // 66 Shree - Tamil
  TAM_TBOOMIS,           // 67 TBoomis - Tamil
  TAM_TMNEWS,            // 68 TMNews - Tamil
  TAM_WEBTAMIL,          // 69 Webtamil - Tamil
  KDDI_SHIFT_JIS,        // 70 KDDI Shift_JIS
  DOCOMO_SHIFT_JIS,      // 71 DoCoMo Shift_JIS
  SOFTBANK_SHIFT_JIS,    // 72 SoftBank Shift_JIS
  KDDI_ISO_2022_JP,      // 73 KDDI ISO-2022-JP
  SOFTBANK_ISO_2022_JP,  // 74 SOFTBANK ISO-2022-JP
};
COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,
               kMapEncToBaseEncoding_has_incorrect_size);
// Maps base encodings to 0, supersets to 1+, undesired to -1
// (Non-identity mappings are marked "-->" below.)
// One entry per Encoding enum value, in the same order as
// kMapEncToBaseEncoding; the "-->" targets repeat that table's base mapping.
// The COMPILE_ASSERT after the table checks the entry count against
// NUM_ENCODINGS.
static const int kMapEncToSuperLevel[] = {
  0,   // 0: Teragram ASCII
  0,   // 1: Teragram Latin2
  0,   // 2: in BasisTech but not in Teragram
  0,   // 3: Teragram Latin4
  0,   // 4: Teragram ISO-8859-5
  0,   // 5: Teragram Arabic
  0,   // 6: Teragram Greek
  0,   // 7: Teragram Hebrew
  0,   // 8: in BasisTech but not in Teragram
  0,   // 9: in BasisTech but not in Teragram
  0,   // 10: Teragram EUC_JP
  0,   // 11: Teragram SJS
  0,   // 12: Teragram JIS
  0,   // 13: Teragram BIG5
  0,   // 14: Teragram GB
  0,   // 15: Teragram EUC-CN
  0,   // 16: Teragram KSC
  0,   // 17: Teragram Unicode
  -1,  // 18: Teragram EUC --> 15
  -1,  // 19: Teragram CNS --> 15
  1,   // 20: Teragram BIG5_CP950 --> 13
  1,   // 21: Teragram CP932 --> 11
  0,   // 22
  -1,  // 23
  -1,  // 24: ISO_8859_1 with all characters <= 127 --> 0
  0,   // 25: Teragram KOI8R
  0,   // 26: Teragram CP1251
  1,   // 27: CP1252 aka MSFT euro ascii --> 0
  0,   // 28: CP21866 aka KOI8_RU, used for Ukrainian
  0,   // 29: CP1250 aka MSFT eastern european
  1,   // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
  0,   // 31: used for Turkish
  1,   // 32: used in Baltic countries --> 43
  0,   // 33: aka TIS-620, used for Thai
  1,   // 34: used for Thai --> 33
  0,   // 35: used for Arabic
  0,   // 36: Logical Hebrew Microsoft
  -1,  // 37: Iso Hebrew Logical --> 36
  -1,  // 38: Iso Hebrew Visual --> 36
  0,   // 39
  1,   // 40: aka ISO_IR_139 aka KOI8_CS --> 1
  0,   // 41: used for Greek, NOT superset of 8859-7
  0,   // 42
  0,   // 43
  0,   // 44
  1,   // 45 GBK --> 14
  1,   // 46 GB18030 --> 14
  1,   // 47 BIG5_HKSCS --> 13
  1,   // 48 ISO_2022_CN --> 44
  0,   // 49 Indic encoding
  0,   // 50 Indic encoding - Tamil
  0,   // 51 Indic encoding - Tamil
  0,   // 52 Indic encoding - Devanagari
  0,   // 53
  0,   // 54
  0,   // 55 Indic encoding - Devanagari
  0,   // 56 Indic encoding - Devanagari
  0,   // 57
  0,   // 58
  0,   // 59
  0,   // 60
  0,   // 61
  0,   // 62
  2,   // 63
  0, 0, 0, 0, 0, 0,  // add six more Tamil
  0, 0, 0, 0, 0,     // add five encodings with emoji
};
COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,
               kMapEncToSuperLevel_has_incorrect_size);
// Subscripted by Encoding enum value
// Each entry is a sum of the single-bit k*Active / k*Code masks defined
// earlier in this file; the bits are distinct, so "+" here is equivalent
// to "|". The COMPILE_ASSERT after the table checks the entry count against
// NUM_ENCODINGS.
static const uint32 kSpecialMask[] = {
  kHighAccentCode,                                  // 0
  kHighAccentCode,
  kHighAccentCode,
  kHighAccentCode,
  kHighAlphaCode,                                   // 4
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAccentCode,
  kHighAccentCode,
  kTwobyteCode + kEUCJPActive,                      // 10 euc-jp
  kTwobyteCode,
  kSevenBitActive + kIso2022Active,                 // jis
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,
  kSevenBitActive + kUTF1632Active,                 // Unicode
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,                                     // 20
  kTwobyteCode,
  kUTF8Active,                                      // UTF-8
  0,
  0,
  kHighAlphaCode,                                   // 25
  kHighAlphaCode,
  kHighAccentCode,
  kHighAlphaCode,
  kHighAccentCode,
  kHighAccentCode,                                  // 30
  kHighAccentCode,
  kHighAccentCode,
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAlphaCode,                                   // 35
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAlphaCode,
  0,
  0,                                                // 40
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAccentCode,
  kSevenBitActive + kIso2022Active,                 // 2022-kr
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,
  kSevenBitActive + kIso2022Active,                 // 2022-cn
  kHighAlphaCode + kIsIndicCode,                    // 49 TSCII
  kHighAlphaCode + kIsIndicCode,                    // 50 TAMIL_MONO
  kHighAlphaCode + kIsIndicCode,                    // 51 TAMIL_BI
  kHighAlphaCode + kIsIndicCode,                    // 52 JAGRAN
  kHighAccentCode,                                  // 53 MACINTOSH_ROMAN
  kSevenBitActive + kUTF7Active,                    // 54 UTF-7
  kHighAlphaCode + kIsIndicCode,                    // 55 BHASKAR Indic encoding - Devanagari
  kHighAlphaCode + kIsIndicCode,                    // 56 HTCHANAKYA Indic encoding - Devanagari
  kSevenBitActive + kUTF1632Active,                 // 57 UTF16BE
  kSevenBitActive + kUTF1632Active,                 // 58 UTF16LE
  kSevenBitActive + kUTF1632Active,                 // 59 UTF32BE
  kSevenBitActive + kUTF1632Active,                 // 60 UTF32LE
  kSevenBitActive + kBinaryActive,                  // 61 BINARYENC
  kSevenBitActive + kHzActive,                      // 62 HZ_GB_2312
  kHighAccentCode + kUTF8Active + kUTF8UTF8Active,  // 63 UTF8UTF8
  kHighAlphaCode + kIsIndicCode,                    // 64 Elango - Tamil
  kHighAlphaCode + kIsIndicCode,                    // 65 Barani - Tamil
  kHighAlphaCode + kIsIndicCode,                    // 66 Shree - Tamil
  kHighAlphaCode + kIsIndicCode,                    // 67 TBoomis - Tamil
  kHighAlphaCode + kIsIndicCode,                    // 68 TMNews - Tamil
  kHighAlphaCode + kIsIndicCode,                    // 69 Webtamil - Tamil
  kTwobyteCode,                                     // 70 KDDI Shift_JIS
  kTwobyteCode,                                     // 71 DoCoMo Shift_JIS
  kTwobyteCode,                                     // 72 SoftBank Shift_JIS
  kSevenBitActive + kIso2022Active,                 // 73 KDDI-ISO-2022-JP
  kSevenBitActive + kIso2022Active,                 // 74 SOFTBANK-ISO-2022-JP
};
COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,
               kSpecialMask_has_incorrect_size);
  639. /***
  640. kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents
  641. ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd
  642. RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef
  643. RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef
  644. RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef
  645. RUSSIAN_CP866, // 42 89ae
  646. ISO_8859_6, // 5: Teragram Arabic nocase cde
  647. MSFT_CP1256, // 35: used for Arabic nocase cde
  648. ISO_8859_7, // 6: Teragram Greek UL cdef
  649. MSFT_CP1253, // 41: used for Greek UL cdef
  650. ISO_8859_8, // 7: Teragram Hebrew nocase ef
  651. MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef
  652. ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef
  653. HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef
  654. ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde
  655. MSFT_CP874, // 34: used for Thai nocase abcde
  656. TSCII, // 49 8-f
  657. TAMIL_MONO, // 50
  658. TAMIL_BI, // 51
  659. JAGRAN, // 52
  660. BHASKAR, // 55 Indic encoding - Devanagari
  661. HTCHANAKYA, // 56 Indic encoding - Devanagari
  662. ***/
// We can scan bytes using this at about 500 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY
// non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise
//
// Indexed by byte value, 32 entries per row. Value 0 is used for HT, LF, FF,
// CR and for 0x20-0x7E other than '+' and '~'. Value 1 is used for '+' (0x2B)
// and '~' (0x7E). Value 2 is used for the remaining C0 controls, 0x7F and
// every byte >= 0x80.
static const char kTestPrintableAsciiTildePlus[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
// We can scan bytes using this at about 550 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI and bad C0
// after Hz and UTF7 are pruned away
// We allow Form Feed, 0x0C, here
//
// Indexed by byte value, 32 entries per row. Same layout as
// kTestPrintableAsciiTildePlus except that '+' (0x2B) and '~' (0x7E) are 0
// here, so this table holds only the values 0 and 2.
static const char kTestPrintableAscii[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
// Used in first-four-byte testing
// Indexed by byte value, 32 entries per row. Value 1 is used for HT (0x09),
// LF (0x0A), CR (0x0D) and 0x20-0x7E; every other byte is 0. Unlike the two
// scan tables above, Form Feed (0x0C) is 0 here.
static const char kIsPrintableAscii[256] = {
  0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Indexed by byte value, 16 entries per row. Gives the 6-bit value of a
// Base64 alphabet character ('A'-'Z' = 0-25, 'a'-'z' = 26-51, '0'-'9' = 52-61,
// '+' = 62, '/' = 63) and -1 for every other byte.
static const signed char kBase64Value[256] = {
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,
  52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,
  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
  15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,
  -1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,
  41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
};
// Subscripted by <state, byte/16>
// Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
//
// Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
// which we can mis-parse as an error byte followed by good UTF-8:
// B2 DBB8 D6BD E1B9B9
// To counteract this, we now require an ASCII7 byte to resync out
// of the error state
// Next problem: good UTF-8 with bad byte
// efbc a012 eea4 bee7 b280 c2b7
// efbca0 12 eea4be e7b280 c2b7
//           ^^ bad byte
// fix: change state0 byte 1x to be don't-care
//
// Short UTF-8 ending in ASCII7 byte should resync immediately:
// E0 20 E0 A6 AA should give one error and resync at 2nd E0
//
// Next-state table: row = current state, column = high nibble of the input
// byte, entry = next state. State 7 is the error state. In rows 1-7 the 1x
// column leads to state 7; in row 0 it stays at state 0.
static const char kMiniUTF8State[8][16] = {
  {0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,},  // [0] start char (allow cr/lf/ht)
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},  // [1] continue 1 of 2
  {0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,},  // [2] continue 1 of 3
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},  // [3] continue 2 of 3
  {0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,},  // [4] continue 1 of 4
  {0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,},  // [5] continue 2 of 4
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},  // [6] continue 3 of 4
  {0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,},  // [7] error, soak up continues,
                                        // ONLY resync after Ascii char
                                        // then restart
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Same <state, byte/16> subscripting as kMiniUTF8State: row = current state,
// column = high nibble of the input byte. Nonzero entries 2, 3 and 4 appear
// only where the final continuation byte of a 2-, 3- or 4-byte sequence is
// accepted (rows 1, 3 and 6, columns 8x-Bx).
static const char kMiniUTF8Count[8][16] = {
  {0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,},  // [0] start char (allow cr/lf/ht)
  {1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,},  // [1] continue 1 of 2
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [2] continue 1 of 3
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,},  // [3] continue 2 of 3
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [4] continue 1 of 4
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [5] continue 2 of 4
  {1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,},  // [6] continue 3 of 4
  {0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,},  // [7] error, soak up continues,
                                        // then restart
};
  762. // Subscripted by <state, f(byte1) + g(byte2)>
  763. // where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise
  764. // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
  765. // (no checking for illegal bytes)
  766. // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want
  767. // to detect two, so we can back-convert to one.
  768. // zero one two pattern
  769. // ---- ------ ---------------- -----------------
  770. // 81 C281 C382C281 C3->8x->C2->xx
  771. // 98 CB9C C38BC593 C3->8x->C5->xx
  772. // C3 C383 C383C692 C3->8x->C6->xx
  773. // C8 C388 C383CB86 C3->8x->CB->xx
  774. // 83 C692 C386E28099 C3->8x->E2->xx->8x
  775. // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
  776. // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
  777. //
  778. // We also want to detect bare-byte extra UTF-8 conversions:
  779. // zero one two pattern
  780. // ---- ------ ---------------- -----------------
  781. // C3 C3 C383 C3->8x->C2->xx
  782. // D3 D3 C393 C3->9x->C2->xx->C2->xx
  783. // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
  785. //
  786. /**
  787. CP1252 => UTF8 => UTF8UTF8
  788. 80 => E282AC => C3A2E2809AC2AC
  789. 81 => C281 => C382C281
  790. 82 => E2809A => C3A2E282ACC5A1
  791. 83 => C692 => C386E28099
  792. 84 => E2809E => C3A2E282ACC5BE
  793. 85 => E280A6 => C3A2E282ACC2A6
  794. 86 => E280A0 => C3A2E282ACC2A0
  795. 87 => E280A1 => C3A2E282ACC2A1
  796. 88 => CB86 => C38BE280A0
  797. 89 => E280B0 => C3A2E282ACC2B0
  798. 8A => C5A0 => C385C2A0
  799. 8B => E280B9 => C3A2E282ACC2B9
  800. 8C => C592 => C385E28099
  801. 8D => C28D => C382C28D
  802. 8E => C5BD => C385C2BD
  803. 8F => C28F => C382C28F
  804. 90 => C290 => C382C290
  805. 91 => E28098 => C3A2E282ACCB9C
  806. 92 => E28099 => C3A2E282ACE284A2
  807. 93 => E2809C => C3A2E282ACC593
  808. 94 => E2809D => C3A2E282ACC29D
  809. 95 => E280A2 => C3A2E282ACC2A2
  810. 96 => E28093 => C3A2E282ACE2809C
  811. 97 => E28094 => C3A2E282ACE2809D
  812. 98 => CB9C => C38BC593
  813. 99 => E284A2 => C3A2E2809EC2A2
  814. 9A => C5A1 => C385C2A1
  815. 9B => E280BA => C3A2E282ACC2BA
  816. 9C => C593 => C385E2809C
  817. 9D => C29D => C382C29D
  818. 9E => C5BE => C385C2BE
  819. 9F => C5B8 => C385C2B8
  820. A0 => C2A0 => C382C2A0
  821. A1 => C2A1 => C382C2A1
  822. A2 => C2A2 => C382C2A2
  823. A3 => C2A3 => C382C2A3
  824. A4 => C2A4 => C382C2A4
  825. A5 => C2A5 => C382C2A5
  826. A6 => C2A6 => C382C2A6
  827. A7 => C2A7 => C382C2A7
  828. A8 => C2A8 => C382C2A8
  829. A9 => C2A9 => C382C2A9
  830. AA => C2AA => C382C2AA
  831. AB => C2AB => C382C2AB
  832. AC => C2AC => C382C2AC
  833. AD => C2AD => C382C2AD
  834. AE => C2AE => C382C2AE
  835. AF => C2AF => C382C2AF
  836. B0 => C2B0 => C382C2B0
  837. B1 => C2B1 => C382C2B1
  838. B2 => C2B2 => C382C2B2
  839. B3 => C2B3 => C382C2B3
  840. B4 => C2B4 => C382C2B4
  841. B5 => C2B5 => C382C2B5
  842. B6 => C2B6 => C382C2B6
  843. B7 => C2B7 => C382C2B7
  844. B8 => C2B8 => C382C2B8
  845. B9 => C2B9 => C382C2B9
  846. BA => C2BA => C382C2BA
  847. BB => C2BB => C382C2BB
  848. BC => C2BC => C382C2BC
  849. BD => C2BD => C382C2BD
  850. BE => C2BE => C382C2BE
  851. BF => C2BF => C382C2BF
  852. C0 => C380 => C383E282AC
  853. C1 => C381 => C383C281
  854. C2 => C382 => C383E2809A
  855. C3 => C383 => C383C692
  856. C4 => C384 => C383E2809E
  857. C5 => C385 => C383E280A6
  858. C6 => C386 => C383E280A0
  859. C7 => C387 => C383E280A1
  860. C8 => C388 => C383CB86
  861. C9 => C389 => C383E280B0
  862. CA => C38A => C383C5A0
  863. CB => C38B => C383E280B9
  864. CC => C38C => C383C592
  865. CD => C38D => C383C28D
  866. CE => C38E => C383C5BD
  867. CF => C38F => C383C28F
  868. D0 => C390 => C383C290
  869. D1 => C391 => C383E28098
  870. D2 => C392 => C383E28099
  871. D3 => C393 => C383E2809C
  872. D4 => C394 => C383E2809D
  873. D5 => C395 => C383E280A2
  874. D6 => C396 => C383E28093
  875. D7 => C397 => C383E28094
  876. D8 => C398 => C383CB9C
  877. D9 => C399 => C383E284A2
  878. DA => C39A => C383C5A1
  879. DB => C39B => C383E280BA
  880. DC => C39C => C383C593
  881. DD => C39D => C383C29D
  882. DE => C39E => C383C5BE
  883. DF => C39F => C383C5B8
  884. E0 => C3A0 => C383C2A0
  885. E1 => C3A1 => C383C2A1
  886. E2 => C3A2 => C383C2A2
  887. E3 => C3A3 => C383C2A3
  888. E4 => C3A4 => C383C2A4
  889. E5 => C3A5 => C383C2A5
  890. E6 => C3A6 => C383C2A6
  891. E7 => C3A7 => C383C2A7
  892. E8 => C3A8 => C383C2A8
  893. E9 => C3A9 => C383C2A9
  894. EA => C3AA => C383C2AA
  895. EB => C3AB => C383C2AB
  896. EC => C3AC => C383C2AC
  897. ED => C3AD => C383C2AD
  898. EE => C3AE => C383C2AE
  899. EF => C3AF => C383C2AF
  900. F0 => C3B0 => C383C2B0
  901. F1 => C3B1 => C383C2B1
  902. F2 => C3B2 => C383C2B2
  903. F3 => C3B3 => C383C2B3
  904. F4 => C3B4 => C383C2B4
  905. F5 => C3B5 => C383C2B5
  906. F6 => C3B6 => C383C2B6
  907. F7 => C3B7 => C383C2B7
  908. F8 => C3B8 => C383C2B8
  909. F9 => C3B9 => C383C2B9
  910. FA => C3BA => C383C2BA
  911. FB => C3BB => C383C2BB
  912. FC => C3BC => C383C2BC
  913. FD => C3BD => C383C2BD
  914. FE => C3BE => C383C2BE
  915. FF => C3BF => C383C2BF
  916. **/
  917. // Subscripted by <state, f(byte1) + g(byte2)>
  918. // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise
  919. // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
  920. // 81 C281 C382C281 C3->8x->C2->xx
  921. // 98 CB9C C38BC593 C3->8x->C5->xx
  922. // C3 C383 C383C692 C3->8x->C6->xx
  923. // C8 C388 C383CB86 C3->8x->CB->xx
  924. // [0] [2] [0]
  925. // 83 C692 C386E28099 C3->8x->E2->xx->xx
  926. // odd_byte=0 [0] [2] [0+] odd_byte flipped
  927. // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped
  928. // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
  929. // odd_byte=0 [0] [3] [4] [0+]
  930. // odd_byte=1 [0+] [3] [4] [4] [0]
  931. // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
  932. // odd_byte=0 [0] [3] [4] [0] [0]
  933. // odd_byte=1 [0+] [3] [4] [4] [0+]
  934. //
  935. // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip
  936. // the odd_byte state. If that goes from 0 to 1, the next pair is offset up
  937. // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes
  938. // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.
  939. // These are absorbed with no error in state 0 or state 4
  940. //
  941. // C3 C3 C383 C3->8x->C2->xx
  942. // D3 D3 C393 C3->9x->C2->xx->C2->xx
  943. // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
  944. // F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
  945. // Counter3 for Fx Ex sequences is incremented at last C2
  // Next-state table of the UTF8UTF8 (doubly-converted UTF-8) mini scanner.
  // Row    = current state [0..7].
  // Column = subscript built by UTF88Sub() from the next byte pair:
  //            0..3   any other lead byte
  //            4..7   lead byte E2
  //            8..11  lead byte C2/C5/C6/CB
  //            12..15 lead byte C3
  //          and, inside each group, bits 4-5 of the second byte (8x 9x Ax Bx).
  // In rows [2], [3] and [4] the E2 columns also flip odd_byte
  // (see kMiniUTF8UTF8Odd).
  static const char kMiniUTF8UTF8State[8][16] = {
    // xxxx       E2xx       CXxx       C3xx
    // 8 9 a b    8 9 a b    8 9 a b    8 9 a b
    { 0,0,0,0,   1,1,1,1,   1,1,1,1,   2,2,3,5 },  // [0] looking for C38x/C3Ax/2020/8x8x, or err
    { 0,0,0,0,   1,1,1,1,   1,1,1,1,   2,2,3,5 },  // [1] error, back to looking
    { 1,1,1,1,   0,0,0,0,   0,0,0,0,   1,1,1,1 },  // [2] C38x looking for CXxx/E2xxxx
    { 1,1,1,1,   4,4,4,4,   7,7,7,7,   1,1,1,1 },  // [3] C3Ax looking for E2xx or C2xxC2xx
    { 4,4,4,4,   0,0,0,0,   0,0,0,0,   1,1,1,1 },  // [4] C3AxE2xx-- looking for C2xx/E2xxxx
    { 1,1,1,1,   1,1,1,1,   6,6,6,6,   1,1,1,1 },  // [5] C3Bx -- looking for C2xxC2xxC2xx
    { 1,1,1,1,   1,1,1,1,   7,7,7,7,   1,1,1,1 },  // [6] C3Bx -- looking for C2xxC2xx
    { 1,1,1,1,   1,1,1,1,   0,0,0,0,   1,1,1,1 },  // [7] C3Bx -- looking for C2xx
  };
  // Which counter to bump on each transition of the UTF8UTF8 mini scanner:
  //   0 = don't care, 1 = error, 2 = good_2B, 3 = good_3B, 4 = good_4B
  // Rows and columns are laid out exactly as in kMiniUTF8UTF8State.
  static const char kMiniUTF8UTF8Count[8][16] = {
    // xxxx       E2xx       C2Xx       C3xx
    // 8 9 a b    8 9 a b    8 9 a b    8 9 a b
    { 0,0,0,0,   1,1,1,1,   1,1,1,1,   0,0,0,0 },  // [0] looking for C38x/C3Ax/2020/8x8x, or err
    { 0,0,0,0,   1,1,1,1,   1,1,1,1,   0,0,0,0 },  // [1] error, back to looking
    { 1,1,1,1,   3,3,3,3,   2,2,2,2,   1,1,1,1 },  // [2] C38x looking for CXxx/E2xxxx
    { 1,1,1,1,   0,0,0,0,   0,0,0,0,   1,1,1,1 },  // [3] C3Ax looking for E2xx
    { 1,1,1,1,   4,4,4,4,   4,4,4,4,   1,1,1,1 },  // [4] C3AxE2xx-- looking for C2xx/E2xxxx
    { 1,1,1,1,   1,1,1,1,   0,0,0,0,   1,1,1,1 },  // [5] C3Bx -- looking for C2xxC2xxC2xx
    { 1,1,1,1,   1,1,1,1,   0,0,0,0,   1,1,1,1 },  // [6] C3Bx -- looking for C2xxC2xx
    { 1,1,1,1,   1,1,1,1,   3,3,3,3,   1,1,1,1 },  // [7] C3Bx -- looking for C2xx
  };
  // 1 where a transition of the UTF8UTF8 mini scanner flips odd_byte, else 0.
  // Only the E2 columns of rows [2], [3] and [4] are set.
  // Rows and columns are laid out exactly as in kMiniUTF8UTF8State.
  static const char kMiniUTF8UTF8Odd[8][16] = {
    // xxxx       E2xx       C2Xx       C3xx
    // 8 9 a b    8 9 a b    8 9 a b    8 9 a b
    { 0,0,0,0,   0,0,0,0,   0,0,0,0,   0,0,0,0 },  // [0] looking for C38x/C3Ax/2020/8x8x, or err
    { 0,0,0,0,   0,0,0,0,   0,0,0,0,   0,0,0,0 },  // [1] error, back to looking
    { 0,0,0,0,   1,1,1,1,   0,0,0,0,   0,0,0,0 },  // [2] C38x looking for CXxx/E2xxxx
    { 0,0,0,0,   1,1,1,1,   0,0,0,0,   0,0,0,0 },  // [3] C3Ax looking for E2xx
    { 0,0,0,0,   1,1,1,1,   0,0,0,0,   0,0,0,0 },  // [4] C3AxE2xx-- looking for C2xx/E2xxxx
    { 0,0,0,0,   0,0,0,0,   0,0,0,0,   0,0,0,0 },  // [5] C3Bx -- looking for C2xxC2xxC2xx
    { 0,0,0,0,   0,0,0,0,   0,0,0,0,   0,0,0,0 },  // [6] C3Bx -- looking for C2xxC2xx
    { 0,0,0,0,   0,0,0,0,   0,0,0,0,   0,0,0,0 },  // [7] C3Bx -- looking for C2xx
  };
  // Turn a pair of bytes into the column subscript for the UTF8UTF8 tables.
  //   s0 picks the column group: C3 -> 12, C2/C5/C6/CB -> 8, E2 -> 4, else 0
  //   s1 picks the column inside the group: bits 4-5 (8x->0 9x->1 Ax->2 Bx->3)
  // Result is always in [0..15].
  int UTF88Sub(char s0, char s1) {
    const unsigned char lead = static_cast<unsigned char>(s0);
    int group = 0;
    if (lead == 0xc3) {
      group = 12;
    } else if (lead == 0xe2) {
      group = 4;
    } else if ((lead == 0xc2) || (lead == 0xc5) ||
               (lead == 0xc6) || (lead == 0xcb)) {
      group = 8;
    }
    return group + ((s1 >> 4) & 0x03);
  }
  1007. // Default probability for an encoding rankedencoding
  1008. // Based on a scan of 55M web pages
  1009. // These values are 255 - log base 2**1/10 (occurrences / total)
  1010. // Large values are most likely. This is the reverse of some Google code
  1011. // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)
  1012. //
  1013. // TODO change this to be per encoding, not permuted
  1014. //
  1015. // Support function for unit test program
  1016. // Return ranked encoding corresponding to enc
  1017. // (also exported to compact_enc_det_text.cc)
  1018. int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) {
  1019. for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
  1020. if (kMapToEncoding[i] == enc) {
  1021. return i;
  1022. }
  1023. }
  1024. return -1;
  1025. }
  1026. string DecodeActive(uint32 active) {
  1027. string temp("");
  1028. if (active & kBinaryActive) {
  1029. temp.append("Binary ");
  1030. }
  1031. if (active & kUTF1632Active) {
  1032. temp.append("UTF1632 ");
  1033. }
  1034. if (active & kUTF8UTF8Active) {
  1035. temp.append("UTF8UTF8 ");
  1036. }
  1037. if (active & kUTF8Active) {
  1038. temp.append("UTF8 ");
  1039. }
  1040. if (active & kIso2022Active) {
  1041. temp.append("Iso2022 ");
  1042. }
  1043. if (active & kHzActive) {
  1044. temp.append("Hz ");
  1045. }
  1046. if (active & kUTF7Active) {
  1047. temp.append("UTF7A ");
  1048. }
  1049. if (active & kSevenBitActive) {
  1050. temp.append("SevenBit ");
  1051. }
  1052. if (active & kIsIndicCode) {
  1053. temp.append("Indic ");
  1054. }
  1055. if (active & kHighAlphaCode) {
  1056. temp.append("HighAlpha ");
  1057. }
  1058. if (active & kHighAccentCode) {
  1059. temp.append("HighAccent ");
  1060. }
  1061. if (active & kEUCJPActive) {
  1062. temp.append("EUCJP ");
  1063. }
  1064. return temp;
  1065. }
  1066. static inline bool SevenBitEncoding(int enc) {
  1067. return ((kSpecialMask[enc] & kSevenBitActive) != 0);
  1068. }
  1069. static inline bool TwoByteEncoding(int enc) {
  1070. return ((kSpecialMask[enc] & kTwobyteCode) != 0);
  1071. }
  1072. static inline bool IndicEncoding(int enc) {
  1073. return ((kSpecialMask[enc] & kIsIndicCode) != 0);
  1074. }
  1075. static inline bool HighAlphaEncoding(int enc) {
  1076. return ((kSpecialMask[enc] & kHighAlphaCode) != 0);
  1077. }
  1078. static inline bool HighAccentEncoding(int enc) {
  1079. return ((kSpecialMask[enc] & kHighAccentCode) != 0);
  1080. }
  1081. static inline bool AnyActive(DetectEncodingState* destatep) {
  1082. return (destatep->active_special != 0);
  1083. }
  1084. static inline bool SevenBitActive(DetectEncodingState* destatep) {
  1085. return (destatep->active_special & kSevenBitActive) != 0;
  1086. }
  1087. static inline bool HzActive(DetectEncodingState* destatep) {
  1088. return (destatep->active_special & kHzActive) != 0;
  1089. }
  1090. static inline bool Iso2022Active(DetectEncodingState* destatep) {
  1091. return (destatep->active_special & kIso2022Active) != 0;
  1092. }
  1093. static inline bool UTF8Active(DetectEncodingState* destatep) {
  1094. return (destatep->active_special & kUTF8Active) != 0;
  1095. }
  1096. static inline bool UTF8UTF8Active(DetectEncodingState* destatep) {
  1097. return (destatep->active_special & kUTF8UTF8Active) != 0;
  1098. }
  1099. static inline bool UTF1632Active(DetectEncodingState* destatep) {
  1100. return (destatep->active_special & kUTF1632Active) != 0;
  1101. }
  1102. static inline bool BinaryActive(DetectEncodingState* destatep) {
  1103. return (destatep->active_special & kBinaryActive) != 0;
  1104. }
  1105. static inline bool UTF7OrHzActive(DetectEncodingState* destatep) {
  1106. return (destatep->active_special & (kHzActive + kUTF7Active)) != 0;
  1107. }
  1108. static inline bool EUCJPActive(DetectEncodingState* destatep) {
  1109. return ((destatep->active_special & kEUCJPActive) != 0);
  1110. }
  1111. static inline bool OtherActive(DetectEncodingState* destatep) {
  1112. return (destatep->active_special & (kIso2022Active + kBinaryActive +
  1113. kUTF8Active + kUTF8UTF8Active +
  1114. kUTF1632Active + kEUCJPActive)) != 0;
  1115. }
  1116. static inline bool CEDFlagRescanning(CEDInternalFlags flags) {
  1117. return (flags & kCEDRescanning) != 0;
  1118. }
  1119. static inline bool CEDFlagForceTags(CEDInternalFlags flags) {
  1120. return (flags & kCEDForceTags) != 0;
  1121. }
  // Larger of two ints (returns b when they are equal)
  static inline int maxint(int a, int b) {
    if (a > b) {
      return a;
    }
    return b;
  }

  // Smaller of two ints (returns b when they are equal)
  static inline int minint(int a, int b) {
    if (a < b) {
      return a;
    }
    return b;
  }
  1124. static inline const char* MyRankedEncName(int r_enc) {
  1125. return MyEncodingName(kMapToEncoding[r_enc]);
  1126. }
  // State for the do-src debug dump below.
  // Only for debugging. None of this is thread safe.
  static const int kPsSourceWidth = 32;
  static int pssourcenext = 0;               // dump only offsets >= this
  static int pssourcewidth = 0;              // source bytes per dumped line
  static char* pssource_mark_buffer = NULL;  // 2 mark chars per source byte

  int next_do_src_line = 0;      // count of source lines dumped so far
  int do_src_offset[16] = {0};   // source offset of recent lines, mod 16
  1134. void PsSourceInit(int len) {
  1135. pssourcenext = 0;
  1136. pssourcewidth = len;
  1137. delete[] pssource_mark_buffer;
  1138. // Allocate 2 Ascii characters per input byte
  1139. pssource_mark_buffer = new char[(pssourcewidth * 2) + 8]; // 8 = overscan
  1140. memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
  1141. memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
  1142. next_do_src_line = 0;
  1143. memset(do_src_offset, 0, sizeof(do_src_offset));
  1144. }
  1145. void PsSourceFinish() {
  1146. // Print preceding mark buffer
  1147. int j = (pssourcewidth * 2) - 1;
  1148. while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
  1149. pssource_mark_buffer[j + 1] = '\0';
  1150. fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
  1151. memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
  1152. memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
  1153. delete[] pssource_mark_buffer;
  1154. pssource_mark_buffer = NULL;
  1155. }
  1156. // Dump aligned len bytes src... if not already dumped
  1157. void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) {
  1158. int offset = src - isrc;
  1159. offset -= (offset % pssourcewidth); // round down to multiple of len bytes
  1160. if (offset < pssourcenext) {
  1161. return;
  1162. }
  1163. pssourcenext = offset + pssourcewidth; // Min offset for next dump
  1164. // Print preceding mark buffer
  1165. int j = (pssourcewidth * 2) - 1;
  1166. while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
  1167. pssource_mark_buffer[j + 1] = '\0';
  1168. fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
  1169. memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
  1170. memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
  1171. // Print source bytes
  1172. const uint8* src_aligned = isrc + offset;
  1173. int length = srclimit - src_aligned;
  1174. length = minint(pssourcewidth, length);
  1175. fprintf(stderr, "(%05x ", offset);
  1176. for (int i = 0; i < length; ++i) {
  1177. char c = src_aligned[i];
  1178. if (c == '\n') {c = ' ';}
  1179. if (c == '\r') {c = ' ';}
  1180. if (c == '\t') {c = ' ';}
  1181. if (c == '(') {
  1182. fprintf(stderr, "%s", "\\( ");
  1183. } else if (c == ')') {
  1184. fprintf(stderr, "%s", "\\) ");
  1185. } else if (c == '\\') {
  1186. fprintf(stderr, "%s", "\\\\ ");
  1187. } else if ((0x20 <= c) && (c <= 0x7e)) {
  1188. fprintf(stderr, "%c ", c);
  1189. } else {
  1190. fprintf(stderr, "%02x", c);
  1191. }
  1192. }
  1193. fprintf(stderr, ") do-src\n");
  1194. // Remember which source offsets are where, mod 16
  1195. do_src_offset[next_do_src_line & 0x0f] = offset;
  1196. ++next_do_src_line;
  1197. }
  1198. // Mark bytes in just-previous source bytes
  1199. void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) {
  1200. int offset = src - isrc;
  1201. offset = (offset % pssourcewidth); // mod len bytes
  1202. char mark = (weightshift == 0) ? '-' : 'x';
  1203. pssource_mark_buffer[(offset * 2)] = '=';
  1204. pssource_mark_buffer[(offset * 2) + 1] = '=';
  1205. for (int i = 1; i < len; ++i) {
  1206. pssource_mark_buffer[(offset + i) * 2] = mark;
  1207. pssource_mark_buffer[((offset + i) * 2) + 1] = mark;
  1208. }
  1209. }
  1210. // Highlight trigram bytes in just-previous source bytes
  1211. // Unfortunately, we have to skip back N lines since source was printed for
  1212. // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better
  1213. void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) {
  1214. int offset = (src + 1) - isrc;
  1215. int offset32 = (offset % pssourcewidth); // mod len bytes
  1216. offset -= offset32; // round down to multiple of len bytes
  1217. for (int i = 1; i <= 16; ++i) {
  1218. if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) {
  1219. fprintf(stderr, "%d %d %d do-highlight%d\n",
  1220. i, offset32 - 1, trigram_val, n);
  1221. break;
  1222. }
  1223. }
  1224. }
  1225. void InitDetectEncodingState(DetectEncodingState* destatep) {
  1226. destatep->initial_src = NULL; // Filled in by caller
  1227. destatep->limit_src = NULL;
  1228. destatep->prior_src = NULL;
  1229. destatep->last_pair = NULL;
  1230. destatep->debug_data = NULL;
  1231. destatep->next_detail_entry = 0;
  1232. destatep->done = false;
  1233. destatep->reliable = false;
  1234. destatep->hints_derated = false;
  1235. //destatep->declared_enc_1 init in ApplyHints
  1236. //destatep->declared_enc_2 init in ApplyHints
  1237. destatep->prune_count = 0;
  1238. destatep->trigram_highwater_mark = 0;
  1239. destatep->looking_for_latin_trigrams = false;
  1240. destatep->do_latin_trigrams = false;
  1241. // Miscellaneous state variables for difficult encodings
  1242. destatep->binary_quadrants_count = 0;
  1243. destatep->binary_8x4_count = 0;
  1244. destatep->binary_quadrants_seen = 0;
  1245. destatep->binary_8x4_seen = 0;
  1246. destatep->utf7_starts = 0;
  1247. destatep->prior_utf7_offset = 0;
  1248. destatep->next_utf8_ministate = 0;
  1249. for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;}
  1250. destatep->next_utf8utf8_ministate = 0;
  1251. destatep->utf8utf8_odd_byte = 0;
  1252. for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;}
  1253. destatep->next_2022_state = SOSI_NONE;
  1254. destatep->next_hz_state = SOSI_NONE;
  1255. destatep->next_eucjp_oddphase = false;
  1256. for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;}
  1257. destatep->active_special = 0xffffffff;
  1258. destatep->tld_hint = UNKNOWN_ENCODING;
  1259. destatep->http_hint = UNKNOWN_ENCODING;
  1260. destatep->meta_hint = UNKNOWN_ENCODING;
  1261. destatep->bom_hint = UNKNOWN_ENCODING;
  1262. destatep->top_rankedencoding = 0; // ASCII [seven-bit] is the default
  1263. destatep->second_top_rankedencoding = 0; // ASCII [seven-bit] is the default
  1264. destatep->top_prob = -1;
  1265. destatep->second_top_prob = -1;
  1266. // This is wide for first pruning, shrinks for 2nd and later
  1267. destatep->prune_difference = kInititalPruneDifference;
  1268. destatep->next_prior_bigram = 0;
  1269. destatep->prior_bigram[0] = -1;
  1270. destatep->prior_bigram[1] = -1;
  1271. destatep->prior_bigram[2] = -1;
  1272. destatep->prior_bigram[3] = -1;
  1273. destatep->prior_binary[0] = -1;
  1274. // Initialize with all but Indic encodings, which we never detect
  1275. int k = 0;
  1276. for (int rankedencoding = 0;
  1277. rankedencoding < NUM_RANKEDENCODING;
  1278. rankedencoding++) {
  1279. Encoding enc = kMapToEncoding[rankedencoding];
  1280. if (!IndicEncoding(enc)) {
  1281. destatep->rankedencoding_list[k++] = rankedencoding;
  1282. }
  1283. }
  1284. destatep->rankedencoding_list_len = k;
  1285. // This is where all the action is
  1286. memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob));
  1287. memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob));
  1288. memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight));
  1289. destatep->prior_interesting_pair[AsciiPair] = 0;
  1290. destatep->prior_interesting_pair[OtherPair] = 0;
  1291. destatep->next_interesting_pair[AsciiPair] = 0;
  1292. destatep->next_interesting_pair[OtherPair] = 0;
  1293. // interesting_pairs/offsets/weightshifts not initialized; no need
  1294. }
  1295. // Probability strings are uint8, with zeros removed via simple run-length:
  1296. // (<skip-take byte> <data bytes>)*
  1297. // skip-take:
  1298. // 00 end
  1299. // x0 skip 16 x locations, take 0 data values
  1300. // xy skip x locations, take y data values
  1301. // Multiply all the incoming values by 3 to account for 3x unigram sums
  1302. //
  1303. // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,
  1304. // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"
  1305. //
  1306. // Weight is 0..100 percent
  1307. //
  1308. // Returns subscript of largest (most probable) value
  1309. //
  1310. // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"
  1311. // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASCII-7-bit]
  1312. int ApplyCompressedProb(const char* iprob, int len,
  1313. int weight, DetectEncodingState* destatep) {
  1314. int* dst = &destatep->enc_prob[0];
  1315. int* dst2 = &destatep->hint_weight[0];
  1316. const uint8* prob = reinterpret_cast<const uint8*>(iprob);
  1317. const uint8* problimit = prob + len;
  1318. int largest = -1;
  1319. int subscript_of_largest = 0;
  1320. // Continue with first byte and subsequent ones
  1321. while (prob < problimit) {
  1322. int skiptake = *prob++;
  1323. int skip = (skiptake & 0xf0) >> 4;
  1324. int take = skiptake & 0x0f;
  1325. if (skiptake == 00) {
  1326. break;
  1327. } else if (take == 0) {
  1328. dst += (skip << 4);
  1329. dst2 += (skip << 4);
  1330. } else {
  1331. dst += skip; // Normal case
  1332. dst2 += skip; // Normal case
  1333. for (int i = 0; i < take; i++) {
  1334. int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i;
  1335. if (largest < prob[i]) {
  1336. largest = prob[i];
  1337. subscript_of_largest = enc;
  1338. }
  1339. int increment = prob[i] * 3; // The actual increment
  1340. // Do maximum of previous hints plus this new one
  1341. if (weight > 0) {
  1342. increment = (increment * weight) / 100;
  1343. dst[i] = maxint(dst[i], increment);
  1344. dst2[i] = 1; // New total weight
  1345. }
  1346. }
  1347. prob += take;
  1348. dst += take;
  1349. dst2 += take;
  1350. }
  1351. }
  1352. return subscript_of_largest;
  1353. }
  // Returns subscript of largest (most probable) value [for unit test]
  //   iprob  run-length compressed vector: (<skip-take byte> <data bytes>)*
  //          00 ends; x0 skips 16*x positions; xy skips x, then takes y values
  //   len    maximum number of bytes of iprob to read
  // Returns 0 if there is no data value above zero. On ties the first wins.
  // A take count that runs past the end of the vector is cut short, so a
  // truncated vector is never read beyond iprob + len.
  int TopCompressedProb(const char* iprob, int len) {
    const unsigned char* prob = reinterpret_cast<const unsigned char*>(iprob);
    const unsigned char* problimit = prob + len;
    int next_prob_sub = 0;
    int topprob = 0;
    int toprankenc = 0;

    while (prob < problimit) {
      int skiptake = *prob++;
      int skip = (skiptake & 0xf0) >> 4;
      int take = skiptake & 0x0f;
      if (skiptake == 0) {
        break;
      } else if (take == 0) {
        next_prob_sub += (skip << 4);
      } else {
        // Never take more data bytes than the vector still holds
        const int remaining = static_cast<int>(problimit - prob);
        if (take > remaining) {take = remaining;}

        next_prob_sub += skip;      // Normal case
        for (int i = 0; i < take; i++) {
          if (topprob < prob[i]) {
            topprob = prob[i];
            toprankenc = next_prob_sub + i;
          }
        }
        prob += take;
        next_prob_sub += take;
      }
    }
    return toprankenc;
  }
  1383. // Find subscript of matching key in first 8 bytes of sorted hint array, or -1
  1384. int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,
  1385. const char* norm_key) {
  1386. // Key is always in range [lo..hi)
  1387. int lo = 0;
  1388. int hi = hintprobssize;
  1389. while (lo < hi) {
  1390. int mid = (lo + hi) >> 1;
  1391. int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8);
  1392. if (comp < 0) {
  1393. lo = mid + 1;
  1394. } else if (comp > 0) {
  1395. hi = mid;
  1396. } else {
  1397. return mid;
  1398. }
  1399. }
  1400. return -1;
  1401. }
  1402. // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
  1403. int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
  1404. const char* norm_key) {
  1405. // Key is always in range [lo..hi)
  1406. int lo = 0;
  1407. int hi = hintprobssize;
  1408. while (lo < hi) {
  1409. int mid = (lo + hi) >> 1;
  1410. int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4);
  1411. if (comp < 0) {
  1412. lo = mid + 1;
  1413. } else if (comp > 0) {
  1414. hi = mid;
  1415. } else {
  1416. return mid;
  1417. }
  1418. }
  1419. return -1;
  1420. }
  1421. static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) {
  1422. destatep->enc_prob[r_enc] += boost;
  1423. }
  1424. static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) {
  1425. destatep->enc_prob[r_enc] -= whack;
  1426. }
  1427. // Apply initial probability hint based on top level domain name
  1428. // Weight is 0..100 percent
  1429. // Return 1 if name match found
  1430. int ApplyTldHint(const char* url_tld_hint, int weight,
  1431. DetectEncodingState* destatep) {
  1432. if (url_tld_hint[0] == '~') {
  1433. return 0;
  1434. }
  1435. string normalized_tld = MakeChar4(string(url_tld_hint));
  1436. int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
  1437. normalized_tld.c_str());
  1438. if (n >= 0) {
  1439. // TLD is four bytes, probability table is ~12 bytes
  1440. int best_sub = ApplyCompressedProb((const char *)&kTLDHintProbs[n].key_prob[kMaxTldKey],
  1441. kMaxTldVector, weight, destatep);
  1442. // Never boost ASCII7; do CP1252 instead
  1443. if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
  1444. destatep->declared_enc_1 = best_sub;
  1445. if (destatep->debug_data != NULL) {
  1446. // Show TLD hint
  1447. SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint);
  1448. }
  1449. return 1;
  1450. }
  1451. return 0;
  1452. }
  1453. // Apply initial probability hint based on charset= name
  1454. // Weight is 0..100 percent
  1455. // Return 1 if name match found
  1456. int ApplyCharsetHint(const char* charset_hint, int weight,
  1457. DetectEncodingState* destatep) {
  1458. if (charset_hint[0] == '~') {
  1459. return 0;
  1460. }
  1461. string normalized_charset = MakeChar44(string(charset_hint));
  1462. int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
  1463. normalized_charset.c_str());
  1464. if (n >= 0) {
  1465. // Charset is eight bytes, probability table is ~eight bytes
  1466. int best_sub = ApplyCompressedProb((const char *)&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
  1467. kMaxCharsetVector, weight, destatep);
  1468. // Never boost ASCII7; do CP1252 instead
  1469. if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
  1470. destatep->declared_enc_1 = best_sub;
  1471. // If first explicitly declared charset is confusable with Latin1/1252, put
  1472. // both declared forms in declared_enc_*, displacing Latin1/1252.
  1473. // This avoids a bit of Latin1 creep.
  1474. // Also boost the declared encoding and its pair
  1475. // TODO: This should all be folded into postproc-enc-detect.cc
  1476. if ((destatep->http_hint == UNKNOWN_ENCODING) &&
  1477. (destatep->meta_hint == UNKNOWN_ENCODING)) {
  1478. // This is the first charset=hint
  1479. switch (best_sub) {
  1480. case F_Latin2: // 8859-2 Latin2, east euro
  1481. destatep->declared_enc_2 = F_CP1250;
  1482. Boost(destatep, F_Latin2, kGentleOnePair);
  1483. Boost(destatep, F_CP1250, kGentleOnePair);
  1484. break;
  1485. case F_CP1250:
  1486. destatep->declared_enc_2 = F_Latin2;
  1487. Boost(destatep, F_Latin2, kGentleOnePair);
  1488. Boost(destatep, F_CP1250, kGentleOnePair);
  1489. break;
  1490. case F_Latin3: // 8859-3 Latin3, south euro, Esperanto
  1491. destatep->declared_enc_2 = F_ASCII_7_bit;
  1492. Boost(destatep, F_Latin3, kGentleOnePair);
  1493. break;
  1494. case F_Latin4: // 8859-4 Latin4, north euro
  1495. destatep->declared_enc_2 = F_ASCII_7_bit;
  1496. Boost(destatep, F_Latin4, kGentleOnePair);
  1497. break;
  1498. case F_ISO_8859_5: // 8859-5 Cyrillic
  1499. destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251
  1500. Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different)
  1501. break;
  1502. case F_CP1251:
  1503. destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5
  1504. Boost(destatep, F_CP1251, kGentleOnePair); // (too different)
  1505. break;
  1506. case F_Arabic: // 8859-6 Arabic
  1507. destatep->declared_enc_2 = F_CP1256;
  1508. Boost(destatep, F_Arabic, kGentleOnePair);
  1509. Boost(destatep, F_CP1256, kGentleOnePair);
  1510. break;
  1511. case F_CP1256:
  1512. destatep->declared_enc_2 = F_Arabic;
  1513. Boost(destatep, F_Arabic, kGentleOnePair);
  1514. Boost(destatep, F_CP1256, kGentleOnePair);
  1515. break;
  1516. case F_Greek: // 8859-7 Greek
  1517. destatep->declared_enc_2 = F_CP1253;
  1518. Boost(destatep, F_Greek, kGentleOnePair);
  1519. Boost(destatep, F_CP1253, kGentleOnePair);
  1520. break;
  1521. case F_CP1253:
  1522. destatep->declared_enc_2 = F_Greek;
  1523. Boost(destatep, F_Greek, kGentleOnePair);
  1524. Boost(destatep, F_CP1253, kGentleOnePair);
  1525. break;
  1526. case F_Hebrew: // 8859-8 Hebrew
  1527. destatep->declared_enc_2 = F_CP1255;
  1528. Boost(destatep, F_Hebrew, kGentleOnePair);
  1529. Boost(destatep, F_CP1255, kGentleOnePair);
  1530. break;
  1531. case F_CP1255:
  1532. destatep->declared_enc_2 = F_Hebrew;
  1533. Boost(destatep, F_Hebrew, kGentleOnePair);
  1534. Boost(destatep, F_CP1255, kGentleOnePair);
  1535. break;
  1536. case F_Latin5: // 8859-9 Latin5, Turkish
  1537. destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254
  1538. Boost(destatep, F_Latin5, kGentleOnePair); // (too different)
  1539. break;
  1540. case F_CP1254:
  1541. destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5
  1542. Boost(destatep, F_CP1254, kGentleOnePair); // (too different)
  1543. break;
  1544. case F_Latin6: // 8859-10 Latin6, Nordic
  1545. destatep->declared_enc_2 = F_ASCII_7_bit;
  1546. Boost(destatep, F_Latin6, kGentleOnePair);
  1547. break;
  1548. case F_ISO_8859_11: // 8859-11 Thai,
  1549. destatep->declared_enc_2 = F_CP874;
  1550. Boost(destatep, F_ISO_8859_11, kGentleOnePair);
  1551. Boost(destatep, F_CP874, kGentleOnePair);
  1552. break;
  1553. case F_CP874:
  1554. destatep->declared_enc_2 = F_ISO_8859_11;
  1555. Boost(destatep, F_ISO_8859_11, kGentleOnePair);
  1556. Boost(destatep, F_CP874, kGentleOnePair);
  1557. break;
  1558. case F_ISO_8859_13: // 8859-13 Latin7, Baltic
  1559. destatep->declared_enc_2 = F_CP1257;
  1560. Boost(destatep, F_ISO_8859_13, kGentleOnePair);
  1561. Boost(destatep, F_CP1257, kGentleOnePair);
  1562. break;
  1563. case F_CP1257:
  1564. destatep->declared_enc_2 = F_ISO_8859_13;
  1565. Boost(destatep, F_ISO_8859_13, kGentleOnePair);
  1566. Boost(destatep, F_CP1257, kGentleOnePair);
  1567. break;
  1568. case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1
  1569. destatep->declared_enc_2 = F_ASCII_7_bit;
  1570. Boost(destatep, F_ISO_8859_15, kGentleOnePair);
  1571. break;
  1572. // Greek all-caps is confusable with KOI8x all-lower and Hebrew.
  1573. // This turns some Greek documents into Cyrillic, etc. by mistake.
  1574. // Greek and Hebrew are boosted explicitly above; do KOI8x here.
  1575. // Boosting the declared encodingmakes it harder for the wrong one to
  1576. // creep up.
  1577. case F_KOI8R:
  1578. Boost(destatep, F_KOI8R, kGentleOnePair);
  1579. break;
  1580. case F_KOI8U:
  1581. Boost(destatep, F_KOI8U, kGentleOnePair);
  1582. break;
  1583. default:
  1584. break;
  1585. }
  1586. }
  1587. if (destatep->debug_data != NULL) {
  1588. // Show charset hint
  1589. SetDetailsEncProb(destatep, 0, best_sub, charset_hint);
  1590. }
  1591. //
  1592. // Some fix-ups for the declared encodings
  1593. //
  1594. // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos
  1595. // TODO: This should all be folded into postproc-enc-detect.cc
  1596. if ((best_sub != F_UTF8) &&
  1597. (best_sub != F_Latin1) &&
  1598. (best_sub != F_CP1252)) {
  1599. Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote
  1600. }
  1601. // Latin2 and CP1250 differ in the overlap part, such as B1 or B9
  1602. // The initial probabilites for charset=Latin2 explicitly put CP1250
  1603. // down twice as far as normal, and vice versa. This is done in
  1604. // postproc-enc-detect.cc
  1605. // If charset=user-defined, treat as Binary --
  1606. // we can safely only do low ASCII, might be Indic
  1607. if (normalized_charset.substr(0,4) == "user") {
  1608. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  1609. }
  1610. return 1;
  1611. }
  1612. return 0;
  1613. }
  1614. // Apply initial probability hint based on caller-supplied encoding
  1615. // Negative hint whacks ~encoding, non-negative boosts encoding
  1616. //
  1617. // Negative hints are an experiment to see if they might be useful.
  1618. // Not operator used instead of unary minus to allow specifying not-zero
  1619. int ApplyEncodingHint(const int encoding_hint, int weight,
  1620. DetectEncodingState* destatep) {
  1621. Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ?
  1622. ~encoding_hint : encoding_hint);
  1623. // Map to the right internal subscript
  1624. int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint);
  1625. // I'm not sure how strong this hint should be. Weight 100% = 1 bigram
  1626. int increment = (kBoostOnePair * weight) / 100;
  1627. if (encoding_hint < 0) {
  1628. destatep->enc_prob[rankedenc_hint] -= increment;
  1629. } else {
  1630. destatep->enc_prob[rankedenc_hint] += increment;
  1631. }
  1632. if (destatep->debug_data != NULL) {
  1633. // Show encoding hint
  1634. SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint));
  1635. }
  1636. return 1;
  1637. }
  1638. // Apply initial probability hint based on user interface language
  1639. // Weight is 0..100 percent
  1640. // Return 1 if name match found
  1641. int ApplyUILanguageHint(const Language language_hint,
  1642. int weight, DetectEncodingState* destatep) {
  1643. if (language_hint == UNKNOWN_LANGUAGE) {
  1644. return 0;
  1645. }
  1646. string normalized_lang = MakeChar8(LanguageName(language_hint));
  1647. int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
  1648. normalized_lang.c_str());
  1649. if (n >= 0) {
  1650. // Language is eight bytes, probability table is ~eight bytes
  1651. int best_sub = ApplyCompressedProb((const char *)&kLangHintProbs[n].key_prob[kMaxLangKey],
  1652. kMaxLangVector, weight, destatep);
  1653. // Never boost ASCII7; do CP1252 instead
  1654. if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
  1655. destatep->declared_enc_1 = best_sub;
  1656. if (destatep->debug_data != NULL) {
  1657. // Show language hint
  1658. SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str());
  1659. }
  1660. return 1;
  1661. }
  1662. return 0;
  1663. }
  1664. // Apply initial probability hint based on corpus type (web, email, etc)
  1665. // Return 1 if name match found
  1666. int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
  1667. DetectEncodingState* destatep) {
  1668. for (int i = 0; i < NUM_RANKEDENCODING; i++) {
  1669. // Set the default probability
  1670. destatep->enc_prob[i] = kDefaultProb[i] * 3;
  1671. // Deliberately set 2022 seven-bit encodings to zero,
  1672. // so we can look for actual use
  1673. // TODO: This should all be folded into postproc-enc-detect.cc
  1674. if (SevenBitEncoding(kMapToEncoding[i])) {
  1675. destatep->enc_prob[i] = 0;
  1676. }
  1677. }
  1678. // A little corpus distinction
  1679. switch (corpus_type) {
  1680. case CompactEncDet::WEB_CORPUS:
  1681. case CompactEncDet::XML_CORPUS:
  1682. // Allow double-converted UTF-8 to start nearly equal to normal UTF-8
  1683. destatep->enc_prob[F_UTF8UTF8] =
  1684. destatep->enc_prob[F_UTF8] - kSmallInitDiff;
  1685. break;
  1686. case CompactEncDet::QUERY_CORPUS:
  1687. case CompactEncDet::EMAIL_CORPUS:
  1688. default:
  1689. break;
  1690. }
  1691. if (FLAGS_demo_nodefault) {
  1692. // Demo, make initial probs all zero
  1693. for (int i = 0; i < NUM_RANKEDENCODING; i++) {
  1694. destatep->enc_prob[i] = 0;
  1695. }
  1696. }
  1697. if (destatep->debug_data != NULL) {
  1698. // Show default hint
  1699. SetDetailsEncProb(destatep, 0, -1, "Default");
  1700. }
  1701. return 1;
  1702. }
  1703. // Do reverse search for c in [str..str+len)
  1704. // Note: initial pointer is to FRONT of string, not back
  1705. const char* MyMemrchr(const char* str, char c, size_t len) {
  1706. const char* ret = str + len;
  1707. while (str <= --ret) {
  1708. if (*ret == c) {return ret;}
  1709. }
  1710. return NULL;
  1711. }
  1712. // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
  1713. // Now that we are no longer trying to do Indic font-based encodigns, we
  1714. // don't need the full URL and can go back to simple TLD. This test remains for
  1715. // backwards compatility with any caller using full URL.
  1716. static const int kMinURLLength = 11;
  1717. // Extract TLD from a full URL or just a TLD
  1718. // Return hostname and length if a full URL
  1719. void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,
  1720. const char** ret_host_start, int* ret_host_len) {
  1721. // url_hint can either be a full URL (preferred) or just top-level domain name
  1722. // Extract the TLD from a full URL and use it for
  1723. // a normal TLD hint
  1724. strncpy(tld_hint, "~", tld_hint_len);
  1725. tld_hint[tld_hint_len - 1] = '\0';
  1726. *ret_host_start = NULL;
  1727. *ret_host_len = 0;
  1728. int url_len = (url_hint != NULL) ? strlen(url_hint) : 0;
  1729. if (url_len == 0) {
  1730. // Empty TLD
  1731. return;
  1732. }
  1733. // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
  1734. if (kMinURLLength <= url_len) {
  1735. // See if it really is a URL
  1736. const char* first_slash = strchr(url_hint, '/');
  1737. if ((first_slash != NULL) && (first_slash != url_hint) &&
  1738. (first_slash[-1] == ':') && (first_slash[1] == '/') &&
  1739. (memrchr(url_hint, '.', first_slash - url_hint) == NULL)) {
  1740. // We found :// and no dot in front of it, so declare a real URL
  1741. const char* hostname_start = first_slash + 2;
  1742. const char* hostname_end = strchr(hostname_start, '/');
  1743. if (hostname_end == NULL) {
  1744. // No slash; end is first byte off end of the URL string
  1745. hostname_end = url_hint + url_len;
  1746. }
  1747. size_t hostname_len = hostname_end - hostname_start;
  1748. const char* port_start =
  1749. (const char*)memchr(hostname_start, ':', hostname_len);
  1750. if (port_start != NULL) {
  1751. // Port; shorten hostname
  1752. hostname_end = port_start;
  1753. hostname_len = hostname_end - hostname_start;
  1754. }
  1755. const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len);
  1756. if (tld_start != NULL) {
  1757. // Remember the TLD we just found
  1758. int tld_len = hostname_start + hostname_len - tld_start - 1;
  1759. if (tld_len > (tld_hint_len - 1)) {
  1760. tld_len = tld_hint_len - 1;
  1761. }
  1762. memcpy(tld_hint, tld_start + 1, tld_len);
  1763. tld_hint[tld_len] = '\0';
  1764. }
  1765. *ret_host_start = hostname_start;
  1766. *ret_host_len = hostname_len;
  1767. return;
  1768. }
  1769. } else {
  1770. strncpy(tld_hint, url_hint, tld_hint_len);
  1771. tld_hint[tld_hint_len - 1] = '\0';
  1772. }
  1773. }
  1774. // Apply hints, if any, to probabilities
  1775. // NOTE: Encoding probabilites are all zero at this point
  1776. void ApplyHints(const char* url_hint,
  1777. const char* http_charset_hint,
  1778. const char* meta_charset_hint,
  1779. const int encoding_hint,
  1780. const Language language_hint,
  1781. const CompactEncDet::TextCorpusType corpus_type,
  1782. DetectEncodingState* destatep) {
  1783. int hint_count = 0;
  1784. // url_hint can either be a full URL (preferred) or just top-level domain name
  1785. // Extract the TLD from a full URL and use it for
  1786. // a normal TLD hint
  1787. char tld_hint[16];
  1788. const char* hostname_start = NULL;
  1789. int hostname_len = 0;
  1790. ExtractTLD(url_hint, tld_hint, sizeof(tld_hint),
  1791. &hostname_start, &hostname_len);
  1792. // Initial hints give slight boost to Ascii-7-bit and code page 1252
  1793. // ApplyXxx routines copy enc_1 to enc_2 then update declared_enc_1
  1794. // This gives a boost to 1252 if one of HTTP/META is specified,
  1795. // but this could be the wrong thing to do if Latin2/3/4/etc. is specified
  1796. destatep->declared_enc_1 = F_CP1252;
  1797. destatep->declared_enc_2 = F_ASCII_7_bit;
  1798. // Applying various hints takes max of new hint and any old hint.
  1799. // This does better on multiple hints that a weighted average
  1800. // Weight is 0..100 percent
  1801. if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) {
  1802. destatep->declared_enc_2 = destatep->declared_enc_1;
  1803. hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep);
  1804. destatep->http_hint = kMapToEncoding[destatep->declared_enc_1];
  1805. if ((destatep->declared_enc_1 == F_CP1252) ||
  1806. (destatep->declared_enc_1 == F_Latin1)) {
  1807. destatep->looking_for_latin_trigrams = true;
  1808. }
  1809. }
  1810. if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) {
  1811. destatep->declared_enc_2 = destatep->declared_enc_1;
  1812. hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep);
  1813. destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1];
  1814. if ((destatep->declared_enc_1 == F_CP1252) ||
  1815. (destatep->declared_enc_1 == F_Latin1)) {
  1816. destatep->looking_for_latin_trigrams = true;
  1817. }
  1818. }
  1819. if (encoding_hint != UNKNOWN_ENCODING) {
  1820. destatep->declared_enc_2 = destatep->declared_enc_1;
  1821. hint_count += ApplyEncodingHint(encoding_hint, 50, destatep);
  1822. }
  1823. if (language_hint != UNKNOWN_LANGUAGE) {
  1824. destatep->declared_enc_2 = destatep->declared_enc_1;
  1825. hint_count += ApplyUILanguageHint(language_hint, 50, destatep);
  1826. }
  1827. // Use top level domain if not .com and <=1 other hint was available
  1828. if (url_hint != NULL) {
  1829. destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint);
  1830. if (hint_count == 0) {
  1831. // Apply with weight 100%
  1832. destatep->declared_enc_2 = destatep->declared_enc_1;
  1833. hint_count += ApplyTldHint(tld_hint, 100, destatep);
  1834. if ((destatep->declared_enc_1 == F_CP1252) ||
  1835. (destatep->declared_enc_1 == F_Latin1)) {
  1836. destatep->looking_for_latin_trigrams = true;
  1837. }
  1838. if (strcmp("hu", tld_hint) == 0) {
  1839. // Hungarian is particularly difficult to separate Latin2 from Latin1,
  1840. // so always look for trigram scanning if bare TLD=hu hint
  1841. destatep->looking_for_latin_trigrams = true;
  1842. }
  1843. // Treat .com as no TLD hint at all
  1844. } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) {
  1845. // Either shift weighting or consider doing no TLD here -- seems to
  1846. // distract from correct charset= hints. Or perhaps apply only if
  1847. // charset = Latin1/1252...
  1848. // Apply with weight 50%
  1849. destatep->declared_enc_2 = destatep->declared_enc_1;
  1850. hint_count += ApplyTldHint(tld_hint, 50, destatep);
  1851. if ((destatep->declared_enc_1 == F_CP1252) ||
  1852. (destatep->declared_enc_1 == F_Latin1)) {
  1853. destatep->looking_for_latin_trigrams = true; // These need trigrams
  1854. }
  1855. }
  1856. // Else ignore TLD hint entirely
  1857. }
  1858. // Use all-web default distribution if not even a TLD hint
  1859. if (hint_count == 0) {
  1860. destatep->looking_for_latin_trigrams = true; // Default needs trigrams
  1861. destatep->declared_enc_2 = destatep->declared_enc_1;
  1862. hint_count += ApplyDefaultHint(corpus_type, destatep);
  1863. }
  1864. // ISO-Microsoft Pairs
  1865. // F_Latin1, F_CP1252,
  1866. // F_Latin2, F_CP1250, NOT really strict subset/superset pairs
  1867. // F_Latin3,
  1868. // F_Latin4,
  1869. // F_ISO_8859_5, F_CP1251,
  1870. // F_Arabic, F_CP1256, NOT
  1871. // F_Greek, F_CP1253, NOT really pairs
  1872. // (or upgrade incvt to make Greek use CP)
  1873. // F_Hebrew, F_CP1255, NOT really pairs
  1874. // F_Latin5, F_CP1254,
  1875. // F_Latin6,
  1876. // F_ISO_8859_11,
  1877. // F_ISO_8859_13, F_CP1257,
  1878. // F_ISO_8859_15,
  1879. // ISO-Microsoft Pairs
  1880. // Get important families started together
  1881. // // This should fall out of the initializatoin vectors for charset,
  1882. // but we need to get rid of families alltogetrher
  1883. //
  1884. // TODO make this more graceful
  1885. // Add small bias for subsets
  1886. // Subtract small bias for supersets
  1887. destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff;
  1888. destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff;
  1889. destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff;
  1890. destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] -
  1891. kSmallInitDiff;
  1892. destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] -
  1893. kSmallInitDiff;
  1894. // Deliberate over-bias Ascii7 and underbias Binary [unneeded]
  1895. // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSmallInitDiff;
  1896. // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitial / 2);
  1897. if (destatep->debug_data != NULL) {
  1898. // Show state at end of hints
  1899. SetDetailsEncProb(destatep, 0, -1, "Endhints");
  1900. if(FLAGS_enc_detect_detail2) {
  1901. // Add a line showing the watched encoding(s)
  1902. if (watch1_rankedenc >= 0) {
  1903. SetDetailsEncProb(destatep, 0,
  1904. watch1_rankedenc, FLAGS_enc_detect_watch1);
  1905. }
  1906. if (watch2_rankedenc >= 0) {
  1907. SetDetailsEncProb(destatep, 0,
  1908. watch2_rankedenc, FLAGS_enc_detect_watch2);
  1909. }
  1910. } // End detail2
  1911. }
  1912. // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost
  1913. if (destatep->declared_enc_1 == destatep->declared_enc_2) {
  1914. destatep->declared_enc_2 = F_ASCII_7_bit;
  1915. }
  1916. if (FLAGS_force127) {
  1917. destatep->do_latin_trigrams = true;
  1918. if (FLAGS_enc_detect_source) {
  1919. PsHighlight(0, destatep->initial_src, 0, 2);
  1920. }
  1921. }
  1922. if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;}
  1923. if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;}
  1924. //
  1925. // At this point, destatep->enc_prob[] is an initial probability vector based
  1926. // on the given hints/default. In general, it spreads out least-likely
  1927. // encodings to be about 2**-25 below the most-likely encoding.
  1928. // For input text with lots of bigrams, an unlikely encoding can rise to
  1929. // the top at a rate of about 2**6 per bigram, and more commonly 2**2 per
  1930. // bigram. So more than 4 bigrams and commonly more than 12 are
  1931. // needed to overcome the initial hints when the least-likely encoding
  1932. // is in fact the correct answer. So if the entire text has very few bigrams
  1933. // (as a two-word query might), it can be impossible for the correct
  1934. // encoding to win.
  1935. //
  1936. // To compensate for this, we take the initial hint vector and effectively
  1937. // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The
  1938. // actual mechanism is done just before the last prune.
  1939. //
  1940. // Remember Initial hint probabilities
  1941. memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob));
  1942. }
  1943. // Look for specific high-value patterns in the first 4 bytes
  1944. // Byte order marks (BOM)
  1945. // EFBBBF UTF-8
  1946. // FEFF UTF-16 BE
  1947. // FFFE UTF-16 LE
  1948. // FFFE0000 UTF-32 BE
  1949. // 0000FEFF UTF-32 LE
  1950. //
  1951. // Likely UTF-x of seven-bit ASCII
  1952. // 00xx UTF-16 BE xx printable ASCII
  1953. // xx00 UTF-16 LE
  1954. // 000000xx UTF-32 BE
  1955. // xx000000 UTF-32 LE
  1956. //
  1957. void InitialBytesBoost(const uint8* src,
  1958. int text_length,
  1959. DetectEncodingState* destatep) {
  1960. if (text_length < 4) {return;}
  1961. uint32 pair01 = (src[0] << 8) | src[1];
  1962. uint32 pair23 = (src[2] << 8) | src[3];
  1963. uint32 quad0123 = (pair01 << 16) | pair23;
  1964. bool utf_16_indication = false;
  1965. bool utf_32_indication = false;
  1966. int best_enc = -1;
  1967. // Byte order marks
  1968. // UTF-8
  1969. if ((quad0123 & 0xffffff00) == 0xEFBBBF00) {
  1970. destatep->bom_hint = UTF8;
  1971. Boost(destatep, F_UTF8, kBoostInitial * 2);
  1972. Boost(destatep, F_UTF8UTF8, kBoostInitial * 2);
  1973. best_enc = F_UTF8;
  1974. // UTF-32 (test before UTF-16)
  1975. } else if (quad0123 == 0x0000FEFF) {
  1976. destatep->bom_hint = UTF32BE;
  1977. Boost(destatep, F_UTF_32BE, kBoostInitial * 2);
  1978. best_enc = F_UTF_32BE;
  1979. } else if (quad0123 == 0xFFFE0000) {
  1980. destatep->bom_hint = UTF32LE;
  1981. Boost(destatep, F_UTF_32LE, kBoostInitial * 2);
  1982. best_enc = F_UTF_32LE;
  1983. // UTF-16
  1984. } else if (pair01 == 0xFEFF) {
  1985. destatep->bom_hint = UTF16BE;
  1986. Boost(destatep, F_UTF_16BE, kBoostInitial * 3);
  1987. best_enc = F_UTF_16BE;
  1988. } else if (pair01 == 0xFFFE) {
  1989. destatep->bom_hint = UTF16LE;
  1990. Boost(destatep, F_UTF_16LE, kBoostInitial * 3);
  1991. best_enc = F_UTF_16LE;
  1992. // Possible seven-bit ASCII encoded as UTF-16/32
  1993. // UTF-32 (test before UTF-16)
  1994. } else if (((quad0123 & 0xffffff00) == 0) &&
  1995. (kIsPrintableAscii[src[3]] != 0)) {
  1996. Boost(destatep, F_UTF_32BE, kBoostInitial);
  1997. Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal char
  1998. best_enc = F_UTF_32BE;
  1999. } else if (((quad0123 & 0x00ffffff) == 0) &&
  2000. (kIsPrintableAscii[src[0]] != 0)) {
  2001. Boost(destatep, F_UTF_32LE, kBoostInitial);
  2002. Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
  2003. best_enc = F_UTF_32LE;
  2004. } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) {
  2005. Boost(destatep, F_UTF_16BE, kBoostInitial);
  2006. best_enc = F_UTF_16BE;
  2007. } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) {
  2008. Boost(destatep, F_UTF_16LE, kBoostInitial);
  2009. best_enc = F_UTF_16LE;
  2010. // Whack if 0000 or FFFF
  2011. // UTF-32 (test before UTF-16)
  2012. } else if (quad0123 == 0x00000000) {
  2013. Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
  2014. Whack(destatep, F_UTF_32LE, kBadPairWhack);
  2015. Whack(destatep, F_UTF_16BE, kBadPairWhack);
  2016. Whack(destatep, F_UTF_16LE, kBadPairWhack);
  2017. best_enc = -1;
  2018. } else if (quad0123 == 0xffffffff) {
  2019. Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
  2020. Whack(destatep, F_UTF_32LE, kBadPairWhack);
  2021. Whack(destatep, F_UTF_16BE, kBadPairWhack);
  2022. Whack(destatep, F_UTF_16LE, kBadPairWhack);
  2023. best_enc = -1;
  2024. } else if (pair01 == 0x0000) {
  2025. Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char
  2026. Whack(destatep, F_UTF_16LE, kBadPairWhack);
  2027. best_enc = -1;
  2028. } else if (pair01 == 0xffff) {
  2029. Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char
  2030. Whack(destatep, F_UTF_16LE, kBadPairWhack);
  2031. best_enc = -1;
  2032. // These are the first four bytes of some known binary file formats
  2033. // Boost BINARY bigtime if JPEG FFD8FFxx
  2034. // Boost BINARY bigtime if png 89504E47 (.PNG)
  2035. // Boost BINARY bigtime if gif 47494638 (GIF8)
  2036. // Boost BINARY bigtime if zip 504B0304 (PK..)
  2037. // Boost BINARY bigtime if gzip 1F8B08xx
  2038. // Boost BINARY bigtime if gzip 78DAxxxx
  2039. // Boost BINARY if PDF 25504446 (%PDF)
  2040. // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f)
  2041. } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) { // JPEG FFD8FFxx
  2042. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2043. } else if (quad0123 == 0x89504E47) { // Hex 89 P N G
  2044. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2045. } else if (quad0123 == 0x47494638) { // Hex GIF8
  2046. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2047. } else if (quad0123 == 0x504B0304) { // Hex P K 03 04
  2048. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2049. } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) { // gzip 1F8B08xx
  2050. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2051. } else if (pair01 == 0x78DA) { // gzip 78DAxxxx
  2052. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2053. } else if (quad0123 == 0x25504446) { // Hex %PDF
  2054. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2055. } else if ((quad0123 & 0xffffff1f) == 0x66535700) { // Hex FWSx
  2056. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2057. } else if ((quad0123 & 0xffffff1f) == 0x63535700) { // Hex CWSx
  2058. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2059. // More binary detect prefixes
  2060. // 7F E L F Executable and linking format
  2061. // M M 00 * TIFF (little-endian)
  2062. // * 00 M M TIFF (big-endian)
  2063. // 01 f c p Final cut pro
  2064. } else if (quad0123 == 0x7F454C46) { // Hex 7F E L F
  2065. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2066. } else if (quad0123 == 0x4D4D002A) { // Hex M M 00 *
  2067. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2068. } else if (quad0123 == 0x2A004D4D) { // Hex * 00 M M
  2069. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2070. } else if (quad0123 == 0x01666370) { // Hex 01 f c p
  2071. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2072. // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII
  2073. // prefix overcoming binary
  2074. // C C S D USGS ISIS 3-D cube files
  2075. // S I M P FITS image header "SIMPLE "
  2076. } else if (quad0123 == 0x43435344) { // Hex C C S D
  2077. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2078. } else if (quad0123 == 0x53494D50) { // Hex S I M P
  2079. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2080. // More binary detect prefixes; all-ASCII names; lighter weight
  2081. // H W P Hangul word processor
  2082. // 8 B P S Photoshop
  2083. // P D S _ xx "PDS_VERSION_ID "
  2084. } else if (quad0123 == 0x48575020) { // Hex H W P
  2085. if ((19 <= text_length) &&
  2086. (memcmp(src, "HWP.Document.File.V", 19) == 0)) {
  2087. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2088. } else if ((19 <= text_length) &&
  2089. (memcmp(src, "HWP Document File V", 19) == 0)) {
  2090. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2091. } else {
  2092. Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
  2093. }
  2094. } else if (quad0123 == 0x38425053) { // Hex 8 B P S
  2095. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2096. } else if (quad0123 == 0x5044535F) { // Hex P D S _
  2097. if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) {
  2098. Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  2099. } else {
  2100. Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
  2101. }
  2102. }
  2103. // There are several main Windows EXE file formats.
  2104. // Not examined here (prefix too short; never see them in Google pipeline)
  2105. // M Z DOS .exe Mark Zbikowski
  2106. // N E DOS 4.0 16-bit
  2107. // L E OS/2 VxD drivers
  2108. // L X OS/2
  2109. // P E Windows NT
  2110. // More user-defined
  2111. // http://www.freenet.am/armscii/ Armenian
  2112. // If any hints or BOM, etc. keep UTF 16/32 around
  2113. if ((destatep->enc_prob[F_UTF_16BE] > 0) ||
  2114. (destatep->enc_prob[F_UTF_16LE] > 0)) {
  2115. utf_16_indication = true;
  2116. }
  2117. if ((destatep->enc_prob[F_UTF_32BE] > 0) ||
  2118. (destatep->enc_prob[F_UTF_32LE] > 0)) {
  2119. utf_32_indication = true;
  2120. }
  2121. // Kill UTF16/32 right now if no positive indication of them
  2122. // Otherwise, they tend to rise to the top in 7-bit files with an
  2123. // occasional 0x02 byte in some comment or javascript
  2124. if (!utf_16_indication) {
  2125. Whack(destatep, F_UTF_16BE, kBadPairWhack * 8);
  2126. Whack(destatep, F_UTF_16LE, kBadPairWhack * 8);
  2127. Whack(destatep, F_Unicode, kBadPairWhack * 8);
  2128. }
  2129. if (!utf_32_indication) {
  2130. Whack(destatep, F_UTF_32BE, kBadPairWhack * 8);
  2131. Whack(destatep, F_UTF_32LE, kBadPairWhack * 8);
  2132. }
  2133. // Usually kill mixed encodings
  2134. if (!FLAGS_ced_allow_utf8utf8) {
  2135. Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);
  2136. }
  2137. // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead
  2138. Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8);
  2139. if (destatep->debug_data != NULL) {
  2140. // Show first four bytes of the input
  2141. char buff[16];
  2142. snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23);
  2143. SetDetailsEncProb(destatep, 0, best_enc, buff);
  2144. }
  2145. }
  2146. // Descending order
  2147. int IntCompare(const void* v1, const void* v2) {
  2148. const int* p1 = reinterpret_cast<const int*>(v1);
  2149. const int* p2 = reinterpret_cast<const int*>(v2);
  2150. if (*p1 < *p2) {return 1;}
  2151. if (*p1 > *p2) {return -1;}
  2152. return 0;
  2153. }
  2154. bool Base64Char(uint8 c) {
  2155. if (('A' <= c) && (c <= 'Z')) {return true;}
  2156. if (('a' <= c) && (c <= 'z')) {return true;}
  2157. if (('0' <= c) && (c <= '9')) {return true;}
  2158. if ('+' == c) {return true;}
  2159. if ('/' == c) {return true;}
  2160. return false;
  2161. }
  2162. int Base64ScanLen(const uint8* start, const uint8* limit) {
  2163. // We have a plausible beginning; scan entire base64 string
  2164. const uint8* ib64str = start;
  2165. const uint8* b64str = ib64str;
  2166. const uint8* b64strlimit = limit;
  2167. // if starts with + +++, assume it is drawing, so bogus
  2168. if (((limit - start) > 3) && (start[0] == '+') &&
  2169. (start[1] == '+') && (start[2] == '+')) {
  2170. return 81;
  2171. }
  2172. // Scan over base64
  2173. while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) {
  2174. }
  2175. b64str--; // We overshot by 1
  2176. return b64str - ib64str;
  2177. }
  2178. // Input is at least 8-character legal base64 string after +.
  2179. // But might be say + "Presse+Termine"
  2180. bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) {
  2181. // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64)
  2182. // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64)
  2183. // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64)
  2184. // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64)
  2185. // NOTE: this requires at least one lower AND one upper AND one digit to pass
  2186. //
  2187. int plus_count = 0;
  2188. int lower_count = 0;
  2189. int upper_count = 0;
  2190. int digit_count = 0;
  2191. int len = limit - start;
  2192. for (const uint8* src = start; src < limit; ++src) {
  2193. uint8 c = *src;
  2194. if (('a' <= c) && (c <= 'z')) {
  2195. ++lower_count;
  2196. } else if (('A' <= c) && (c <= 'Z')) {
  2197. ++upper_count;
  2198. } else if (('0' <= c) && (c <= '0')) {
  2199. ++digit_count;
  2200. } else if (*src == '+') {
  2201. ++plus_count;
  2202. }
  2203. }
  2204. if (plus_count > (1 + (len >> 4))) {return false;}
  2205. if (lower_count < (1 + (len >> 4))) {return false;}
  2206. if (upper_count < (1 + (len >> 4))) {return false;}
  2207. if (digit_count < (1 + (len >> 5))) {return false;}
  2208. // checking the last character to reduce false positive
  2209. // since the last character may be padded to 0 bits at the end.
  2210. // refer to http://en.wikipedia.org/wiki/UTF-7
  2211. int nmod8 = len & 7;
  2212. const uint8 last = *(start+len-1);
  2213. // When UTF-7 string length%8=3, the last two bits must be padded as 0
  2214. if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;}
  2215. // When UTF-7 string length%8=6, the last four bits must be padded as 0
  2216. if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;}
  2217. return true;
  2218. }
  2219. // Prune here after N bytes
  2220. // Boost here for seven-bit sequences (at every prune)
  2221. // if (sevenbitrankedencoding)
  2222. // + UTF7 scan and boost/demote len mod 8 = 0 3 6
  2223. // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6
  2224. // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6
  2225. // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6
  2226. // [0F 2022 boost/demote]
  2227. // 00 UTF16/32 scan and boost/demote offset = even/odd
  2228. //
  2229. // If still some seven-bit possibilities > pure ASCII,
  2230. // scan each possibility for clearer prob, s.t. about
  2231. // two good sequences is a clear win
  2232. // A-Z 00-19 00xx-64xx (B = 04xx)
  2233. // a-z 1A-33 68xx-CCxx (f = 7Cxx)
  2234. // 0-9 34-3D D0xx-F4xx (1 = D4xx)
  2235. // + 3E F8xx
  2236. // / 3F FCxx
  2237. // do another chunk with slow scan
  2238. // Boost, whack, or leave alone UTF-7 probablilty
  2239. void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
  2240. int off = destatep->interesting_offsets[AsciiPair][next_pair];
  2241. if (off >= destatep->prior_utf7_offset) {
  2242. // Not part of a previous successful UTF-7 string
  2243. ++destatep->utf7_starts;
  2244. if (byte2 == '-') {
  2245. // +- encoding for '+' neutral
  2246. } else if (!Base64Char(byte2)) {
  2247. // Not base64 -- not UTF-7, whack
  2248. Whack(destatep, F_UTF7, kBadPairWhack); // Illegal pair
  2249. } else {
  2250. // Starts with base64 byte, might be a good UTF7 sequence
  2251. const uint8* start = destatep->initial_src + off + 1; // over the +
  2252. int n = Base64ScanLen(start, destatep->limit_src);
  2253. int nmod8 = n & 7;
  2254. if ((n == 3) || (n == 6)) {
  2255. // short but legal -- treat as neutral
  2256. } else if ((nmod8 == 0) | (nmod8 == 3) | (nmod8 == 6)) {
  2257. // Good length. Check for good Unicode.
  2258. if (GoodUnicodeFromBase64(start, start + n)) {
  2259. // Good length and Unicode, boost
  2260. Boost(destatep, F_UTF7, kBoostOnePair); // Found good
  2261. destatep->prior_utf7_offset = off + n + 1;
  2262. } else {
  2263. // Bad Unicode. Whack
  2264. Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length
  2265. }
  2266. } else {
  2267. // Bad length. Whack
  2268. Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length
  2269. }
  2270. }
  2271. }
  2272. }
  2273. // Boost, whack, or leave alone HZ probablilty
  2274. void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
  2275. if ((byte2 == '{') || (byte2 == '}')) {
  2276. Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~}
  2277. } else if ((byte2 == '~') || (byte2 == '\n')) {
  2278. destatep->enc_prob[F_HZ_GB_2312] += 0; // neutral
  2279. } else {
  2280. Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair
  2281. }
  2282. }
  2283. // Boost, whack, or leave alone BINARY probablilty
  2284. void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
  2285. int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7);
  2286. int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
  2287. uint32 quad_mask = 1 << quadrant;
  2288. uint32 bucket8x4_mask = 1 << bucket8x4;
  2289. if ((destatep->binary_quadrants_seen & quad_mask) == 0) {
  2290. destatep->binary_quadrants_seen |= quad_mask;
  2291. destatep->binary_quadrants_count += 1;
  2292. if (destatep->binary_quadrants_count == 4) {
  2293. Boost(destatep, F_BINARY, kBoostOnePair * 2); // Found all 4 quadrants,
  2294. // boost 2 pairs
  2295. }
  2296. }
  2297. if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
  2298. destatep->binary_8x4_seen |= bucket8x4_mask;
  2299. destatep->binary_8x4_count += 1;
  2300. if (destatep->binary_8x4_count >= 11) {
  2301. Boost(destatep, F_BINARY, kBoostOnePair * 4); // Found 11+/20 buckets,
  2302. // boost 4 pairs each time
  2303. }
  2304. }
  2305. }
  2306. // Demote UTF-16/32 on 0000 or FFFF, favoring Binary
  2307. void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) {
  2308. if (byte1 == 0) { // We have 0000
  2309. Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair
  2310. Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair
  2311. switch (offset & 3) {
  2312. case 0: // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE
  2313. Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair
  2314. Boost(destatep, F_UTF_32BE, kSmallInitDiff); // Good pair
  2315. break;
  2316. case 1: // We get called with 1 5 9, etc. for ASCII as UTF-32LE
  2317. case 2: // We get called with 2 6 10, etc. for BMP as UTF-32LE
  2318. Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair
  2319. Boost(destatep, F_UTF_32LE, kSmallInitDiff); // Good pair
  2320. break;
  2321. case 3: // ambiguous
  2322. break;
  2323. }
  2324. } else { // We have ffff
  2325. Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair
  2326. Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair
  2327. Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair
  2328. Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair
  2329. }
  2330. }
  2331. // Make even offset
  2332. void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) {
  2333. destatep->interesting_offsets[OtherPair][next_pair] &= ~1;
  2334. }
  2335. bool ConsecutivePair(DetectEncodingState* destatep, int i) {
  2336. if (i <= 0) {
  2337. return false;
  2338. }
  2339. return destatep->interesting_offsets[OtherPair][i] ==
  2340. (destatep->interesting_offsets[OtherPair][i - 1] + 2);
  2341. }
  2342. // boost, whack, or leave alone UTF-8 probablilty
  2343. // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8
  2344. // Returns total boost
  2345. int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) {
  2346. int startcount = destatep->prior_interesting_pair[OtherPair];
  2347. int endcount = destatep->next_interesting_pair[OtherPair];
  2348. int demotion_count = 0;
  2349. for (int i = startcount; i < endcount; ++i) {
  2350. int sub;
  2351. char* s = &destatep->interesting_pairs[OtherPair][i * 2];
  2352. // Demote four byte patterns that are more likely Latin1 than UTF-8
  2353. // C9AE, DF92, DF93, DFAB. See note at top.
  2354. // Demotion also boosts Latin1 and CP1252
  2355. uint8 s0 = static_cast<uint8>(s[0]);
  2356. uint8 s1 = static_cast<uint8>(s[1]);
  2357. if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;}
  2358. if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;}
  2359. if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;}
  2360. if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;}
  2361. if (!ConsecutivePair(destatep, i)) {
  2362. // Insert a blank into the sequence; avoid wrong splices
  2363. sub = (' ' >> 4) & 0x0f;
  2364. ++destatep->utf8_minicount[
  2365. static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
  2366. destatep->next_utf8_ministate =
  2367. kMiniUTF8State[destatep->next_utf8_ministate][sub];
  2368. }
  2369. // Byte 0
  2370. sub = (s0 >> 4) & 0x0f;
  2371. ++destatep->utf8_minicount[
  2372. static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
  2373. destatep->next_utf8_ministate =
  2374. kMiniUTF8State[destatep->next_utf8_ministate][sub];
  2375. // Byte 1
  2376. sub = (s1 >> 4) & 0x0f;
  2377. ++destatep->utf8_minicount[
  2378. static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
  2379. destatep->next_utf8_ministate =
  2380. kMiniUTF8State[destatep->next_utf8_ministate][sub];
  2381. DCHECK((0 <= destatep->next_utf8_ministate) &&
  2382. (destatep->next_utf8_ministate < 8));
  2383. }
  2384. // For the four specific byte combinations above, Latin1/CP1252 is more likely
  2385. if (demotion_count > 0) {
  2386. Boost(destatep, F_Latin1, kGentleOnePair * demotion_count);
  2387. Boost(destatep, F_CP1252, kGentleOnePair * demotion_count);
  2388. }
  2389. // Boost UTF8 for completed good sequences
  2390. int total_boost = 2 * destatep->utf8_minicount[2] +
  2391. 3 * destatep->utf8_minicount[3] +
  2392. 4 * destatep->utf8_minicount[4];
  2393. // But not so much for demoted bytes
  2394. total_boost -= (3 * demotion_count);
  2395. total_boost *= kGentleOnePair;
  2396. total_boost >>= weightshift;
  2397. // Design: boost both UTF8 and UTF8UTF8 for each good sequence
  2398. Boost(destatep, F_UTF8, total_boost);
  2399. Boost(destatep, F_UTF8UTF8, total_boost);
  2400. destatep->utf8_minicount[5] += destatep->utf8_minicount[2]; // total chars
  2401. destatep->utf8_minicount[5] += destatep->utf8_minicount[3]; // total chars
  2402. destatep->utf8_minicount[5] += destatep->utf8_minicount[4]; // total chars
  2403. destatep->utf8_minicount[2] = 0;
  2404. destatep->utf8_minicount[3] = 0;
  2405. destatep->utf8_minicount[4] = 0;
  2406. // Whack (2 bytes) for errors
  2407. int error_whack = 2 * destatep->utf8_minicount[1];
  2408. error_whack *= kGentlePairWhack;
  2409. error_whack >>= weightshift;
  2410. Whack(destatep, F_UTF8, error_whack);
  2411. Whack(destatep, F_UTF8UTF8, error_whack);
  2412. destatep->utf8_minicount[1] = 0;
  2413. return total_boost - error_whack;
  2414. }
  2415. // Boost, whack, or leave alone UTF8UTF8 probablilty
  2416. //
  2417. // We are looking for
  2418. // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the
  2419. // MS CP1252 mappings, and
  2420. // (2) sequences of 2 or more such characters
  2421. //
  2422. // If so, we could be looking at some non-7-bit encoding extra-converted
  2423. // to UTF-8. The most common observed is CP1252->UTF8 twice,
  2424. // 1252=>UTF8 : 1252=>UTF8
  2425. // where the colon means "take those bytes and pretend that they are 1252".
  2426. // We have a couple of examples of BIG5 bytes converted as though
  2427. // they were 1252,
  2428. // BIG5 : 1252=>UTF8
  2429. //
  2430. // Of course, we don't want correctly converted 1252 to be flagged here
  2431. // 1252=>UTF8
  2432. // So we want the input high bytes to be in pairs or longer, hence the
  2433. // output UTF8 in groups of four bytes or more
  2434. //
  2435. // Good chars: C2xx, C3xx,
  2436. // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C
  2437. // Good chars: E280xx E282AC E284A2
  2438. // C2xx 1100001x 10xxxxxx (128/128)
  2439. // C5xx 11000101 10xx00xx (16/4)
  2440. // C5xx 11000101 10111xxx (8/3)
  2441. // C692 11000110 10010010 (1/1)
  2442. // CBxx 11001011 100xx1x0 (8/2)
  2443. // E28x 11100010 10000xx0 (4/3)
  2444. //
  2445. // Returns total boost
  2446. int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
  2447. int this_pair = destatep->prior_interesting_pair[OtherPair];
  2448. int startbyteoffset = this_pair * 2;
  2449. int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
  2450. char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
  2451. char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
  2452. int pair_number = this_pair;
  2453. for (char* s = startbyte; s < endbyte; s += 2) {
  2454. int next = destatep->next_utf8utf8_ministate;
  2455. if (!ConsecutivePair(destatep, pair_number)) {
  2456. // Insert two blanks into the sequence to avoid wrong splices
  2457. // go back to no odd-byte offset
  2458. destatep->utf8utf8_odd_byte = 0;
  2459. int sub = UTF88Sub(' ', ' ');
  2460. ++destatep->utf8utf8_minicount[static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
  2461. next = kMiniUTF8UTF8State[next][sub];
  2462. }
  2463. int odd = destatep->utf8utf8_odd_byte;
  2464. if (s + 1 + odd >= endbyte) continue;
  2465. int sub = UTF88Sub(s[0 + odd], s[1 + odd]);
  2466. destatep->utf8utf8_odd_byte ^= kMiniUTF8UTF8Odd[next][sub];
  2467. ++destatep->utf8utf8_minicount[
  2468. static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
  2469. destatep->next_utf8utf8_ministate = kMiniUTF8UTF8State[next][sub];
  2470. ++pair_number;
  2471. }
  2472. // Boost for completed good sequences; each count covers two chars.
  2473. // Design: boost UTF8UTF8 above UTF8 for each good sequence
  2474. int total_boost = (2) * destatep->utf8utf8_minicount[2] +
  2475. (2) * destatep->utf8utf8_minicount[3] +
  2476. (2) * destatep->utf8utf8_minicount[4];
  2477. total_boost *= kGentleOnePair;
  2478. total_boost >>= weightshift;
  2479. Boost(destatep, F_UTF8UTF8, total_boost);
  2480. // Track total characters
  2481. destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[2];
  2482. destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[3];
  2483. destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[4];
  2484. destatep->utf8utf8_minicount[2] = 0;
  2485. destatep->utf8utf8_minicount[3] = 0;
  2486. destatep->utf8utf8_minicount[4] = 0;
  2487. // Design: Do not whack UTF8UTF8 below UTF8 for each bad sequence
  2488. destatep->utf8utf8_minicount[1] = 0;
  2489. return total_boost;
  2490. }
  2491. // We give a gentle boost for each paired SO ... SI, whack others
  2492. void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
  2493. int this_pair = destatep->prior_interesting_pair[OtherPair];
  2494. int startbyteoffset = this_pair * 2;
  2495. int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
  2496. char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
  2497. char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
  2498. // Initial <esc> char must precede SO/SI
  2499. // HZ_GB_2312 has no alternation constraint on 1- and 2-byte segments
  2500. // ISO-2022-JP (JIS) has no alternation constraint on 1- and 2-byte segments
  2501. // ISO-2022-CN has no alternation constraint on 1- and 2-byte segments
  2502. // ISO-2022-KR requires alternation between 1- and 2-byte segments
  2503. // JIS:
  2504. // <esc> ( B ISO-2022-JP [1b 28 42] SI to ASCII
  2505. // <esc> ( J ISO-2022-JP [1b 28 4a] SI to X0201
  2506. // <esc> $ @ ISO-2022-JP [1b 24 40] SO to X0208-78 twobyte
  2507. // <esc> $ B ISO-2022-JP [1b 24 42] SO to X0208-83 twobyte
  2508. for (char* s = startbyte; s < endbyte; s += 2) {
  2509. if (s[0] == 0x1b) {
  2510. if (s[1] == 0x24) {
  2511. // <esc> $ is SO
  2512. destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte
  2513. } else if (s[1] == 0x28) {
  2514. if (destatep->next_2022_state == SOSI_TWOBYTE) {
  2515. Boost(destatep, F_JIS, kGentlePairBoost);
  2516. } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
  2517. Whack(destatep, F_JIS, kGentlePairWhack);
  2518. }
  2519. destatep->next_2022_state = SOSI_ONEBYTE; // JIS SI to one-byte
  2520. } else {
  2521. Whack(destatep, F_JIS, kBadPairWhack);
  2522. Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
  2523. Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
  2524. destatep->next_2022_state = SOSI_ERROR; // not 2022
  2525. }
  2526. } else if (s[0] == 0x0e) {
  2527. // <so>
  2528. Whack(destatep, F_JIS, kBadPairWhack);
  2529. if (destatep->next_2022_state != SOSI_NONE) {
  2530. destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte
  2531. } else {
  2532. // ESC required before SO/SI
  2533. Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
  2534. Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
  2535. destatep->next_2022_state = SOSI_ERROR; // SO not after SI
  2536. }
  2537. } else if (s[0] == 0x0f) {
  2538. // <si>
  2539. Whack(destatep, F_JIS, kBadPairWhack);
  2540. if (destatep->next_2022_state != SOSI_NONE) {
  2541. if (destatep->next_2022_state == SOSI_TWOBYTE) {
  2542. Boost(destatep, F_ISO_2022_CN, kGentlePairBoost);
  2543. Boost(destatep, F_ISO_2022_KR, kGentlePairBoost);
  2544. } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
  2545. Whack(destatep, F_ISO_2022_CN, kGentlePairWhack);
  2546. Whack(destatep, F_ISO_2022_KR, kGentlePairWhack);
  2547. }
  2548. destatep->next_2022_state = SOSI_ONEBYTE; // SI to one-byte
  2549. } else {
  2550. // ESC required before SO/SI
  2551. Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
  2552. Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
  2553. destatep->next_2022_state = SOSI_ERROR; // SI not after SO
  2554. }
  2555. } else if (s[0] <= 0x1f) {
  2556. // Some other control code. Allow ht lf [ff] cr
  2557. if ((s[0] != 0x09) && (s[0] != 0x0a) &&
  2558. (s[0] != 0x0c) && (s[0] != 0x0d)) {
  2559. // Otherwise these can float to the top on bad bytes
  2560. Whack(destatep, F_JIS, kBadPairWhack);
  2561. Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
  2562. Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
  2563. }
  2564. }
  2565. }
  2566. // If no start, keep the probability pinned at zero (or below)
  2567. if (destatep->next_2022_state == SOSI_NONE) {
  2568. destatep->enc_prob[F_ISO_2022_CN] =
  2569. minint(0, destatep->enc_prob[F_ISO_2022_CN]);
  2570. destatep->enc_prob[F_ISO_2022_KR] =
  2571. minint(0, destatep->enc_prob[F_ISO_2022_KR]);
  2572. destatep->enc_prob[F_JIS] =
  2573. minint(0, destatep->enc_prob[F_JIS]);
  2574. }
  2575. }
  2576. // We give a gentle boost for each paired ~{ ... ~}, whack others
  2577. void CheckHzActiveSeq(DetectEncodingState* destatep) {
  2578. int this_pair = destatep->prior_interesting_pair[AsciiPair];
  2579. int startbyteoffset = this_pair * 2;
  2580. int endbyteoffset = destatep->next_interesting_pair[AsciiPair] * 2;
  2581. char* startbyte = &destatep->interesting_pairs[AsciiPair][startbyteoffset];
  2582. char* endbyte = &destatep->interesting_pairs[AsciiPair][endbyteoffset];
  2583. for (char* s = startbyte; s < endbyte; s += 2) {
  2584. // Look for initial ~{ pair
  2585. if ((s[0] == '~') && (s[1] == '{')) {
  2586. destatep->next_hz_state = SOSI_TWOBYTE; // SO to two-byte
  2587. }
  2588. // Also look for closing ~} pair
  2589. if ((s[0] == '~') && (s[1] == '}')) {
  2590. if (destatep->next_hz_state == SOSI_TWOBYTE) {
  2591. Boost(destatep, F_HZ_GB_2312, kGentlePairBoost);
  2592. } else if (destatep->next_hz_state == SOSI_ONEBYTE) {
  2593. Whack(destatep, F_HZ_GB_2312, kGentlePairWhack);
  2594. }
  2595. destatep->next_hz_state = SOSI_ONEBYTE; // SI to one-byte
  2596. }
  2597. }
  2598. // If no start, keep the probability pinned at zero (or below)
  2599. if (destatep->next_hz_state == SOSI_NONE) {
  2600. destatep->enc_prob[F_HZ_GB_2312] =
  2601. minint(0, destatep->enc_prob[F_HZ_GB_2312]);
  2602. }
  2603. }
  2604. // We give a gentle boost after an odd number of 8Fxxxx triples, which
  2605. // put subsequent bigrams out of phase until a low byte or another 8Fxxxx
  2606. void CheckEucJpSeq(DetectEncodingState* destatep) {
  2607. int this_pair = destatep->prior_interesting_pair[OtherPair];
  2608. int startbyteoffset = this_pair * 2;
  2609. int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
  2610. char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
  2611. char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
  2612. for (char* s = startbyte; s < endbyte; s += 2) {
  2613. // Boost if out of phase (otherwise, EUC-JP will score badly after 8Fxxxx)
  2614. if (destatep->next_eucjp_oddphase) {
  2615. //printf(" EucJp boost[%02x%02x]\n", s[0], s[1]); // TEMP
  2616. Boost(destatep, F_EUC_JP, kGentlePairBoost * 2);
  2617. }
  2618. uint8 s0 = static_cast<uint8>(s[0]);
  2619. uint8 s1 = static_cast<uint8>(s[1]);
  2620. // Look for phase flip at 8F
  2621. if ((s0 & 0x80) == 0x00) {
  2622. destatep->next_eucjp_oddphase = false;
  2623. } else if (s0 == 0x8f) {
  2624. destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
  2625. }
  2626. if ((s1 & 0x80) == 0x00) {
  2627. destatep->next_eucjp_oddphase = false;
  2628. } else if (s1 == 0x8f) {
  2629. destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
  2630. }
  2631. }
  2632. }
  2633. // Boost, whack, or leave alone BINARY probablilty
  2634. // Also called if UTF 16/32 active
  2635. void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep,
  2636. int delta_otherpairs) {
  2637. // No change if not much gathered information
  2638. if (delta_otherpairs == 0) {
  2639. // Only ASCII pairs this call
  2640. return;
  2641. }
  2642. int next_pair = destatep->next_interesting_pair[OtherPair];
  2643. // Look at density of interesting pairs [0..src)
  2644. int delta_offset = static_cast<int>(src - destatep->initial_src); // actual
  2645. // Look at density of interesting pairs [0..next_interesting)
  2646. int low_byte = destatep->interesting_offsets[OtherPair][0];
  2647. //int high_byte = destatep->interesting_offsets[OtherPair][next_pair - 1] + 2;
  2648. //int byte_span = high_byte - low_byte;
  2649. int byte_span = delta_offset - low_byte;
  2650. // If all ASCII for the first 4KB, reject
  2651. // If mostly ASCII in the first 5KB, reject
  2652. if ((low_byte >= kBinaryHardAsciiLimit) || (delta_offset >= kBinarySoftAsciiLimit)) {
  2653. // Not binary early enough in text
  2654. Whack(destatep, F_BINARY, kBadPairWhack * 4);
  2655. Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
  2656. Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
  2657. Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
  2658. Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
  2659. return;
  2660. }
  2661. // Density 1.0 for N pairs takes 2*N bytes
  2662. // Whack if < 1/16 after first non_ASCII pair
  2663. if ((next_pair * 2 * 16) < byte_span) {
  2664. // Not dense enough
  2665. Whack(destatep, F_BINARY, kBadPairWhack * 4);
  2666. Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
  2667. Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
  2668. Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
  2669. Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
  2670. }
  2671. if (next_pair < 8) {
  2672. // Fewer than 8 non-ASCII total; too soon to boost
  2673. return;
  2674. }
  2675. // Density 1.0 for N pairs takes 2*N bytes
  2676. // Boost if density >= 1/4, whack if < 1/16
  2677. if ((next_pair * 2 * 4) >= byte_span) {
  2678. // Very dense
  2679. // Only boost if at least 2 quadrants seen
  2680. if (destatep->binary_quadrants_count >= 2) {
  2681. Boost(destatep, F_BINARY, kSmallInitDiff);
  2682. Boost(destatep, F_UTF_32BE, kSmallInitDiff);
  2683. Boost(destatep, F_UTF_32LE, kSmallInitDiff);
  2684. Boost(destatep, F_UTF_16BE, kSmallInitDiff);
  2685. Boost(destatep, F_UTF_16LE, kSmallInitDiff);
  2686. }
  2687. }
  2688. }
  2689. // Look at a number of special-case encodings whose reliable detection depends
  2690. // on sequencing or other properties
  2691. // AsciiPair probibilities (UTF7 and HZ) are all done here
  2692. void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
  2693. int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
  2694. destatep->prior_interesting_pair[AsciiPair];
  2695. int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
  2696. destatep->prior_interesting_pair[OtherPair];
  2697. // The two pure ASCII encodings
  2698. if (UTF7OrHzActive(destatep) && (delta_asciipairs > 0)) {
  2699. // Adjust per pair
  2700. for (int i = 0; i < delta_asciipairs; ++i) {
  2701. int next_pair = destatep->prior_interesting_pair[AsciiPair] + i;
  2702. uint8 byte1 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 0];
  2703. uint8 byte2 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 1];
  2704. if (byte1 == '+') {
  2705. // Boost, whack, or leave alone UTF-7 probablilty
  2706. UTF7BoostWhack(destatep, next_pair, byte2);
  2707. if (destatep->debug_data != NULL) {
  2708. // Show UTF7 entry
  2709. char buff[16];
  2710. snprintf(buff, sizeof(buff), "%02x%02x+", byte1, byte2);
  2711. SetDetailsEncProb(destatep,
  2712. destatep->interesting_offsets[AsciiPair][next_pair],
  2713. kMostLikelyEncoding[(byte1 << 8) + byte2],
  2714. buff);
  2715. }
  2716. } else if (byte1 == '~') {
  2717. // Boost, whack, or leave alone HZ probablilty
  2718. HzBoostWhack(destatep, byte2);
  2719. if (destatep->debug_data != NULL) {
  2720. // Show Hz entry
  2721. char buff[16];
  2722. snprintf(buff, sizeof(buff), "%02x%02x~", byte1, byte2);
  2723. SetDetailsEncProb(destatep,
  2724. destatep->interesting_offsets[AsciiPair][next_pair],
  2725. kMostLikelyEncoding[(byte1 << 8) + byte2],
  2726. buff);
  2727. }
  2728. }
  2729. }
  2730. // Kill UTF-7 now if at least 8 + pairs and not confirmed valid UTF-7
  2731. if ((destatep->utf7_starts >= 8) && (destatep->prior_utf7_offset == 0)) {
  2732. Whack(destatep, F_UTF7, kBadPairWhack * 8); // flush
  2733. }
  2734. }
  2735. // All the other encodings
  2736. if (OtherActive(destatep) && (delta_otherpairs > 0)) {
  2737. // Adjust per pair
  2738. int biggest_weightshift = 0;
  2739. for (int i = 0; i < delta_otherpairs; ++i) {
  2740. int next_pair = destatep->prior_interesting_pair[OtherPair] + i;
  2741. uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
  2742. uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
  2743. int off = destatep->interesting_offsets[OtherPair][next_pair];
  2744. int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
  2745. biggest_weightshift = maxint(biggest_weightshift, weightshift);
  2746. if (byte1 == 0x00) {
  2747. if (byte2 == 0x00) {
  2748. UTF1632BoostWhack(destatep, off, byte1);
  2749. } else if ((kIsPrintableAscii[byte2] != 0) && ((off & 1) != 0)) {
  2750. // We have 00xx at an odd offset. Turn into preceding even offset
  2751. // for possible Ascii text in UTF-16LE or UTF-32LE (vs BE)
  2752. // This will cascade into caller's probability update
  2753. // 00 is illegal for all other encodings, so it doesn't matter to them
  2754. UTF16MakeEven(destatep, next_pair);
  2755. }
  2756. if (destatep->debug_data != NULL) {
  2757. // Show 0000 detail entry for this bigram
  2758. char buff[16];
  2759. snprintf(buff, sizeof(buff), "%02x%02xZ", byte1, byte2);
  2760. SetDetailsEncProb(destatep,
  2761. destatep->interesting_offsets[OtherPair][next_pair],
  2762. kMostLikelyEncoding[(byte1 << 8) + byte2],
  2763. buff);
  2764. }
  2765. }
  2766. if (byte1 == 0xff) {
  2767. if (byte2 == 0xff) {
  2768. UTF1632BoostWhack(destatep, off, byte1);
  2769. }
  2770. if (destatep->debug_data != NULL) {
  2771. // Show FFFF detail entry for this bigram
  2772. char buff[16];
  2773. snprintf(buff, sizeof(buff), "%02x%02xF", byte1, byte2);
  2774. SetDetailsEncProb(destatep,
  2775. destatep->interesting_offsets[OtherPair][next_pair],
  2776. kMostLikelyEncoding[(byte1 << 8) + byte2],
  2777. buff);
  2778. }
  2779. }
  2780. if (BinaryActive(destatep)) {
  2781. BinaryBoostWhack(destatep, byte1, byte2);
  2782. }
  2783. } // End for i
  2784. // Adjust per entire-pair-span
  2785. if (UTF8Active(destatep)) {
  2786. CheckUTF8Seq(destatep, biggest_weightshift);
  2787. }
  2788. if (UTF8UTF8Active(destatep)) {
  2789. CheckUTF8UTF8Seq(destatep, biggest_weightshift);
  2790. }
  2791. if (Iso2022Active(destatep)) {
  2792. CheckIso2022ActiveSeq(destatep);
  2793. }
  2794. if (HzActive(destatep)) {
  2795. CheckHzActiveSeq(destatep);
  2796. }
  2797. if (EUCJPActive(destatep)) {
  2798. CheckEucJpSeq(destatep);
  2799. }
  2800. if (BinaryActive(destatep) || UTF1632Active(destatep)) {
  2801. CheckBinaryDensity(src, destatep, delta_otherpairs);
  2802. }
  2803. }
  2804. // ISO-2022 do OK on their own, using stright probabilities? Not on bad bytes
  2805. if (destatep->debug_data != NULL) {
  2806. // Show sequencing result
  2807. SetDetailsEncLabel(destatep, "seq");
  2808. }
  2809. }
  2810. void PrintTopEnc(DetectEncodingState* destatep, int n) {
  2811. // Print top n or fewer
  2812. int temp_sort[NUM_RANKEDENCODING];
  2813. for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
  2814. int rankedencoding = destatep->rankedencoding_list[j];
  2815. temp_sort[j] = destatep->enc_prob[rankedencoding];
  2816. }
  2817. qsort(temp_sort, destatep->rankedencoding_list_len,
  2818. sizeof(temp_sort[0]), IntCompare);
  2819. int top_n = minint(n, destatep->rankedencoding_list_len);
  2820. int showme = temp_sort[top_n - 1]; // Print this value and above
  2821. printf("rankedencodingList top %d: ", top_n);
  2822. for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
  2823. int rankedencoding = destatep->rankedencoding_list[j];
  2824. if (showme <= destatep->enc_prob[rankedencoding]) {
  2825. printf("%s=%d ",
  2826. MyEncodingName(kMapToEncoding[rankedencoding]),
  2827. destatep->enc_prob[rankedencoding]);
  2828. }
  2829. }
  2830. printf("\n\n");
  2831. }
  2832. // If the same bigram repeats, don't boost its best encoding too much
  2833. bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
  2834. int this_bigram = (byte1 << 8) | byte2;
  2835. // If 00xx 01xx 02xx ... 1fxx, take out bottom 4 bits of xx.
  2836. // This ignores parts of Yahoo 0255 0254 0243 0247 0245 0243 0250 0255 ...
  2837. // It may screw up UTF-16BE
  2838. // It may screw up ISO-2022 (1b24 suppresses 1b28)
  2839. if (byte1 < 0x20) {
  2840. this_bigram &= 0xfff0;
  2841. }
  2842. if (this_bigram == destatep->prior_bigram[0]) {return true;}
  2843. if (this_bigram == destatep->prior_bigram[1]) {return true;}
  2844. if (this_bigram == destatep->prior_bigram[2]) {return true;}
  2845. if (this_bigram == destatep->prior_bigram[3]) {return true;}
  2846. // Round-robin replacement
  2847. destatep->prior_bigram[destatep->next_prior_bigram] = this_bigram;
  2848. destatep->next_prior_bigram = (destatep->next_prior_bigram + 1) & 3;
  2849. return false;
  2850. }
  2851. // Sometimes illegal bytes are used as markers between text that Javascript
  2852. // is going to decode. Don't overboost the Binary encoding for markers 01-FF.
  2853. // Just count first pair per 8x4 bucket
  2854. bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
  2855. int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
  2856. uint32 bucket8x4_mask = 1 << bucket8x4;
  2857. if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
  2858. destatep->binary_8x4_seen |= bucket8x4_mask;
  2859. destatep->binary_8x4_count += 1;
  2860. return false;
  2861. }
  2862. return true;
  2863. }
  2864. // Find current top two rankedencoding probabilities
  2865. void ReRank(DetectEncodingState* destatep) {
  2866. destatep->top_prob = -1;
  2867. destatep->second_top_prob = -1;
  2868. // Leave unchanged
  2869. //destatep->top_rankedencoding =
  2870. // destatep->rankedencoding_list[0]; // Just to make well-defined
  2871. //destatep->second_top_rankedencoding =
  2872. // destatep->rankedencoding_list[1]; // Just to make well-defined
  2873. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  2874. int rankedencoding = destatep->rankedencoding_list[j];
  2875. if (destatep->top_prob < destatep->enc_prob[rankedencoding]) {
  2876. // Make sure top 2 are in different superset groups
  2877. if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
  2878. kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
  2879. destatep->second_top_prob =
  2880. destatep->top_prob; // old top to second
  2881. destatep->second_top_rankedencoding =
  2882. destatep->top_rankedencoding; // old top to second
  2883. }
  2884. destatep->top_prob = destatep->enc_prob[rankedencoding];
  2885. destatep->top_rankedencoding = rankedencoding;
  2886. } else if (destatep->second_top_prob < destatep->enc_prob[rankedencoding]) {
  2887. if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
  2888. kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
  2889. destatep->second_top_prob = destatep->enc_prob[rankedencoding];
  2890. destatep->second_top_rankedencoding = rankedencoding;
  2891. }
  2892. }
  2893. }
  2894. }
  2895. void SimplePrune(DetectEncodingState* destatep, int prune_diff) {
  2896. // Prune the list of active encoding families
  2897. int keep_prob = destatep->top_prob - prune_diff;
  2898. destatep->active_special = 0;
  2899. int k = 0;
  2900. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  2901. bool keep = true;
  2902. int rankedencoding = destatep->rankedencoding_list[j];
  2903. // If count is too low, ditch it
  2904. if (destatep->enc_prob[rankedencoding] < keep_prob) {keep = false;}
  2905. // Keep it. This will always keep at least top_prob rankedencoding
  2906. if (keep) {
  2907. destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
  2908. destatep->rankedencoding_list[k++] = rankedencoding;
  2909. }
  2910. }
  2911. destatep->rankedencoding_list_len = k;
  2912. }
  2913. // Recalculate reliable
  2914. void CalcReliable(DetectEncodingState* destatep) {
  2915. // Encoding result is reliable if big difference in top two, or if
  2916. // only Ascii7 ever encountered
  2917. // Also reliable if exactly one OtherPair and it's best encoding matches top
  2918. destatep->reliable = false;
  2919. if (destatep->next_interesting_pair[OtherPair] == 0) {
  2920. // Only 7-bit ASCII
  2921. destatep->reliable = true;
  2922. return;
  2923. }
  2924. if ((destatep->top_prob - destatep->second_top_prob) >=
  2925. FLAGS_ced_reliable_difference) {
  2926. destatep->reliable = true;
  2927. return;
  2928. }
  2929. if (destatep->next_interesting_pair[OtherPair] == 1) {
  2930. uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
  2931. uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
  2932. int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
  2933. if (best_enc == destatep->top_rankedencoding) {
  2934. destatep->reliable = true;
  2935. return;
  2936. }
  2937. }
  2938. // If we pruned to one encoding, we are done
  2939. if (destatep->rankedencoding_list_len == 1) {
  2940. destatep->reliable = true;
  2941. destatep->done = true;
  2942. return;
  2943. }
  2944. // If we pruned to two or three encodings in the same *superset/subset
  2945. // rankedencoding* and enough pairs, we are done. Else keep going
  2946. if (destatep->rankedencoding_list_len == 2) {
  2947. Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
  2948. Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
  2949. if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
  2950. if (destatep->prune_count >= 3) {
  2951. destatep->reliable = true;
  2952. destatep->done = true;
  2953. return;
  2954. }
  2955. }
  2956. } else if (destatep->rankedencoding_list_len == 3) {
  2957. Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
  2958. Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
  2959. Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
  2960. Encoding base0 = kMapEncToBaseEncoding[enc0];
  2961. Encoding base1 = kMapEncToBaseEncoding[enc1];
  2962. Encoding base2 = kMapEncToBaseEncoding[enc2];
  2963. if ((base0 == base1) && (base0 == base2)) {
  2964. if (destatep->prune_count >= 3) {
  2965. destatep->reliable = true;
  2966. destatep->done = true;
  2967. return;
  2968. }
  2969. }
  2970. }
  2971. }
  2972. // Find current top two rankedencoding probabilities
  2973. void FindTop2(DetectEncodingState* destatep,
  2974. int* first_renc, int* second_renc,
  2975. int* first_prob, int* second_prob) {
  2976. *first_prob = -1;
  2977. *second_prob = -1;
  2978. *first_renc = 0;
  2979. *second_renc = 0;
  2980. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  2981. int rankedencoding = destatep->rankedencoding_list[j];
  2982. if (*first_prob < destatep->enc_prob[rankedencoding]) {
  2983. *second_prob = *first_prob; // old top to second
  2984. *second_renc = *first_renc; // old top to second
  2985. *first_prob = destatep->enc_prob[rankedencoding];
  2986. *first_renc = rankedencoding;
  2987. } else if (*second_prob < destatep->enc_prob[rankedencoding]) {
  2988. *second_prob = destatep->enc_prob[rankedencoding];
  2989. *second_renc = rankedencoding;
  2990. }
  2991. }
  2992. }
  2993. void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) {
  2994. printf("Current ranked encoding list %s\n", str);
  2995. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  2996. int rankedencoding = destatep->rankedencoding_list[j];
  2997. if ((rankedencoding < 0) || (rankedencoding > NUM_RANKEDENCODING)) {
  2998. printf(" [%d] BOGUS rankedencoding = %d\n", j, rankedencoding);
  2999. } else {
  3000. printf(" [%d] rankedencoding = %d %-12.12s enc_prob = %d\n",
  3001. j, rankedencoding, MyRankedEncName(rankedencoding),
  3002. destatep->enc_prob[rankedencoding]);
  3003. }
  3004. }
  3005. printf("End current ranked encoding list\n\n");
  3006. }
  // Map unencoded bytes down to five bits, largely preserving letters.
  // ASCII letters of either case map to 1..26; every other byte below 0x80
  // maps to 0. Bytes 0x80..0xFF map to one of the "high" classes below.
  // This design struggles to put 33 values into 5 bits, which is why HO and
  // HU share the value 30.
  #define XX 0    // Punctuation (00-7F range)
  #define HA 27   // High vowel a in Latin1/2/sometimes7
  #define HE 28   // High vowel e
  #define HI 29   // High vowel i
  #define HO 30   // High vowel o
  #define HU 30   // High vowel u on top of HO
  #define Hc 31   // High consonant (80-FF range)
  static const char kMapToFiveBits[256] = {
    XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,   // 00-0F
    XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,   // 10-1F
    XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,   // 20-2F
    XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX,   // 30-3F

    XX, 1, 2, 3, 4, 5, 6, 7,  8, 9,10,11,12,13,14,15,   // 40-4F  @ A-O
    16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX,   // 50-5F  P-Z
    XX, 1, 2, 3, 4, 5, 6, 7,  8, 9,10,11,12,13,14,15,   // 60-6F  ` a-o
    16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX,   // 70-7F  p-z

    Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // 80-8F
    Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // 90-9F
    Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // A0-AF
    Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,   // B0-BF

    Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc,   // C0-CF
    Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc,   // D0-DF
    Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc,   // E0-EF
    Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc,   // F0-FF
  };
  // The class names are only needed to lay out the table above
  #undef XX
  #undef HA
  #undef HE
  #undef HI
  #undef HO
  #undef HU
  #undef Hc
  // Non-zero two-bit results of TrigramValue(); a result of 0 means dont-care.
  // BoostLatin127Trigrams() compares each trigram value against these.
  static const int kTriLatin1Likely = 1;
  static const int kTriLatin2Likely = 2;
  static const int kTriLatin7Likely = 3;
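  // Trigram lookup table used to tell Latin1, Latin2 and Latin7 apart.
  // The entry subscript is built from the five-bit codes of byte[0] and byte[1]
  // (1024 entries). Each 64-bit entry packs 32 two-bit values, and the five-bit
  // code of byte[2] selects which of those two-bit values applies.
  // Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc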
  3047. static const uint64 kLatin127Trigrams[1024] = {
  3048. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3049. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3050. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3051. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3052. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3053. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3054. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3055. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
  3056. 0x0000000000000000ULL, 0x304080c0402c3330ULL, 0x0008400004000000ULL, 0x082800000c200000ULL,
  3057. 0x23a0000420800030ULL, 0x00000000000ccc00ULL, 0x0500100100100000ULL, 0x0388400000200010ULL,
  3058. 0x0000000000000c00ULL, 0xd0f0300740f0cf00ULL, 0x2aa0a2a22882a2acULL, 0x081d800000000080ULL,
  3059. 0x0c82000020000000ULL, 0x200a03c000a00000ULL, 0x0008400400290000ULL, 0x0400870000000000ULL,
  3060. 0x00f040c00000c080ULL, 0x0008004000000410ULL, 0x0020300000000030ULL, 0x00a030002c300000ULL,
  3061. 0x0c8030c020a00000ULL, 0x15410030f0f4c000ULL, 0x3000000300a00000ULL, 0xa2880980a0880a88ULL,
  3062. 0x0900300000000000ULL, 0x0000040100300000ULL, 0x0888820020a00000ULL, 0xc044002242010000ULL,
  3063. 0x000000121d300040ULL, 0x40100040440c0d54ULL, 0x00008423102f8144ULL, 0x0b40808400000280ULL,
  3064. 0x0000000000000000ULL, 0x0680a000000c0000ULL, 0x0880008020aa0000ULL, 0x2aaa0141010a4940ULL,
  3065. 0xcb80000000010000ULL, 0x2280000000000000ULL, 0x5248000001800000ULL, 0x8000401004040010ULL,
  3066. 0x1540010201001010ULL, 0x0080080400000000ULL, 0x5a00044040000108ULL, 0x0288000282080008ULL,
  3067. 0x4800008002200000ULL, 0x4a00000000010100ULL, 0x8a88040080000800ULL, 0x0140800000000400ULL,
  3068. 0x40010050000c0000ULL, 0x0000008000000000ULL, 0x0028000020140040ULL, 0x8620401401005308ULL,
  3069. 0xc082000000000400ULL, 0x05c0b004c0240600ULL, 0x0288000080000000ULL, 0x0000014000000000ULL,
  3070. 0x00000000040000c0ULL, 0x8001861008004280ULL, 0x0200000000000300ULL, 0x0000240242288620ULL,
  3071. 0x801000c05434c200ULL, 0x9020162040a2d2b4ULL, 0x0021840000240704ULL, 0x2a80280080084908ULL,
  3072. 0x0000000000000000ULL, 0x0500004000000040ULL, 0x0080000000040000ULL, 0x0108058104440000ULL,
  3073. 0x0900000000040000ULL, 0x00c0000000208008ULL, 0x2000005000000000ULL, 0x0080000000050000ULL,
  3074. 0x0808000000001080ULL, 0x9880810100308000ULL, 0x2285480080081a08ULL, 0x8a80000080080000ULL,
  3075. 0x1450000000600010ULL, 0x2210000100000000ULL, 0x8a88000100011000ULL, 0x1541804000000010ULL,
  3076. 0xc084011140040100ULL, 0x0000000000000800ULL, 0x0400000000000030ULL, 0x2a800000a0890128ULL,
  3077. 0x1140a00054000104ULL, 0x1440000101200404ULL, 0x028800400400d800ULL, 0x0000000000000000ULL,
  3078. 0x0000000000002330ULL, 0x0020820228a02280ULL, 0xa2888a02aa8008a8ULL, 0xd0040a0044202500ULL,
  3079. 0x8000044104a29424ULL, 0xc000100178b2c5b4ULL, 0x0000810100241504ULL, 0xd040030000380008ULL,
  3080. 0x0000000000000000ULL, 0x26c08c0000200130ULL, 0x4a08000110080000ULL, 0x2aa0004001080800ULL,
  3081. 0x0aac000000004000ULL, 0x2000000000200000ULL, 0x4240000100020000ULL, 0x4100000080000000ULL,
  3082. 0x4900040000000000ULL, 0x0800000400300040ULL, 0x6a80000000040800ULL, 0x2a08182000588008ULL,
  3083. 0x0a00000c81000008ULL, 0x0a000c0010000000ULL, 0x8a88001080280808ULL, 0x0020000200300600ULL,
  3084. 0xaac00000900a0000ULL, 0x0000100004000000ULL, 0x0020081020000000ULL, 0x8220105010084110ULL,
  3085. 0x4a80800000004000ULL, 0x050000c0c0200000ULL, 0x288c000084000000ULL, 0xa048082280000000ULL,
  3086. 0x0000000000000000ULL, 0x8000900000032080ULL, 0xee889e81b8880820ULL, 0xc2200a8142800424ULL,
  3087. 0xc020141543361010ULL, 0x10a000204a801634ULL, 0x3a808800802a00a0ULL, 0x28808b00803d0800ULL,
  3088. 0x0000000000000000ULL, 0x0020000000000030ULL, 0x0808400121010040ULL, 0x0c28240100200040ULL,
  3089. 0x2008200028800000ULL, 0xc10004c80f30c030ULL, 0x0400440114100000ULL, 0x2208200280a22220ULL,
  3090. 0x0600000030c01000ULL, 0x1201001040c00000ULL, 0x0aa02ea22aa22aa0ULL, 0x30008000000200a0ULL,
  3091. 0x20c8400400800000ULL, 0x08280b0420800000ULL, 0x0800100000210000ULL, 0x10000300c0100400ULL,
  3092. 0xc8c0000420000000ULL, 0x1000000010000000ULL, 0x0420000400000000ULL, 0x0220000500204000ULL,
  3093. 0x2200000420000000ULL, 0x0000540400000000ULL, 0x0000000020000000ULL, 0x00080c00a0810080ULL,
  3094. 0x1540000000043000ULL, 0x0000000000100000ULL, 0x2e88a22220200a20ULL, 0xc06030e34ea503a0ULL,
  3095. 0x0001100204048500ULL, 0x000000e0000c0d54ULL, 0x3000820310a31400ULL, 0x13088c0320e00280ULL,
  3096. 0x0000000000000000ULL, 0x0480000000200000ULL, 0x4000200100000000ULL, 0x0000300040040000ULL,
  3097. 0x4400000000000000ULL, 0x0401000002240000ULL, 0x0540000000040000ULL, 0x4004010000000000ULL,
  3098. 0x4001111001100000ULL, 0x2880000000300040ULL, 0x4040004040002404ULL, 0x0200000000000000ULL,
  3099. 0x0140040000100000ULL, 0x4040010040040080ULL, 0x0a00140000041004ULL, 0x0000a00400808000ULL,
  3100. 0x1010200000430040ULL, 0x0010000000000000ULL, 0x0540000000104000ULL, 0x1400114005000000ULL,
  3101. 0x0000204000440010ULL, 0x0500000000004400ULL, 0x4500000018000400ULL, 0x0000400000000000ULL,
  3102. 0x000000300000cc00ULL, 0x0100001011300000ULL, 0x0040000000000000ULL, 0xc0e0000248a00444ULL,
  3103. 0x0000040020340144ULL, 0x0000046445105454ULL, 0x32a0a80280880128ULL, 0x0880040000100100ULL,
  3104. 0x0000000000000000ULL, 0x14003000030c0004ULL, 0x4a04001100000000ULL, 0x0a00108010000000ULL,
  3105. 0x28a8004000200248ULL, 0x0100040000b00000ULL, 0x42000000000008c0ULL, 0x6008044010550010ULL,
  3106. 0x0800401000010400ULL, 0x080080040cf80000ULL, 0x5080000001001010ULL, 0x2a80100000000000ULL,
  3107. 0xcc8010010d401100ULL, 0x0200000001001000ULL, 0x0480001004001000ULL, 0x8d00800040b40210ULL,
  3108. 0x6200800000300000ULL, 0x0000010000000000ULL, 0x0428004100010000ULL, 0x4320105141501100ULL,
  3109. 0xe28c0000000c1000ULL, 0xd5c000c3c0e00300ULL, 0x0001000000100200ULL, 0x1004010202400008ULL,
  3110. 0x0000000000003000ULL, 0x2aa038a0800aab08ULL, 0x2a88038000000000ULL, 0xc220040242f09720ULL,
  3111. 0x8020200200ba0420ULL, 0x0020106105101004ULL, 0x0480800000220400ULL, 0x2280100080000008ULL,
  3112. 0x0000000000000000ULL, 0x9000000000200000ULL, 0x0001000000100000ULL, 0x2aa40c0000080800ULL,
  3113. 0x0040000040010000ULL, 0x0040000000c01000ULL, 0x4000000040000400ULL, 0x0000001000200000ULL,
  3114. 0x0000010000000000ULL, 0x05808004000c0000ULL, 0x50400c0000000400ULL, 0x020040008f000040ULL,
  3115. 0x0800000000100000ULL, 0x0000000000000000ULL, 0x0a08440000004000ULL, 0x0064000400008200ULL,
  3116. 0x0010010010034170ULL, 0x0000000010000000ULL, 0x0100204021000000ULL, 0x022000d000010100ULL,
  3117. 0x0840300000c00000ULL, 0x1400000040204400ULL, 0x09800c0040000000ULL, 0x0209708000000000ULL,
  3118. 0x000000000000c040ULL, 0x90000c50204040a0ULL, 0x0000000000000000ULL, 0x00e1500040200004ULL,
  3119. 0x8020260540204494ULL, 0x0020026150201054ULL, 0x0281800380105634ULL, 0x0884900481105000ULL,
  3120. 0x0000000000000000ULL, 0x84203c00002c0200ULL, 0xc089040000000000ULL, 0xc2a8100040200004ULL,
  3121. 0xe00c1c0000000000ULL, 0x0ce1330080200080ULL, 0x0000000000200000ULL, 0xc400110000404010ULL,
  3122. 0x0088400000000000ULL, 0x00083cc00c00c00cULL, 0xcac01c00c000580cULL, 0xe300b0f000100000ULL,
  3123. 0x0300000000000000ULL, 0xc0000f0000000000ULL, 0xc3c01c0400000000ULL, 0x81008004c0f40000ULL,
  3124. 0xc3d8003000000440ULL, 0x0000000000000000ULL, 0xc430000000000000ULL, 0x0060000000001000ULL,
  3125. 0x0800000000000000ULL, 0x00c03300f0fc0008ULL, 0x3000000400200010ULL, 0xa2a80892a0880a28ULL,
  3126. 0x0500000040000004ULL, 0x0000000000000000ULL, 0xc80032070c200020ULL, 0x0220820060a296a0ULL,
  3127. 0x802084021db486a0ULL, 0x00000d60080c0080ULL, 0xb281803313a32428ULL, 0x1808300320300000ULL,
  3128. 0x0000000000000000ULL, 0x85208cc0ccac1f20ULL, 0x2081000186100808ULL, 0x22a80880000a0808ULL,
  3129. 0xaaa8086880000000ULL, 0x802084800a2e9200ULL, 0xa280000000002008ULL, 0xa000000080080400ULL,
  3130. 0x2080010000000008ULL, 0x802020c00c028c80ULL, 0x2080000000140810ULL, 0x2a80086080080008ULL,
  3131. 0x2a800000a8000800ULL, 0xaa881800a2080800ULL, 0xaa98004080280808ULL, 0x004483d0c0300000ULL,
  3132. 0xa280002080080000ULL, 0x0000000000300000ULL, 0x22a1030000000008ULL, 0xa8a0301088880880ULL,
  3133. 0xaa80002080222808ULL, 0x85400c03fc030400ULL, 0x8a88000000000008ULL, 0xa008008010080008ULL,
  3134. 0x0000000000010000ULL, 0x0040100000301040ULL, 0x28800000a0002008ULL, 0x122482306cbc0eacULL,
  3135. 0x8020224222b8c6a0ULL, 0x802002004a82c284ULL, 0x0aa08fc440a41c80ULL, 0x888080d181385098ULL,
  3136. 0x0000000000000000ULL, 0x00c0b000000c0080ULL, 0x2208001000000800ULL, 0x0a28000000200000ULL,
  3137. 0x0000000300000000ULL, 0x00c1040000200000ULL, 0x0203020000000000ULL, 0x0248000000020000ULL,
  3138. 0x0000840000100000ULL, 0x0a808c00c000008cULL, 0x5200040040000004ULL, 0x02000c00000080a0ULL,
  3139. 0x0b0c000020000000ULL, 0x0b04000001000000ULL, 0x088c0010002000c0ULL, 0x80e08b00c0030c20ULL,
  3140. 0x0280000200014040ULL, 0x0000000000000000ULL, 0x0e20a0a008000020ULL, 0x0e280fd03f00111cULL,
  3141. 0x200080c020001000ULL, 0x8cc00c02c02f0400ULL, 0x480c0001000c404cULL, 0x0208014281080808ULL,
  3142. 0x000000000000fcfcULL, 0x004403300cf00030ULL, 0x2200000000004400ULL, 0x02202000c08c0c20ULL,
  3143. 0x02202022683a80a0ULL, 0x4020228028008c00ULL, 0x32208cc0002c0200ULL, 0x3ec00c0080304008ULL,
  3144. 0x0000000000000000ULL, 0x34000c00002c0000ULL, 0x0b00000100100030ULL, 0x0823018000000000ULL,
  3145. 0x0e8c001c01e00000ULL, 0x1200800600330000ULL, 0x4000110000000000ULL, 0x0080000300000000ULL,
  3146. 0x0800000000000000ULL, 0x08c08c04000c0000ULL, 0x0080400000880000ULL, 0x0a08000080c00008ULL,
  3147. 0x0800000304400000ULL, 0x0208000000c00000ULL, 0x2888300080400800ULL, 0x8dc0204400000000ULL,
  3148. 0xc0000000c0800000ULL, 0x0000c10000000000ULL, 0x24000c4010c00000ULL, 0x272000541d811000ULL,
  3149. 0x0200400000001000ULL, 0x0400000400001004ULL, 0xc08c007004001000ULL, 0x2048004000000000ULL,
  3150. 0x000000000003fcfcULL, 0x2aa030000cf8c800ULL, 0xe280000000000000ULL, 0x0a21008142000340ULL,
  3151. 0x0021002000b61040ULL, 0x800004064006d444ULL, 0x3aa0800300230008ULL, 0x0b00030000300000ULL,
  3152. 0x0000000000000000ULL, 0x01c080000000040cULL, 0x0100000000004000ULL, 0x0aa8018010001000ULL,
  3153. 0x0800000000100000ULL, 0x3000000000008c00ULL, 0x5400000013000000ULL, 0x02c0c00004004010ULL,
  3154. 0x5241100010000c00ULL, 0x0e00080000000808ULL, 0x5281000000000800ULL, 0x0a08108020000800ULL,
  3155. 0x0a80000000005210ULL, 0x0100000041000000ULL, 0x2a88000002080110ULL, 0x8520800000c00080ULL,
  3156. 0x01000010108c0100ULL, 0x0000000000000000ULL, 0x42a0420080000000ULL, 0x0020001004010010ULL,
  3157. 0xc4000000000c0000ULL, 0x01000c00c0200400ULL, 0x4600000100000000ULL, 0x0000000000000000ULL,
  3158. 0x0010001000000010ULL, 0x910400900820d030ULL, 0x2280000000000000ULL, 0xc2212004400040e4ULL,
  3159. 0x8001000000b61420ULL, 0xa00002a248e810b4ULL, 0x32008000002c0008ULL, 0x0c010034803c5010ULL,
  3160. 0x0000000000000000ULL, 0x85008002002c0000ULL, 0x0204001000004010ULL, 0x0120008000200000ULL,
  3161. 0x000010000c2000c0ULL, 0xccc0000000200000ULL, 0x0400000c00100040ULL, 0x0003300100004100ULL,
  3162. 0x4000551040000004ULL, 0x0e0080000c820808ULL, 0xc000000000080800ULL, 0xc803000000000000ULL,
  3163. 0x0a4000c000200000ULL, 0x0040000000c00000ULL, 0x0918145000405000ULL, 0x81400000c0300400ULL,
  3164. 0x0050000000000000ULL, 0xd000045000000000ULL, 0x0400004000400000ULL, 0x0420104010000110ULL,
  3165. 0x0700000000203000ULL, 0x34800300c0e00704ULL, 0x4440100044000400ULL, 0x0040000040000000ULL,
  3166. 0x0030000044000000ULL, 0xeaaca0008808c880ULL, 0x0a01000000200000ULL, 0x1220a300403ccf20ULL,
  3167. 0x002024c200b61044ULL, 0x802014346aa2d434ULL, 0x30008c00c0820c44ULL, 0x0a000000000c4800ULL,
  3168. 0x0000000000000000ULL, 0x0000404000340c90ULL, 0x08a8a10820800280ULL, 0x8128009022201000ULL,
  3169. 0x0020808228a000a0ULL, 0x0020400100410000ULL, 0x0400000110000000ULL, 0xa609000000200000ULL,
  3170. 0x8008330000d00000ULL, 0x8060100040404010ULL, 0xeaa00ea0ea00808cULL, 0x200c8020a0000020ULL,
  3171. 0x0408800020200000ULL, 0x0189001403200000ULL, 0xc00800000000c000ULL, 0x200430c00c300000ULL,
  3172. 0x0100300100004000ULL, 0x0000040000000000ULL, 0x2420000400001000ULL, 0x89a1200400000000ULL,
  3173. 0x20c8a000208c0000ULL, 0x8080000000000000ULL, 0x28a0108020210080ULL, 0xa2a84800a0880988ULL,
  3174. 0x258008000400c000ULL, 0x0140000000100000ULL, 0xa028a222a0aa0228ULL, 0xc060012054044040ULL,
  3175. 0x0010010400000000ULL, 0x00000050150c0114ULL, 0x0000008010c20010ULL, 0xaa088000a0200880ULL,
  3176. 0x0000000000000000ULL, 0x0700b0c0000c0000ULL, 0x2200040000080030ULL, 0x2aa8808040240800ULL,
  3177. 0x08b0500000000100ULL, 0x1000830400200000ULL, 0x4204000010000000ULL, 0x40c2200050040050ULL,
  3178. 0x0104404001010000ULL, 0x1a808c8103c00030ULL, 0x30900010c0000b00ULL, 0x200812b283000008ULL,
  3179. 0x000c000020e00000ULL, 0x2140000000400000ULL, 0x0288000080200000ULL, 0x8060a200c8a20280ULL,
  3180. 0x0400114010215000ULL, 0x0000000000000000ULL, 0x082b200002000010ULL, 0x22a0030000031000ULL,
  3181. 0x008100001000000cULL, 0x05400c00c0230400ULL, 0xca3000003c080100ULL, 0x0000000020000004ULL,
  3182. 0x0000000100000000ULL, 0x8004320813f5c000ULL, 0xa280080200000800ULL, 0xc22000044e334c20ULL,
  3183. 0x000004146e361024ULL, 0x800126806aa0d584ULL, 0xb000a0040023c41cULL, 0x0a083000803053d8ULL,
  3184. 0x0000000000000000ULL, 0x0000100000020000ULL, 0x0000000010000010ULL, 0x0000000045040004ULL,
  3185. 0x0000000000100000ULL, 0x0000020400000010ULL, 0x0003015000000000ULL, 0x0400000000000000ULL,
  3186. 0x0000000400000000ULL, 0x0100000000000800ULL, 0x0000001000000000ULL, 0x0000000000000000ULL,
  3187. 0x0000000040000000ULL, 0x0000000000000000ULL, 0x0004001000000000ULL, 0x0008001000000000ULL,
  3188. 0x0010000000000004ULL, 0x0000010100001000ULL, 0x0004000000000004ULL, 0x0000014040050014ULL,
  3189. 0x0014000000000040ULL, 0x5540000000041000ULL, 0x0000000000000000ULL, 0x0000040000000d00ULL,
  3190. 0x0000000000000000ULL, 0x0000000000100000ULL, 0x0001000000000000ULL, 0x0000000000000000ULL,
  3191. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x4500000000040400ULL, 0x0000800000000400ULL,
  3192. 0x0000000000000000ULL, 0x13e080000020000cULL, 0xcf00001005100000ULL, 0x04a8008000200300ULL,
  3193. 0x00280100100000c0ULL, 0x1c8c000040200000ULL, 0x0600005000100000ULL, 0x050800000c104000ULL,
  3194. 0x4c10101000110000ULL, 0x0c00000000300000ULL, 0x22040c00100000c0ULL, 0x0800700010100000ULL,
  3195. 0x0000000000001000ULL, 0x0a08000010000040ULL, 0x0800034004210010ULL, 0x04e0000400000000ULL,
  3196. 0x0800030020000000ULL, 0x0000005000000000ULL, 0x0400110101304110ULL, 0x0428000010a01000ULL,
  3197. 0x060b000000800010ULL, 0x35810c00c020c000ULL, 0x00800c4321800000ULL, 0x4208088020000080ULL,
  3198. 0x040000111003ff00ULL, 0x0020900020202080ULL, 0x22888180a8000888ULL, 0x0225200542005420ULL,
  3199. 0x2020040400340020ULL, 0x10300424500cc444ULL, 0x3081a00400e00200ULL, 0x33001300c0300000ULL,
  3200. 0x0000000000000000ULL, 0x04003c0000000000ULL, 0x0a04001000100100ULL, 0x1408000001000000ULL,
  3201. 0x1800000044100000ULL, 0x3400040400000300ULL, 0x5000040801000040ULL, 0x4088401040000040ULL,
  3202. 0x1010110130100000ULL, 0xca800c3000300000ULL, 0x5a01000000080100ULL, 0x020280000cd01300ULL,
  3203. 0x0302000410200010ULL, 0x0000102000300000ULL, 0x0b09000000000000ULL, 0x20008004c4800004ULL,
  3204. 0x28c0410010000000ULL, 0x0004015041000050ULL, 0x0a01006000200200ULL, 0x0020d00000100040ULL,
  3205. 0x0010a00100900000ULL, 0x3500bf00c0030300ULL, 0x080c010000200d00ULL, 0x2248000004020010ULL,
  3206. 0x0000c00000000000ULL, 0x8044b00200e08000ULL, 0xaaa82aa2aa8a2aa8ULL, 0x0220002241c08604ULL,
  3207. 0x4200260440328444ULL, 0x68001226103008b4ULL, 0x3a0080c0b0000400ULL, 0x2a804804803c4008ULL,
  3208. 0x0000000000000000ULL, 0x04008c0300000400ULL, 0x008000c0000c0000ULL, 0x088001000000001cULL,
  3209. 0x0840000001000010ULL, 0x0400000000200c00ULL, 0x4244000101040000ULL, 0x4238007011100000ULL,
  3210. 0x1000d00100000010ULL, 0x1d00800400300000ULL, 0x4204080c00000000ULL, 0x2a88080080000008ULL,
  3211. 0x08001c0200001000ULL, 0x0a00000400000000ULL, 0x8a88003080080000ULL, 0x0521800400300000ULL,
  3212. 0x3200051000201000ULL, 0x0000000000000000ULL, 0x0020801404000000ULL, 0x322010401c0c101cULL,
  3213. 0x0c01100013000000ULL, 0x04003000c0204000ULL, 0x088c0020a0cc0000ULL, 0x2200000080000018ULL,
  3214. 0x0404000044000000ULL, 0x82a0b000008820b0ULL, 0x0000040020440000ULL, 0xc2650004403f1420ULL,
  3215. 0x0021340241b64464ULL, 0x8020040242c2d474ULL, 0x32018c0480288000ULL, 0x00800b0080300000ULL,
  3216. 0x0000000000000000ULL, 0x05008c0000040130ULL, 0xc0d8000000800000ULL, 0x0020000020200200ULL,
  3217. 0x23a2000120204000ULL, 0x5052100550104150ULL, 0x1000101100040000ULL, 0xc40001c301000000ULL,
  3218. 0x8288000000c00000ULL, 0x5150040144d01404ULL, 0xea8c0ea028ae088cULL, 0xc31010c000000c80ULL,
  3219. 0x0002000060000000ULL, 0xc80800f030000000ULL, 0x0000000400300000ULL, 0xc00080c00ff0c344ULL,
  3220. 0x00080001200c0000ULL, 0x0000050080000000ULL, 0x0328000300300000ULL, 0x082030000cc01040ULL,
  3221. 0xeb08800100004000ULL, 0x8030003300c80f00ULL, 0xfb0d0000e4ac0000ULL, 0x0020006080000008ULL,
  3222. 0x0500100100040000ULL, 0x1140000000000000ULL, 0xcb883330a0e00000ULL, 0xc000010050000080ULL,
  3223. 0x0010104005b54150ULL, 0x40111d5155001554ULL, 0x80000070140f0004ULL, 0x0b0830c3a0003380ULL,
  3224. 0x0000000000000000ULL, 0x04c13000000f830cULL, 0x2808000000000000ULL, 0x2810000000000800ULL,
  3225. 0x08c0080004400000ULL, 0x04c0240300801c20ULL, 0x4040000080000004ULL, 0x0000400100100010ULL,
  3226. 0x020001008000c0c0ULL, 0x1d008c000c3c0000ULL, 0x0080003000000800ULL, 0x2288080080000008ULL,
  3227. 0x0a84004020220000ULL, 0x0800080000100000ULL, 0xaa80004080400008ULL, 0x8024000400c01660ULL,
  3228. 0x80841c2001000104ULL, 0x0001000000000000ULL, 0x0020028020020280ULL, 0x0860404011900100ULL,
  3229. 0xec80080200000000ULL, 0x010103c100200400ULL, 0x0200004000000000ULL, 0x0000000000400400ULL,
  3230. 0x000010000003fcfcULL, 0x8040083238c20000ULL, 0x08800220a0920a00ULL, 0x08210004483c0c24ULL,
  3231. 0xc020240740b0a200ULL, 0x802006014a201494ULL, 0x3201233070ac0e00ULL, 0x08002806033a48a0ULL,
  3232. 0x0000000000000000ULL, 0x8020820028a00680ULL, 0x2000002000000104ULL, 0x22a80801100a0808ULL,
  3233. 0xa2a8002080000000ULL, 0xa000800008a08000ULL, 0x0000100000400000ULL, 0x8000002100000000ULL,
  3234. 0x0000010000004404ULL, 0xa2a0088080000888ULL, 0x0000000010400800ULL, 0xa280082080080008ULL,
  3235. 0x2280000080010008ULL, 0x2000000000000000ULL, 0x228800008c080808ULL, 0x8021828002a98200ULL,
  3236. 0xa200002000080000ULL, 0x0000040000000000ULL, 0x22a0000080000000ULL, 0x202882c200800080ULL,
  3237. 0xa000000001004000ULL, 0x000000c808a00600ULL, 0x0000000010000000ULL, 0x000001000000040cULL,
  3238. 0x0000000000000000ULL, 0x802002a2a8aa82a0ULL, 0x20000024a8088228ULL, 0x8020820001000000ULL,
  3239. 0x8020000000808280ULL, 0x8000000000000000ULL, 0x0020800000200280ULL, 0x2080082280a00888ULL,
  3240. 0x0000000000000000ULL, 0x0000015000000040ULL, 0x0000040000040000ULL, 0x0100010010001000ULL,
  3241. 0x0000003210008000ULL, 0x0000000404000000ULL, 0x0000000000000400ULL, 0x0200000000000000ULL,
  3242. 0x0000000000000100ULL, 0x5180014400004050ULL, 0x1000000014000000ULL, 0x4200000000000000ULL,
  3243. 0x0040200000000000ULL, 0x0201004000000000ULL, 0x0a00000000000010ULL, 0x0040200000800000ULL,
  3244. 0x0040051000000500ULL, 0x0000000100800400ULL, 0x6000000000000000ULL, 0x0000000000000000ULL,
  3245. 0x280000c1400040ccULL, 0x4180001000000000ULL, 0x00000000c1000104ULL, 0x0000000000000000ULL,
  3246. 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0080000000c00000ULL, 0x0004006066004000ULL,
  3247. 0x0000005000040440ULL, 0x0000106005804044ULL, 0x0000a10511004440ULL, 0x0000000000000110ULL,
  3248. 0x0000000000000000ULL, 0x0000000000080000ULL, 0xeb0808a020800080ULL, 0x29a80081002a1800ULL,
  3249. 0x0b2c000202100100ULL, 0x0001000000888000ULL, 0x2280102010000000ULL, 0x020000602a004110ULL,
  3250. 0x8a800160a6108100ULL, 0x0280000000000020ULL, 0x8a8000a0a8808208ULL, 0x0280882080500308ULL,
  3251. 0x0b18010020804100ULL, 0xeb080000c0080080ULL, 0x2b08000000810130ULL, 0x0000000008040020ULL,
  3252. 0xaa0a08e082894140ULL, 0x0000000000000000ULL, 0x202081409010001cULL, 0x8aa8805082806000ULL,
  3253. 0xeb082900289c0000ULL, 0x0000000000008000ULL, 0xf80c2e20002e0000ULL, 0xa288080420880888ULL,
  3254. 0x0000010000000000ULL, 0x0000000000102000ULL, 0x22880000a8a80808ULL, 0x022022a22aa880a0ULL,
  3255. 0x0000222222aa0620ULL, 0x0000022002800000ULL, 0x208080004028a000ULL, 0x2b888800801c0828ULL,
  3256. 0x0000000000000000ULL, 0x22e0828280a08028ULL, 0xaa88002082080308ULL, 0x0ea80080410a0040ULL,
  3257. 0x2a28222000a00000ULL, 0x8aa2808028a0a2a0ULL, 0x0200001000000000ULL, 0x82080000a0000000ULL,
  3258. 0x8800000082000808ULL, 0x2a008a0000300888ULL, 0x0a80080080080808ULL, 0xaa882800840b0808ULL,
  3259. 0x0a80000080000040ULL, 0xea080820a0000000ULL, 0xaa88080080080808ULL, 0x8040a2800a8024a0ULL,
  3260. 0xaa800020a0080808ULL, 0x0000040000000000ULL, 0x2a280a0080080880ULL, 0x2a20081080008a00ULL,
  3261. 0x2a88882088aa0008ULL, 0x81800202c0a01480ULL, 0xea88082082200000ULL, 0xaa88002080080008ULL,
  3262. 0x0000100000000000ULL, 0x802082a22aa0a2a0ULL, 0x2e80000000000000ULL, 0x0220a2a26aa0a2a8ULL,
  3263. 0x800022a2228a22a0ULL, 0x880002212e82c0b0ULL, 0x02a0aa0002a82228ULL, 0x2d808b0080380008ULL,
  3264. 0x0000000000000000ULL, 0x000407551c154244ULL, 0x2a00208088a02228ULL, 0x12a82182a2402a88ULL,
  3265. 0xe32821e020826d00ULL, 0x801130100ccc1330ULL, 0x028010c000841008ULL, 0x88a08002a0a664a0ULL,
  3266. 0x0048270080000100ULL, 0x00001f010cd10f30ULL, 0xe2242ce22aaea2a0ULL, 0xc2c00cc20ae22460ULL,
  3267. 0xe208003128021c10ULL, 0x2a2021c010821080ULL, 0x2a88202082202020ULL, 0x4010111104941410ULL,
  3268. 0xc80c02c182b00080ULL, 0x0000040000000000ULL, 0xe28030068002c300ULL, 0x2aa02024a2a22228ULL,
  3269. 0xe20889328aa22080ULL, 0x0000000000210100ULL, 0xaa0028e0a9b221a0ULL, 0x2000008080400000ULL,
  3270. 0x0000010041150404ULL, 0x0000105114410100ULL, 0xeaa82aa6aaaaaaa8ULL, 0x000000f44300c434ULL,
  3271. 0x0000222222b00020ULL, 0x0000002000000000ULL, 0x0000004014000000ULL, 0x0039b3f73fbcd3fcULL,
  3272. 0x0000000000000000ULL, 0x0000104015045040ULL, 0x20a80490a08800a0ULL, 0x40a8258410a909a0ULL,
  3273. 0xe0a8a2022aa2e2a0ULL, 0xc111010014000500ULL, 0x2080044041840004ULL, 0x28a8200220a2aba0ULL,
  3274. 0x008400a0a2840800ULL, 0x0101015451009464ULL, 0x20000ea0e02c2c2cULL, 0xe2a828a2aca2aaa8ULL,
  3275. 0x682020a228a222a0ULL, 0xe8882ae22aa2a2a0ULL, 0xe9a80e6022a24140ULL, 0x0011055005001040ULL,
  3276. 0x2aa8208229a0aaa4ULL, 0x0000040000000000ULL, 0x28a0228026a62260ULL, 0xe2a020a422a2a020ULL,
  3277. 0xe808a0022aa1a220ULL, 0x0000010014000100ULL, 0x28ac22802aa2a020ULL, 0x0020000000000000ULL,
  3278. 0x0100010100040000ULL, 0x0000000000000000ULL, 0x22a822a22a8aaaa0ULL, 0x0000000000000000ULL,
  3279. 0x0000102410800100ULL, 0x0000000000000000ULL, 0x0000000002000000ULL, 0x00000fb2a08c0aa8ULL,
  3280. 0x0000000000000000ULL, 0x4010005015440140ULL, 0x18c81c00b180001cULL, 0x2800048021820800ULL,
  3281. 0x8ab820c06a802580ULL, 0x00100170f4040000ULL, 0x4000144041041404ULL, 0x0ac800d0002e440cULL,
  3282. 0x20880820a2000808ULL, 0x400000f03f300c00ULL, 0xaa000ea22aa22aa0ULL, 0xa2880ac0a8942a20ULL,
  3283. 0xaa880a81a1804188ULL, 0xeea022a0aaa02080ULL, 0xaaa820a2aaa66120ULL, 0x0000005115800150ULL,
  3284. 0x2a880920a0840040ULL, 0x0000040000000000ULL, 0xaea82222aaa22a28ULL, 0x8a28041260055150ULL,
  3285. 0xa28824008aa28880ULL, 0x0000025014019000ULL, 0xea882ae02aa200a0ULL, 0x0000000000000000ULL,
  3286. 0x0000000040000400ULL, 0x0000000000000000ULL, 0xaaa82aa22aaaaaa0ULL, 0x0000000000000000ULL,
  3287. 0x0000000000000000ULL, 0x002003003c80c000ULL, 0x0000020014000000ULL, 0x00200010a0980a20ULL,
  3288. 0x0000000000000000ULL, 0x0020001200801240ULL, 0x0a88000089800020ULL, 0xcaa00080a1000000ULL,
  3289. 0x0a200c0020a04080ULL, 0x4002034003840880ULL, 0x4690500190000050ULL, 0x2228004000601000ULL,
  3290. 0x0a803f00803f400cULL, 0x400033e24dd0cf34ULL, 0xaa80a2a229a220a0ULL, 0x0a224000002c0000ULL,
  3291. 0x028000202000008cULL, 0x0a08000070000030ULL, 0x00800c040020000cULL, 0x0000000002850000ULL,
  3292. 0x02881cc310200000ULL, 0x0000040004000000ULL, 0xcba8000400000080ULL, 0xcaa02c0680000000ULL,
  3293. 0xcc880002008c4080ULL, 0x300000f007f0cf0cULL, 0x0a80001080a00000ULL, 0x820880802a880a80ULL,
  3294. 0x0000050001040004ULL, 0x0000011000000000ULL, 0x0a8020a2a0202000ULL, 0x0000022202008000ULL,
  3295. 0x0000222212808000ULL, 0x0020226010000000ULL, 0x000033f33ff3c33cULL, 0x00288002a08c02a8ULL,
  3296. 0x0000000000000000ULL, 0x04408e0000008200ULL, 0x0808004000900000ULL, 0x0aa8200010ca00c0ULL,
  3297. 0x0ba80101005d4010ULL, 0x00018604802c8288ULL, 0x00049400101c0000ULL, 0x000c101110505010ULL,
  3298. 0x0000000000100000ULL, 0x30000c00c022000cULL, 0xd0c00dd0d51d431cULL, 0x0008000010100000ULL,
  3299. 0x000c1001a0280000ULL, 0x0bc80000c0000000ULL, 0x0a00000080280000ULL, 0x8000a00220308420ULL,
  3300. 0x0808000010301000ULL, 0x0000040000000000ULL, 0x0d00031480100000ULL, 0x07200000108c0300ULL,
  3301. 0x0bc0a0c000004000ULL, 0x8000b002c0208480ULL, 0x340c0100118c111cULL, 0x8008008020890000ULL,
  3302. 0x0000000000040010ULL, 0x0020b00320c1d0b0ULL, 0x00002000000c0000ULL, 0x0020be226e2008a0ULL,
  3303. 0x002010c03fb0a6a0ULL, 0x00202e222aaec284ULL, 0x00008f0000208400ULL, 0x0000000000300000ULL,
  3304. };
  3305. // Latin1 6%, Latin2 11%, Latin7 3%
  // Just for debugging. not thread-safe: every call overwrites and returns the
  // same static buffer.
  static char tri_string[4];

  // Render a 15-bit trigram subscript (three packed five-bit codes, first byte
  // in the highest five bits) as a three-character NUL-terminated string.
  // Code 0 prints as '_', 1..26 as 'a'..'z', and 27..31 as 'A','E','I','O','C'.
  // Bits above the low 15 are ignored.
  char* Latin127Str(int trisub) {
    static const char kCodeToChar[] = "_abcdefghijklmnopqrstuvwxyzAEIOC";
    for (int pos = 0; pos < 3; ++pos) {
      const int shift = 5 * (2 - pos);  // 10, 5, 0
      tri_string[pos] = kCodeToChar[(trisub >> shift) & 0x1f];
    }
    tri_string[3] = '\0';
    return tri_string;
  }
  3315. // Returns two bits per three-byte trigram, indicating
  3316. // dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely
  3317. int TrigramValue(const uint8* trisrc) {
  3318. int byte0_p = kMapToFiveBits[trisrc[0]];
  3319. int byte1_p = kMapToFiveBits[trisrc[1]];
  3320. int byte2_p = kMapToFiveBits[trisrc[2]];
  3321. int subscr = ((byte0_p) << 5) | byte1_p;
  3322. int temp = static_cast<int>((kLatin127Trigrams[subscr] >> (byte2_p * 2)));
  3323. //printf("%s=%d ", Latin127Str((subscr << 5) | byte2_p), temp & 3);
  3324. return temp & 3;
  3325. }
  3326. // Put out trigrams for surrounding 32 bytes for Latin encodings
  3327. // Return true if more Latin2 & 7 than Latin1
  3328. bool BoostLatin127Trigrams(int tri_block_offset,
  3329. DetectEncodingState* destatep) {
  3330. //printf("BoostLatin127Trigrams[%06x]\n", tri_block_offset);
  3331. int excess_latin27 = 0;
  3332. int srclen = destatep->limit_src - destatep->initial_src;
  3333. int hi_limit = minint(tri_block_offset + 32, srclen - 2);
  3334. const uint8* trisrc = &destatep->initial_src[tri_block_offset];
  3335. const uint8* trisrclimit = &destatep->initial_src[hi_limit];
  3336. while (trisrc < trisrclimit) {
  3337. // Selectively boost Latin1, Latin2, or Latin7 and friends
  3338. int trigram_val = TrigramValue(trisrc);
  3339. if (trigram_val != 0) {
  3340. if (FLAGS_enc_detect_source) {
  3341. PsHighlight(trisrc, destatep->initial_src, trigram_val, 1);
  3342. }
  3343. if (trigram_val == kTriLatin1Likely) {
  3344. Boost(destatep, F_Latin1, kTrigramBoost);
  3345. Boost(destatep, F_CP1252, kTrigramBoost);
  3346. // We don't want to upset the relative rank of a declared 8859-15
  3347. Boost(destatep, F_ISO_8859_15, kTrigramBoost);
  3348. --excess_latin27;
  3349. } else if (trigram_val == kTriLatin2Likely) {
  3350. Boost(destatep, F_Latin2, kTrigramBoost);
  3351. Boost(destatep, F_CP1250, kTrigramBoost);
  3352. ++excess_latin27;
  3353. } else if (trigram_val == kTriLatin7Likely) {
  3354. Boost(destatep, F_ISO_8859_13, kTrigramBoost);
  3355. Boost(destatep, F_CP1257, kTrigramBoost);
  3356. // We don't want to upset the relative rank of a declared 8859-4 or -6
  3357. // for Estonian
  3358. Boost(destatep, F_Latin4, kTrigramBoost);
  3359. Boost(destatep, F_Latin6, kTrigramBoost);
  3360. ++excess_latin27;
  3361. }
  3362. }
  3363. ++trisrc;
  3364. }
  3365. //printf("\n");
  3366. return (0 < excess_latin27);
  3367. }
// Boost any encodings that need extra detection help, then prune the list of
// active ranked encodings.
//   src          first unscanned byte
//   destatep     detection state; updated in place
//   prunereason  PRUNE_NORMAL, PRUNE_SLOWEND, or PRUNE_FINAL
//     slowend means extra pruning when dropping out of initial slow scan
//     final means last call -- no bigram at src
//
// The body runs in phases, in this order:
//   (final only) derate the initial hints if little data was accumulated
//   INCREMENT    score every OtherPair bigram added since the previous call
//   BOOST        ActiveSpecialBoostWhack, if AnyActive()
//   (final only) pre-prune adjustments for UTF-8, BINARY, ISO-2022-xx
//   PRUNE        ReRank, then drop encodings too far below the top one
//   (final only) post-prune forcing of ASCII-7-bit / ISO-8859-x vs. CP125x
//   FINISH       set destatep->reliable and destatep->done
void BoostPrune(const uint8* src, DetectEncodingState* destatep,
                int prunereason) {
  // Number of pairs of each kind recorded since the previous call
  int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
    destatep->prior_interesting_pair[AsciiPair];
  int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
    destatep->prior_interesting_pair[OtherPair];

  if (prunereason == PRUNE_FINAL) {
    // We are about done
    // If we get here with very little accumulated data, the initial hints
    // were too strong, so we derate them to n+1 / 12 for n bigrams
    // (the denominator used in the code is kDerateHintsBelow)
    if (!destatep->hints_derated &&
        (destatep->next_interesting_pair[OtherPair] < kDerateHintsBelow)) {
      int n = destatep->next_interesting_pair[OtherPair];

      // Map N pairs to (N+1)/12 portions of the initial hints, etc.
      // Floor of 3/12 -- 1/12 and 2/12 are too easy to overcome
      int m = maxint(3, (n + 1));
      for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
        // Replace the full hint contribution with its scaled-down version
        int original_delta = destatep->hint_prob[i];
        int scaled_delta = (original_delta * m) / kDerateHintsBelow;
        destatep->enc_prob[i] -= original_delta;
        destatep->enc_prob[i] += scaled_delta;
      }
      // Only derate once
      destatep->hints_derated = true;
      if (destatep->debug_data != NULL) {
        // Show derated-hint result
        char buff[32];
        snprintf(buff, sizeof(buff), "Hints %d/%d", m, kDerateHintsBelow);
        SetDetailsEncLabel(destatep, buff);
      }
    }
  }

  ++destatep->prune_count;

  if (prunereason != PRUNE_FINAL) {
    // Early outs
    if (destatep->rankedencoding_list_len <= 1) {  // nothing to prune
      destatep->done = true;
      return;
    }

    // NOTE(review): prune_count was incremented just above, so the first
    // test is expected to always be true here; the effective condition is
    // "no new pairs since the previous call".
    if ((destatep->prune_count > 0) &&
        (delta_asciipairs + delta_otherpairs) == 0) {
      // Nothing to do; must have just been called earlier
      return;
    }
  }

  // INCREMENT
  // ====================
  // Accumulate OtherPair probabilities over all active families
  // AsciiPair probabilities are all done in ActiveSpecialBoostWhack
  uint8 prior_bad_byte1 = ' ';    // won't match first bad pair
  uint8 prior_bad_byte2 = ' ';    // won't match first bad pair
  uint8 or_byte1 = 0;             // Track if any current pair has a high bit
  int counted_otherpairs = 0;     // Pairs actually scored (not repeats)
  uint8 prior_byte1x2x = 0;       // byte1x2x of the previous pair in this call
  for (int i = 0; i < delta_otherpairs; ++i) {
    // Increments applied to the two watched encodings, for detail2 output
    int watch1_incr = 0;
    int watch2_incr = 0;
    int next_pair = destatep->prior_interesting_pair[OtherPair] + i;

    uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
    uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
    // High nibble of byte1 combined with high nibble of byte2
    uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
    // Every increment/whack for this pair is right-shifted by weightshift
    int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
    int offset_byte12 = destatep->interesting_offsets[OtherPair][next_pair];

    // To help distinguish some Cyrillic, Arabic, Greek, Hebrew, Thai
    // Remember if this is a CDEF pair immediately following the previous pair
    // 8xxx CxCx or CxCx 8xxx
    bool next_pair_consec_hi = false;
    if (ConsecutivePair(destatep, next_pair)) {
      if ((byte1x2x & 0xcc) == 0xcc) {                  // 8xxx CxCx
        next_pair_consec_hi = true;
      } else if ((prior_byte1x2x & 0xcc) == 0xcc) {     // CxCx 8xxx
        next_pair_consec_hi = true;
      }
    }
    //printf("prior/cur/consec %02x %02x %d\n",
    //       prior_byte1x2x, byte1x2x, next_pair_consec_hi);
    prior_byte1x2x = byte1x2x;

    or_byte1 |= byte1;
    uint8 byte1f = byte1;
    // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
    byte1f ^= (byte2 & 0x80);

    // If the same bigram occurred recently, don't increment again
    bool pair_used = false;
    if (!RepeatedBigram(destatep, byte1, byte2)) {
      ++counted_otherpairs;
      pair_used = true;
      // Boost both charset= declared encodings, so
      // Nearly-same probability nearby encoding doesn't drift to the top
      if (!FLAGS_demo_nodefault) {
        destatep->enc_prob[destatep->declared_enc_1] +=
            kDeclaredEncBoost >> weightshift;
        destatep->enc_prob[destatep->declared_enc_2] +=
            kDeclaredEncBoost >> weightshift;
      }
      // Set if this pair was whacked as illegal in at least one encoding
      bool was_bad_pair = false;
      for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
        int incr_shift = 0;
        int rankedencoding = destatep->rankedencoding_list[j];
        Encoding enc = kMapToEncoding[rankedencoding];

        // For binary, Skip over repeated marker bytes, such as 02, FF, etc.
        if ((rankedencoding == F_BINARY) &&
            RepeatedBinary(destatep, byte1, byte2)) {
          incr_shift = 2;  // count 1/4 as much if repeated
        }

        // If byte 1x2x for this encoding is exactly zero, illegal byte pair
        // Don't increment, but instead penalize
        const UnigramEntry* ue = &unigram_table[rankedencoding];
        if (ue->b12[byte1x2x] == 0) {
          // Don't whack consecutive duplicate bad pairs -- overkill
          if ((byte1 != prior_bad_byte1) || (byte2 != prior_bad_byte2)) {
            // Extra whack for illegal pair in this encoding
            Whack(destatep, rankedencoding, kBadPairWhack >> weightshift);
            was_bad_pair = true;
          }
        } else {
          // OK to do the real increment
          int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
          // Low bit of the b12 entry selects the hires table for this pair
          if ((ue->b12[byte1x2x] & 0x01) != 0) {
            // Use a more-precise table
            int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
            int hiressub = (byte2 & 0x60) >> 5;   // select w/bits 5&6 of byte 2
            DCHECK(ue->hires[hiressub] != NULL);
            incr += ue->hires[hiressub][byte32x32];
          } else {
            // Default final offset
            incr += ue->so;
          }
          incr >>= incr_shift;

          incr >>= weightshift;
          destatep->enc_prob[rankedencoding] += incr;   // The actual increment

          if (FLAGS_enc_detect_detail2) {
            if (watch1_rankedenc == rankedencoding) {watch1_incr = incr;}
            if (watch2_rankedenc == rankedencoding) {watch2_incr = incr;}
          }
        }

        // If consecutive pair of high bytes, give slight boost to one-byte
        // encodings that have a full alphabet in the high bytes
        if (next_pair_consec_hi && HighAlphaEncoding(enc)) {
          Boost(destatep, rankedencoding, kDeclaredEncBoost >> weightshift);
        }
      }     // End for j < rankedencoding_list_len

      // Remember this bad pair so an immediate duplicate is not whacked again
      if (was_bad_pair) {
        prior_bad_byte1 = byte1;
        prior_bad_byte2 = byte2;
      }

      // Fold in per-bigram most likely encoding for first N bigrams
      if (next_pair < kBestPairsCount) {
        int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
        Boost(destatep, best_enc, kBestEncBoost >> weightshift);
      }

      // Possibly score 32 trigrams around a bigram to better separate
      // Latin1 from Latin2 and Latin7. Especially helpful for detecting
      // mis-labelled Hungarian latin2.
      // If looking and at bigram 0,8,16,... do full scoring, else just 1 tri
      if (destatep->do_latin_trigrams ||
          destatep->looking_for_latin_trigrams) {
        // If just looking, do full scan every 8 times
        // Just look up one trigram the other 7 and do full scan if Latin2,7
        bool scan32 = false;
        // Trigram starts one byte before the bigram
        // NOTE(review): for offset_byte12 == 0 this points before initial_src;
        // confirm callers never reach the TrigramValue/PsHighlight uses then.
        const uint8* trisrc = &destatep->initial_src[offset_byte12 - 1];
        if (!destatep->do_latin_trigrams) {
          // Also force the full scan when fewer than 3 bytes remain at trisrc
          if ((i & 7) == 0 || trisrc + 3 > destatep->limit_src) {
            scan32 = true;
          } else {
            scan32 = (kTriLatin1Likely < TrigramValue(trisrc));
          }
        }
        if (destatep->do_latin_trigrams || scan32) {
          // Just score each block of 32 bytes once
          int tri_block_offset = offset_byte12 & ~0x1f;
          if (destatep->trigram_highwater_mark <= tri_block_offset) {
            bool turnon = BoostLatin127Trigrams(tri_block_offset, destatep);
            if (FLAGS_counts && !destatep->do_latin_trigrams && turnon) {
              ++doing_used;    // First time
            }
            if (FLAGS_enc_detect_source) {
              if (!destatep->do_latin_trigrams && turnon) {
                // First time
                PsHighlight(trisrc, destatep->initial_src, 0, 2);
              }
            }
            // Once turned on, trigram scoring stays on
            destatep->do_latin_trigrams |= turnon;
            destatep->trigram_highwater_mark = tri_block_offset + 32;
          }
        }
      }

    }       // end if RepeatedBigram()

    // Keep track of initial byte high 3 bits
    ++destatep->byte32_count[byte1 >> 5];

    // TODO: boost subset/superset also
    // Boost(destatep, kRelatedEncoding[best_enc], kBestEncBoost);

    if (destatep->debug_data != NULL) {
      // Show detail entry for this bigram
      // Unused (repeated) pairs are bracketed; '-' marks a nonzero weightshift
      char buff[16];
      snprintf(buff, sizeof(buff), "%c%02x%02x%c%c",
               pair_used ? ' ' : '[',
               byte1,
               byte2,
               pair_used ? ' ' : ']',
               (weightshift == 0) ? ' ' : '-');

      SetDetailsEncProb(destatep,
                        destatep->interesting_offsets[OtherPair][next_pair],
                        kMostLikelyEncoding[(byte1 << 8) + byte2],
                        buff);
    }
    if (FLAGS_enc_detect_detail2) {
      if ((watch1_incr != 0) || (watch2_incr != 0)) {
        // Show increment detail for this encoding
        char buff[32];
        snprintf(buff, sizeof(buff), "%c%d %c%d",
                 (watch1_incr < 0) ? '-' : '+', watch1_incr,
                 (watch2_incr < 0) ? '-' : '+', watch2_incr);
        SetDetailsEncLabel(destatep, buff);
      }
    }
  }         // End for i

  // If no high bit on, demote all the two-byte codes
  // WAS BUG. This was inside the loop above and should be outside
  if ((counted_otherpairs > 0) && ((or_byte1 & 0x80) == 0)) {
    // No high bit in this group (just 02xx, etc.). Whack 2-byte codes
    // This keeps SJS from creeping past Latin1 on illegal C0 bytes
    for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
      int rankedencoding = destatep->rankedencoding_list[j];
      Encoding enc = kMapToEncoding[rankedencoding];
      if (TwoByteEncoding(enc)) {
        Whack(destatep, rankedencoding, kGentlePairWhack * counted_otherpairs);
      }
    }
  }

  // BOOST
  // ====================
  if (AnyActive(destatep)) {
    ActiveSpecialBoostWhack(src, destatep);
  }

  // Update for next time
  destatep->prior_src = src;
  destatep->prior_interesting_pair[AsciiPair] =
    destatep->next_interesting_pair[AsciiPair];
  destatep->prior_interesting_pair[OtherPair] =
    destatep->next_interesting_pair[OtherPair];

  // Do any pre-prune final adjustments
  // ====================
  if (prunereason == PRUNE_FINAL) {
    // If UTF8 not in base state, whack
    if (destatep->next_utf8_ministate != 0) {
      Whack(destatep, F_UTF8, kGentlePairWhack * 2 * 1);
    }
    // If UTF8UTF8 not in base state, whack
    if (destatep->next_utf8utf8_ministate != 0) {
      Whack(destatep, F_UTF8UTF8, kGentlePairWhack * 2 * 1);
    }

    // If no valid UTF-8 char ever seen, whack
    if (destatep->utf8_minicount[5] == 0) {
      Whack(destatep, F_UTF8, kBadPairWhack * 8);           // No sequence
      Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);       // No sequence
    }

    // If no valid UTF8UTF8 char ever seen, whack
    if (destatep->utf8utf8_minicount[5] == 0) {
      Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);       // No sequence
    }

    // If not all four binary quadrants, whack BINARY;
    // worth 2 pair if 3 quads, 4 pair if 1 or 2 quads
    if (destatep->binary_quadrants_count < 4) {
      if (destatep->binary_quadrants_count == 3) {
        Whack(destatep, F_BINARY, kBadPairWhack * 2);
      } else {
        Whack(destatep, F_BINARY, kBadPairWhack * 4);
      }
    }

    // If 1st pair is 1b24, choose between ISO-2022-xx
    //  <esc> $ ) C ISO-2022-KR   [1b 24 29 43]
    //  <esc> $ ) A ISO-2022-CN   [1b 24 29 41]
    //  <esc> $ ) G ISO-2022-CN   [1b 24 29 47]
    //  <esc> $ * H ISO-2022-CN   [1b 24 2a 48]
    //  <esc> ( B ISO-2022-JP     [1b 28 42]  to ASCII
    //  <esc> ( J ISO-2022-JP     [1b 28 4a]  to X0201
    //  <esc> $ @ ISO-2022-JP     [1b 24 40]  to X0208-78 twobyte
    //  <esc> $ B ISO-2022-JP     [1b 24 42]  to X0208-83 twobyte
    // Only the "$ ) C", "$ ) A", "$ ) G", "$ @" and "$ B" forms are tested
    // by the code below.
    if ((destatep->next_interesting_pair[OtherPair] >= 1) &&
        Iso2022Active(destatep)) {
      if ((destatep->interesting_pairs[OtherPair][0] == 0x1b) &&
          (destatep->interesting_pairs[OtherPair][1] == 0x24)) {
        int offset = destatep->interesting_offsets[OtherPair][0];
        const uint8* esc_src = destatep->initial_src + offset;
        // Need esc_src[2] and esc_src[3] to be below limit_src
        if ((destatep->initial_src + offset) < (destatep->limit_src - 3)) {
          if ((esc_src[2] == ')') && (esc_src[3] == 'C')) {
            Boost(destatep, F_ISO_2022_KR, kBoostOnePair);
            Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
            Whack(destatep, F_JIS, kBadPairWhack);
          } else if ((esc_src[2] == ')') && ((esc_src[3] == 'A') ||
                                             (esc_src[3] == 'G'))) {
            Boost(destatep, F_ISO_2022_CN, kBoostOnePair);
            Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
            Whack(destatep, F_JIS, kBadPairWhack);
          } else if ((esc_src[2] == '@') || (esc_src[2] == 'B')) {
            Boost(destatep, F_JIS, kBoostOnePair);
            Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
            Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
          }
        } else {
          // Incomplete escape sequence. Whack them all
          Whack(destatep, F_JIS, kBadPairWhack);
          Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
          Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
        }
      }
    }
    if (destatep->debug_data != NULL) {
      SetDetailsEncLabel(destatep, "pre-final");
    }
  }

  // PRUNE
  // ====================
  // Find current top two rankedencoding probabilities
  ReRank(destatep);

  if (prunereason == PRUNE_SLOWEND) {
    if (destatep->debug_data != NULL) {
      SetDetailsEncLabel(destatep, "slow-end");
    }
  }

  // Keep every rankedencoding with probability >= top_prob - prune_difference
  int prune_diff = destatep->prune_difference;
  // If the top encoding is BINARY, it might be overstated, and we might
  // therefore prune away the real encoding. Make the pruning delta
  // twice as big.
  if (destatep->top_rankedencoding == F_BINARY) {
    prune_diff *= 2;
  }
  int keep_prob = destatep->top_prob - prune_diff;

  // Tighten pruning difference (we start wide) for next time
  if (destatep->prune_difference > kFinalPruneDifference) {
    int decrement = kPruneDiffDecrement;
    // If only ASCII pairs, small tighten; if some non-ASCII, full tighten
    if (counted_otherpairs == 0) {
      decrement >>= 1;
    }
    destatep->prune_difference -= decrement;
  }

  // Prune the list of active encoding families
  // Survivors are compacted in place to the front of rankedencoding_list and
  // active_special is rebuilt from their kSpecialMask bits
  destatep->active_special = 0;
  int k = 0;
  for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
    bool keep = true;
    int rankedencoding = destatep->rankedencoding_list[j];

    // If count is too low, ditch it
    if (destatep->enc_prob[rankedencoding] < keep_prob) {
      keep = false;
    }

    // If at end of slow section, ditch any 7-bit with zero evidence so far
    if ((prunereason == PRUNE_SLOWEND) &&
        SevenBitEncoding(kMapToEncoding[rankedencoding]) &&
        (destatep->enc_prob[rankedencoding] <= 0) &&
        (rankedencoding != destatep->top_rankedencoding)) {
      keep = false;
    }

    // Keep it. This will always keep at least top_prob rankedencoding
    if (keep) {
      destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
      destatep->rankedencoding_list[k++] = rankedencoding;
    }
  }

  if (destatep->debug_data != NULL) {
    char buff[32];
    snprintf(buff, sizeof(buff), "%d prune", prune_diff / XLOG2);
    SetDetailsEncLabel(destatep, buff);
  }
  destatep->rankedencoding_list_len = k;

  // Force final result in some cases
  // Do any post-prune final adjustments
  if (prunereason == PRUNE_FINAL) {
    // If no high-byte pairs, result is ASCII7, BINARY, UTF7, 2022, or HZ
    if (destatep->next_interesting_pair[OtherPair] == 0) {
      if ((destatep->top_rankedencoding != F_BINARY) &&
          (destatep->top_rankedencoding != F_UTF7) &&
          (destatep->top_rankedencoding != F_ISO_2022_CN) &&
          (destatep->top_rankedencoding != F_ISO_2022_KR) &&
          (destatep->top_rankedencoding != F_JIS) &&
          (destatep->top_rankedencoding != F_HZ_GB_2312)) {
        destatep->top_rankedencoding = F_ASCII_7_bit;
        Boost(destatep, F_ASCII_7_bit, kBoostOnePair * 2);
      }
    }

    // If some 89 pairs, not ISO_8859_x and vice versa
    // byte32_count[4] counts pairs whose first byte has top three bits 100
    if (destatep->byte32_count[4] > 0) {
      // Some 8x/9x lead bytes seen: move ISO-8859-x winner to its CP125x peer
      switch (destatep->top_rankedencoding) {
      case F_ASCII:         // ISO-8859-1
        destatep->top_rankedencoding = F_CP1252;
        // Better: destatep->enc_prob[F_ASCII] <==> destatep->enc_prob[F_CP1252]
        Boost(destatep, F_CP1252, kBoostOnePair * 2);
        break;
      case F_Latin2:        // ISO-8859-2
        // Don't swap back; not superset
        //destatep->top_rankedencoding = F_CP1250;
        //Boost(destatep, F_CP1250, kBoostOnePair * 2);
        break;
      case F_Arabic:        // ISO-8859-6
        destatep->top_rankedencoding = F_CP1256;
        Boost(destatep, F_CP1256, kBoostOnePair * 2);
        break;
      case F_Greek:         // ISO-8859-7
        // Don't swap -- not proper superset
        // Capital Alpha tonos at 0xB6 in ISO-8859-7, 0xA2 in CP1253
        //destatep->top_rankedencoding = F_CP1253;
        //Boost(destatep, F_CP1253, kBoostOnePair * 2);
        break;
      case F_Hebrew:        // ISO-8859-8
        // Don't swap -- visual vs. logical
        //destatep->top_rankedencoding = F_CP1255;
        //Boost(destatep, F_CP1255, kBoostOnePair * 2);
        break;
      case F_Latin5:        // ISO-8859-9
        destatep->top_rankedencoding = F_CP1254;
        Boost(destatep, F_CP1254, kBoostOnePair * 2);
        break;
      case F_ISO_8859_11:   // ISO-8859-11
        destatep->top_rankedencoding = F_CP874;
        Boost(destatep, F_CP874, kBoostOnePair * 2);
        break;
      }
    } else {
      // No 8x/9x lead bytes seen: move CP125x winner to its ISO-8859-x peer
      switch (destatep->top_rankedencoding) {
      case F_CP1252:        // ISO-8859-1
        destatep->top_rankedencoding = F_ASCII;
        Boost(destatep, F_ASCII, kBoostOnePair * 2);
        break;
      case F_CP1250:        // ISO-8859-2
        // Don't swap back; not superset
        //destatep->top_rankedencoding = F_Latin2;
        //Boost(destatep, F_Latin2, kBoostOnePair * 2);
        break;
      case F_CP1256:        // ISO-8859-6
        // Don't swap back -- not proper superset
        //destatep->top_rankedencoding = F_Arabic;
        //Boost(destatep, F_Arabic, kBoostOnePair * 2);
        break;
      case F_CP1253:        // ISO-8859-7
        // Don't swap back -- not proper superset
        //destatep->top_rankedencoding = F_Greek;
        //Boost(destatep, F_Greek, kBoostOnePair * 2);
        break;
      case F_CP1255:        // ISO-8859-8
        // Don't swap back -- not proper superset
        //destatep->top_rankedencoding = F_Hebrew;
        //Boost(destatep, F_Hebrew, kBoostOnePair * 2);
        break;
      case F_CP1254:        // ISO-8859-9
        destatep->top_rankedencoding = F_Latin5;
        Boost(destatep, F_Latin5, kBoostOnePair * 2);
        break;
      case F_CP874:         // ISO-8859-11
        destatep->top_rankedencoding = F_ISO_8859_11;
        Boost(destatep, F_ISO_8859_11, kBoostOnePair * 2);
        break;
      }
    }

    if (destatep->debug_data != NULL) {
      char buff[32];
      snprintf(buff, sizeof(buff), "final %d",
               static_cast<int>(src - destatep->initial_src));
      SetDetailsEncLabel(destatep, buff);

      // Show winning encoding and its delta log base2 from 2nd-best
      // Divide delta by XLOG2 to get log base 2
      int delta = destatep->top_prob - destatep->second_top_prob;
      if (delta < (2 * XLOG2)) {
        // Small delta: show one decimal place
        delta /= XDECILOG2;
        snprintf(buff, sizeof(buff), "+%d.%d %s ",
                 delta / 10, delta % 10,
                 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
      } else if (delta < (50 * XLOG2)) {
        delta /= XLOG2;
        snprintf(buff, sizeof(buff), "+%d %s",
                 delta,
                 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
      } else {
        // Very large delta: show just the name
        snprintf(buff, sizeof(buff), "%s",
                 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
      }
      SetDetailsEncProbCopyOffset(destatep, destatep->top_rankedencoding, buff);
    }
  }

  // FINISH
  // ====================
  // Eventual encoding result is reliable if big difference in top two, or if
  // only Ascii7 ever encountered
  // Also reliable if exactly one OtherPair and its best encoding matches top
  destatep->reliable = false;
  if (destatep->next_interesting_pair[OtherPair] == 0) {
    // Only 7-bit ASCII
    destatep->reliable = true;
  }
  if ((destatep->top_prob - destatep->second_top_prob) >=
      FLAGS_ced_reliable_difference) {
    destatep->reliable = true;
  }
  if (destatep->next_interesting_pair[OtherPair] == 1) {
    uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
    uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
    int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
    if (best_enc == destatep->top_rankedencoding) {
      destatep->reliable = true;
    }
  }

  // If we pruned to one encoding, we are done
  if (destatep->rankedencoding_list_len == 1) {
    destatep->reliable = true;
    destatep->done = true;
  }

  // If we pruned to two or three encodings in the same *superset/subset
  // rankedencoding* and enough pairs, we are done. Else keep going
  // ("enough" is tested below as prune_count >= 3)
  if (destatep->rankedencoding_list_len == 2) {
    Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
    Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
    if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
      if (destatep->prune_count >= 3) {
        destatep->reliable = true;
        destatep->done = true;
      }
    }
  } else if (destatep->rankedencoding_list_len == 3) {
    Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
    Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
    Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
    Encoding base0 = kMapEncToBaseEncoding[enc0];
    Encoding base1 = kMapEncToBaseEncoding[enc1];
    Encoding base2 = kMapEncToBaseEncoding[enc2];

    if ((base0 == base1) && (base0 == base2)) {
      if (destatep->prune_count >= 3) {
        destatep->reliable = true;
        destatep->done = true;
      }
    }
  }
}
  3902. // Accumulate aligned byte-pair at src
  3903. // Occasionally, calc boost for some encodings and then prune the active list
  3904. // weightshift is used to give low weight some text, such as inside tags
  3905. // Returns true if pruning occurred
  3906. bool IncrementAndBoostPrune(const uint8* src,
  3907. int remaining_length,
  3908. DetectEncodingState* destatep,
  3909. int weightshift,
  3910. int exit_reason) {
  3911. destatep->last_pair = src;
  3912. // Pick up byte pair, or very last byte plus 0x20
  3913. uint8 byte1 = src[0];
  3914. uint8 byte2 = 0x20;
  3915. if (1 < remaining_length) {byte2 = src[1];}
  3916. // whatset=0 for Ascii + ~, 1 for all others; see kTestPrintableAsciiTildePlus
  3917. int whatset = exit_reason - 1;
  3918. int next_pair = destatep->next_interesting_pair[whatset];
  3919. if (next_pair > 16) {
  3920. // If not clear by 16 bigrams, stop accumulating + ~ 00
  3921. if (byte1 == '+') {return false;}
  3922. if (byte1 == '~') {return false;}
  3923. if (byte1 == 0x00) {return false;}
  3924. }
  3925. // Remember pair in appropriate list
  3926. if (next_pair >= kMaxPairs) {
  3927. // We have filled up our alloted space for interesting pairs with no
  3928. // decision. If ASCII pairs full, just skip until end of slow loop; if
  3929. // non-Ascii pairs full, force done
  3930. if (whatset == OtherPair) {
  3931. destatep->done = true;
  3932. }
  3933. } else {
  3934. int offset = static_cast<int>(src - destatep->initial_src);
  3935. destatep->interesting_pairs[whatset][next_pair * 2 + 0] = byte1;
  3936. destatep->interesting_pairs[whatset][next_pair * 2 + 1] = byte2;
  3937. destatep->interesting_offsets[whatset][next_pair] = offset;
  3938. destatep->interesting_weightshift[whatset][next_pair] = weightshift;
  3939. ++destatep->next_interesting_pair[whatset];
  3940. ++next_pair;
  3941. }
  3942. // Prune now and then , but always if forced to be done
  3943. if (destatep->done || ((next_pair & kPruneMask) == 0)) { // Prune every M
  3944. BoostPrune(src + 2, destatep, PRUNE_NORMAL); // src+2 first unscanned byte
  3945. // may be off end of input
  3946. return true;
  3947. }
  3948. return false;
  3949. }
  3950. void DumpSummary(DetectEncodingState* destatep, int whatset, int n) {
  3951. printf(" %sSummary[%2d]: ", kWhatSetName[whatset],
  3952. destatep->next_interesting_pair[whatset]);
  3953. int limit = minint(n, destatep->next_interesting_pair[whatset]);
  3954. for (int i = 0; i < limit; ++i) {
  3955. printf("%02x%02x ",
  3956. destatep->interesting_pairs[whatset][i * 2 + 0],
  3957. destatep->interesting_pairs[whatset][i * 2 + 1]);
  3958. if ((i & 7) == 7) {printf(" ");}
  3959. }
  3960. printf("\n");
  3961. }
  3962. void BeginDetail(DetectEncodingState* destatep) {
  3963. fprintf(stderr, "%d [", NUM_RANKEDENCODING);
  3964. for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
  3965. fprintf(stderr, "(%s)", MyRankedEncName(e));
  3966. if ((e % 10) == 9) {fprintf(stderr, "\n ");}
  3967. }
  3968. fprintf(stderr, "] size-detail\n");
  3969. destatep->next_detail_entry = 0;
  3970. }
  3971. // Single character to represent (printable ASCII) gap between bigrams
  3972. char DetailOffsetChar(int delta) {
  3973. if (delta == 0) {return ' ';}
  3974. if (delta <= 2) {return '=';}
  3975. if (delta <= 15) {return '_';}
  3976. if (delta <= 31) {return '+';}
  3977. {return ' ';}
  3978. }
  3979. void DumpDetail(DetectEncodingState* destatep) {
  3980. // Turn all counts into delta from previous entry
  3981. fprintf(stderr, "%d count-detail\n", destatep->next_detail_entry);
  3982. // Rewrite, recording deltas
  3983. for (int z = destatep->next_detail_entry - 1; z > 0; --z) {
  3984. destatep->debug_data[z].offset -= destatep->debug_data[z - 1].offset;
  3985. for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
  3986. destatep->debug_data[z].detail_enc_prob[e] -=
  3987. destatep->debug_data[z - 1].detail_enc_prob[e];
  3988. }
  3989. }
  3990. // Now print
  3991. for (int z = 0; z < destatep->next_detail_entry; ++z) {
  3992. // Highlight some entries ending in '!' with light red underbar
  3993. int len = destatep->debug_data[z].label.size();
  3994. if (destatep->debug_data[z].label[len - 1] == '!') {
  3995. fprintf(stderr, "1 0.9 0.9 do-flag\n");
  3996. }
  3997. fprintf(stderr, "(%c%s) %d [",
  3998. DetailOffsetChar(destatep->debug_data[z].offset),
  3999. destatep->debug_data[z].label.c_str(),
  4000. destatep->debug_data[z].best_enc);
  4001. for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
  4002. fprintf(stderr, "%d ", destatep->debug_data[z].detail_enc_prob[e]);
  4003. if ((e % 10) == 9) {fprintf(stderr, " ");}
  4004. }
  4005. fprintf(stderr, "] do-detail-e\n");
  4006. }
  4007. // Get ready for next time,if any
  4008. destatep->next_detail_entry = 0;
  4009. }
  4010. void PsRecurse(const char* buff) {
  4011. fprintf(stderr, "() end-detail (%s) start-detail\n\n", buff);
  4012. }
  4013. void DumpReliable(DetectEncodingState* destatep) {
  4014. printf("Not reliable: ");
  4015. // Find center of gravity of OtherPair list
  4016. int x_sum = 0;
  4017. int y_sum = 0;
  4018. int count = destatep->next_interesting_pair[OtherPair];
  4019. for (int i = 0; i < count; ++i) {
  4020. uint8 byte1 = destatep->interesting_pairs[OtherPair][i * 2 + 0];
  4021. uint8 byte2 = destatep->interesting_pairs[OtherPair][i * 2 + 1];
  4022. x_sum += byte2;
  4023. y_sum += byte1;
  4024. }
  4025. if (count == 0) {count = 1;} // adoid zdiv
  4026. int x_bar = x_sum / count;
  4027. int y_bar = y_sum / count;
  4028. printf("center %02X,%02X\n", x_bar, y_bar);
  4029. double closest_dist = 999.0;
  4030. int closest = 0;
  4031. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  4032. int rankedencoding = destatep->rankedencoding_list[j];
  4033. const UnigramEntry* ue = &unigram_table[rankedencoding];
  4034. printf(" %8s = %4d at %02x,%02x +/- %02X,%02X ",
  4035. MyEncodingName(kMapToEncoding[rankedencoding]),
  4036. destatep->enc_prob[rankedencoding],
  4037. ue->x_bar, ue->y_bar,
  4038. ue->x_stddev, ue->y_stddev);
  4039. double x_diff = x_bar - ue->x_bar;
  4040. double y_diff = y_bar - ue->y_bar;
  4041. double dist = sqrt((x_diff * x_diff) + (y_diff * y_diff));
  4042. printf("(%3.1f)\n", dist);
  4043. if (closest_dist > dist) {
  4044. closest_dist = dist;
  4045. closest = rankedencoding;
  4046. }
  4047. }
  4048. printf("Closest=%s (%3.1f)\n",
  4049. MyEncodingName(kMapToEncoding[closest]), closest_dist);
  4050. for (int i = 0; i < 8; ++i) {
  4051. // Demote by distance to CG and see if that helps, or just quit
  4052. }
  4053. }
  4054. // Scan short single lines quickly for all printable ASCII
  4055. // Return true if all bytes are in [20..7F], false otherwise
  4056. bool QuickPrintableAsciiScan(const char* text, int text_length) {
  4057. const uint8* src = reinterpret_cast<const uint8*>(text);
  4058. const uint8* srclimit = src + text_length;
  4059. const uint8* srclimit8 = srclimit - 7;
  4060. while (src < srclimit8) {
  4061. // Exits on any byte outside [0x20..0x7E] range (HT LF CR exit)
  4062. uint8 mask = 0;
  4063. for (int i = 0; i < 8; ++i) mask |= (src[i]-0x20)|(src[i]+0x01);
  4064. if ((mask & 0x80) != 0) break;
  4065. src += 8;
  4066. }
  4067. while (src < srclimit) {
  4068. uint8 uc = *src++;
  4069. if (kIsPrintableAscii[uc] == 0) {return false;}
  4070. }
  4071. return true;
  4072. }
  4073. static const int kMaxScanBack = 192;
  4074. // Return true if text is inside a tag or JS comment
  4075. bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
  4076. const uint8* srcbacklimit = src - kMaxScanBack;
  4077. if (srcbacklimit < isrc) {
  4078. srcbacklimit = isrc;
  4079. }
  4080. const uint8* ss = src - 1;
  4081. while (srcbacklimit <= ss) {
  4082. uint8 c = *ss--;
  4083. if ((c & ~0x02) == '<') {
  4084. // We found preceding < 3C or > 3E nearby
  4085. // Even cheaper: if inside a tag, we don't care what tag; return true
  4086. if (c == '<') {
  4087. return true;
  4088. }
  4089. // See if we are just after <title>...
  4090. if ((c == '>') && (isrc <= (ss - 5)) &&
  4091. (ss[-5] == '<') &&
  4092. ((ss[-4] | 0x20) == 't') &&
  4093. ((ss[-3] | 0x20) == 'i') &&
  4094. ((ss[-2] | 0x20) == 't') &&
  4095. ((ss[-1] | 0x20) == 'l') &&
  4096. ((ss[-0] | 0x20) == 'e')) {
  4097. return true;
  4098. }
  4099. // See if we are just after <SCRIPT language=javascript>...
  4100. if ((c == '>') && (isrc <= (ss - 5)) &&
  4101. (ss[-5] == 's') &&
  4102. ((ss[-4] | 0x20) == 'c') &&
  4103. ((ss[-3] | 0x20) == 'r') &&
  4104. ((ss[-2] | 0x20) == 'i') &&
  4105. ((ss[-1] | 0x20) == 'p') &&
  4106. ((ss[-0] | 0x20) == 't')) {
  4107. return true;
  4108. }
  4109. // Not in a tag
  4110. return false;
  4111. // See if we are just after JavaScript comment /* ...
  4112. } else if (c == '/') {
  4113. if (((ss + 2) < srclimit) && (ss[2] == '*')) {
  4114. // We backscanned to /*
  4115. return true;
  4116. }
  4117. }
  4118. }
  4119. return false;
  4120. }
  4121. const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
  4122. const uint8* ss = src + 1;
  4123. while (ss <= srclimit) {
  4124. uint8 c = *ss++;
  4125. if ((c == '<') || (c == '>')) {
  4126. return ss;
  4127. }
  4128. }
  4129. return src + 2; // Always make progress, Otherwise we get an infinite loop
  4130. }
  4131. // Take a watch string and map to a ranked encoding. If no match, return -1
  4132. int LookupWatchEnc(const string& watch_str) {
  4133. int watchval = -1;
  4134. // Mixed encoding maps to enc=UTF8UTF8
  4135. if (watch_str == "UTF8UTF8") {
  4136. watchval = F_UTF8UTF8;
  4137. } else {
  4138. Encoding enc;
  4139. if (EncodingFromName(watch_str.c_str(), &enc)) {
  4140. watchval = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
  4141. }
  4142. }
  4143. return watchval;
  4144. }
  4145. // Return true if enc and enc2 are equal or one is a subset of the other
  4146. // or either is UNKNOWN
  4147. // also UTF8UTF8 is compatible with both Latin1 and UTF8
  4148. bool CompatibleEnc(Encoding enc, Encoding enc2) {
  4149. if (enc < 0) {return false;}
  4150. if (NUM_ENCODINGS <= enc) {return false;}
  4151. if (enc2 < 0) {return false;}
  4152. if (NUM_ENCODINGS <= enc2) {return false;}
  4153. if (enc == enc2) {return true;}
  4154. if (kMapEncToBaseEncoding[enc] == kMapEncToBaseEncoding[enc2]) {return true;}
  4155. if (enc == ASCII_7BIT) {return true;}
  4156. if (enc2 == ASCII_7BIT) {return true;}
  4157. if (enc == UNKNOWN_ENCODING) {return true;}
  4158. if (enc2 == UNKNOWN_ENCODING) {return true;}
  4159. if (enc == UTF8UTF8) {
  4160. if (enc2 == UTF8) {return true;}
  4161. if (kMapEncToBaseEncoding[enc2] == ISO_8859_1) {return true;}
  4162. }
  4163. if (enc2 == UTF8UTF8) {
  4164. if (enc == UTF8) {return true;}
  4165. if (kMapEncToBaseEncoding[enc] == ISO_8859_1) {return true;}
  4166. }
  4167. return false;
  4168. }
  4169. // Return superset of enc and enc2, which must be compatible
  4170. Encoding SupersetEnc(Encoding enc, Encoding enc2) {
  4171. //printf(" SupersetEnc (%s, ", MyEncodingName(enc)); // TEMP
  4172. //printf("%s) ", MyEncodingName(enc2));
  4173. //printf("= %s\n",
  4174. // MyEncodingName(kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2] ?
  4175. // enc :enc2));
  4176. if (kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]) {
  4177. return enc;
  4178. }
  4179. return enc2;
  4180. }
  4181. // If unreliable, try rescoring to separate some encodings
  4182. Encoding Rescore(Encoding enc, const uint8* isrc,
  4183. const uint8* srctextlimit, DetectEncodingState* destatep) {
  4184. if (FLAGS_counts) {++rescore_used;}
  4185. Encoding new_enc = enc;
  4186. bool rescore_change = false;
  4187. int count = destatep->next_interesting_pair[OtherPair];
  4188. int text_length = srctextlimit - isrc;
  4189. for (int i = 0; i < count; ++i) {
  4190. int bigram_offset = destatep->interesting_offsets[OtherPair][i];
  4191. uint8 byte0 = (0 < bigram_offset) ?
  4192. isrc[bigram_offset - 1] : 0x20;
  4193. uint8 byte1 = isrc[bigram_offset + 0]; // Known to have high bit on
  4194. uint8 byte2 = ((bigram_offset + 1) < text_length) ?
  4195. isrc[bigram_offset + 1] : 0x20;
  4196. uint8 byte3 = ((bigram_offset + 2) < text_length) ?
  4197. isrc[bigram_offset + 2] : 0x20;
  4198. int high_hash = ((byte0 & 0xc0) >> 0) |
  4199. ((byte1 & 0xc0) >> 1) |
  4200. ((byte2 & 0xc0) >> 4) |
  4201. ((byte3 & 0xc0) >> 6); // 00112233
  4202. // Boost HighAccent encodings for Ascii bit patterns
  4203. // 0x1x 0x0x
  4204. // 1010 1010
  4205. // 0010 0000
  4206. //
  4207. if ((high_hash & 0xaa) == 0x20) {
  4208. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  4209. int rankedencoding = destatep->rankedencoding_list[j];
  4210. if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
  4211. // TODO: also want to boost Shift-JIS here if byte1 is Ax..Dx
  4212. // TEMP
  4213. //printf(" Rescore[%02x] %s +%d\n",
  4214. // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost);
  4215. Boost(destatep, rankedencoding, kGentlePairBoost);
  4216. rescore_change = true;
  4217. }
  4218. }
  4219. }
  4220. // Whack HighAccent encodings for high bit patterns
  4221. // 1x1x 1x1x
  4222. // 1010 1010
  4223. // 1010 1010
  4224. //
  4225. if ((high_hash & 0xaa) == 0xaa) {
  4226. for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
  4227. int rankedencoding = destatep->rankedencoding_list[j];
  4228. if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
  4229. // TEMP
  4230. //printf(" Rescore[%02x] %s -%d\n",
  4231. // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost);
  4232. Whack(destatep, rankedencoding, kGentlePairBoost);
  4233. rescore_change = true;
  4234. }
  4235. }
  4236. }
  4237. }
  4238. if (rescore_change) {
  4239. ReRank(destatep);
  4240. new_enc = kMapToEncoding[destatep->top_rankedencoding];
  4241. if (destatep->debug_data != NULL) {
  4242. char buff[32];
  4243. snprintf(buff, sizeof(buff), "=Rescore %s", MyEncodingName(new_enc));
  4244. SetDetailsEncProb(destatep,
  4245. 0,
  4246. CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
  4247. buff);
  4248. //// DumpDetail(destatep);
  4249. }
  4250. SimplePrune(destatep, kFinalPruneDifference);
  4251. CalcReliable(destatep);
  4252. }
  4253. //if (new_enc != enc) {
  4254. // // TEMP
  4255. // printf(" Rescore new top encoding = %s\n",
  4256. // MyRankedEncName(destatep->top_rankedencoding));
  4257. //}
  4258. return new_enc;
  4259. }
  4260. // Given an encoding, add its corresponding ranked encoding to the set
  4261. void AddToSet(Encoding enc, int* list_len, int* list) {
  4262. // TEMP print
  4263. int item = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
  4264. for (int i = 0; i < *list_len; ++i) {
  4265. if (list[i] == item) {
  4266. return; // Already in the set; don't add again
  4267. }
  4268. }
  4269. list[(*list_len)++] = item;
  4270. }
  4271. static const int kMinRobustBigramCount = 1000;
  4272. static const int kMinKBToRobustScan = 64;
  4273. static const int kMaxKBToRobustScan = 256;
  4274. // Scan the first 64K or so, just doing raw bigram increments on given
  4275. // probability list.
  4276. // No fancy duplicate filtering or anything else here.
  4277. // Returns number of bigrams counted
  4278. int RobustScan(const char* text,
  4279. int text_length,
  4280. int robust_renc_list_len,
  4281. int* robust_renc_list,
  4282. int* robust_renc_probs) {
  4283. if (FLAGS_counts) {++robust_used;}
  4284. // Zero all the result probabilities
  4285. for (int i = 0; i < robust_renc_list_len; ++i) {
  4286. robust_renc_probs[i] = 0;
  4287. }
  4288. int max_fast_len = minint(text_length, (kMaxKBToRobustScan << 10));
  4289. const uint8* isrc = reinterpret_cast<const uint8*>(text);
  4290. const uint8* src = isrc;
  4291. const uint8* srclimitfast2 = isrc + max_fast_len - 1;
  4292. const uint8* srclimitfast4 = isrc + max_fast_len - 3;
  4293. int min_fast_len = minint(text_length, (kMinKBToRobustScan << 10));
  4294. const uint8* srclimitmin = isrc + min_fast_len - 1;
  4295. int bigram_count = 0;
  4296. if (FLAGS_enc_detect_source) {
  4297. PsSourceInit(kPsSourceWidth);
  4298. fprintf(stderr, "(RobustScan) do-src\n");
  4299. }
  4300. // Sum over a big chunk of the input
  4301. // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
  4302. //====================================
  4303. while (src < srclimitfast2) {
  4304. // Skip to next interesting bigram
  4305. while (src < srclimitfast4) {
  4306. if (((src[0] | src[1] | src[2] | src[3]) & 0x80) != 0) break;
  4307. src += 4;
  4308. }
  4309. while (src < srclimitfast2) {
  4310. if ((src[0] & 0x80) != 0) break;
  4311. src++;
  4312. }
  4313. if (src < srclimitfast2) {
  4314. // We found a bigram with high bit on
  4315. // Next 5 lines commented out so we don't show all the source.
  4316. //const uint8* srctextlimit = isrc + text_length;
  4317. //if (FLAGS_enc_detect_source) {
  4318. // PsSource(src, isrc, srctextlimit);
  4319. // PsMark(src, 2, isrc, 0);
  4320. //}
  4321. uint8 byte1 = src[0];
  4322. uint8 byte2 = src[1];
  4323. uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
  4324. uint8 byte1f = byte1;
  4325. // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
  4326. byte1f ^= (byte2 & 0x80);
  4327. // The real increments
  4328. for (int j = 0; j < robust_renc_list_len; ++j) {
  4329. int rankedencoding = robust_renc_list[j];
  4330. const UnigramEntry* ue = &unigram_table[rankedencoding];
  4331. int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
  4332. if ((ue->b12[byte1x2x] & 0x01) != 0) {
  4333. // Use a more-precise table
  4334. int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
  4335. int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2
  4336. DCHECK(ue->hires[hiressub] != NULL);
  4337. incr += ue->hires[hiressub][byte32x32];
  4338. } else {
  4339. // Default final offset
  4340. incr += ue->so;
  4341. }
  4342. robust_renc_probs[j] += incr;
  4343. }
  4344. src += 2; // Continue after this bigram
  4345. ++bigram_count;
  4346. // Stop after 1000 bigrams reached, if at least 64KB scanned
  4347. if ((bigram_count > kMinRobustBigramCount) && (src > srclimitmin)) {
  4348. break;
  4349. }
  4350. }
  4351. }
  4352. if (FLAGS_enc_detect_source) {
  4353. fprintf(stderr, "( bigram_count = %d) do-src\n", bigram_count);
  4354. if (bigram_count == 0) {bigram_count = 1;} // zdiv
  4355. for (int i = 0; i < robust_renc_list_len; ++i) {
  4356. fprintf(stderr, "( enc[%-12.12s] = %7d (avg %d)) do-src\n",
  4357. MyRankedEncName(robust_renc_list[i]), robust_renc_probs[i],
  4358. robust_renc_probs[i] / bigram_count);
  4359. }
  4360. PsSourceFinish();
  4361. }
  4362. return bigram_count;
  4363. }
  4364. // If unreliable, rescan middle of document to see if we can get a better
  4365. // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
  4366. // since the detector takes as much as 96 bytes of bigrams to decide.
  4367. Encoding Rescan(Encoding enc,
  4368. const uint8* isrc,
  4369. const uint8* src,
  4370. const uint8* srctextlimit,
  4371. const char* url_hint,
  4372. const char* http_charset_hint,
  4373. const char* meta_charset_hint,
  4374. const int encoding_hint,
  4375. const Language language_hint,
  4376. const CompactEncDet::TextCorpusType corpus_type,
  4377. bool ignore_7bit_mail_encodings,
  4378. DetectEncodingState* destatep) {
  4379. bool enc_is_reliable = destatep->reliable;
  4380. Encoding new_enc = enc;
  4381. Encoding second_best_enc =
  4382. kMapToEncoding[destatep->second_top_rankedencoding];
  4383. if (FLAGS_counts) {++rescan_used;}
  4384. int scanned_bytes = src - isrc;
  4385. int unscanned_bytes = srctextlimit - src;
  4386. int text_length = srctextlimit - isrc;
  4387. bool empty_rescan = true;
  4388. // See if enough bytes left to bother doing rescan
  4389. if (kMinRescanLength < unscanned_bytes) {
  4390. const char* text = reinterpret_cast<const char*>(isrc);
  4391. Encoding one_hint = destatep->http_hint;
  4392. if ((one_hint == UNKNOWN_ENCODING) &&
  4393. (destatep->meta_hint != UNKNOWN_ENCODING)) {
  4394. one_hint = destatep->meta_hint;
  4395. }
  4396. if ((one_hint == UNKNOWN_ENCODING) &&
  4397. (destatep->bom_hint != UNKNOWN_ENCODING)) {
  4398. one_hint = destatep->bom_hint;
  4399. }
  4400. // Go to an even offset to keep UTF-16 in synch
  4401. int middle_offset = (scanned_bytes + (unscanned_bytes / 2)) & ~1;
  4402. CHECK(middle_offset <= text_length);
  4403. // Look back a bit for a low byte to synchronize, else hope for the best.
  4404. const uint8* srcbacklimit = isrc + middle_offset - kMaxScanBack;
  4405. if (srcbacklimit < src) {
  4406. srcbacklimit = src;
  4407. }
  4408. const uint8* ss = isrc + middle_offset - 1;
  4409. while (srcbacklimit <= ss) {
  4410. if ((*ss & 0x80) == 0) {break;}
  4411. --ss;
  4412. }
  4413. // Leave middle offset unchanged unless we found a low byte
  4414. if (srcbacklimit <= ss) {
  4415. // Align to low byte or high byte just after it, whichever is even
  4416. middle_offset = (ss - isrc + 1) & ~1; // Even to keep UTF-16 in sync
  4417. }
  4418. CHECK(middle_offset <= text_length);
  4419. if (destatep->debug_data != NULL) {
  4420. SetDetailsEncLabel(destatep, ">> Rescan");
  4421. // Print the current chart before recursive call
  4422. DumpDetail(destatep);
  4423. char buff[32];
  4424. snprintf(buff, sizeof(buff), ">> Rescan[%d..%d]",
  4425. middle_offset, text_length);
  4426. PsRecurse(buff);
  4427. }
  4428. int mid_bytes_consumed;
  4429. bool mid_is_reliable;
  4430. Encoding mid_second_best_enc;
  4431. CEDInternalFlags newflags = static_cast<CEDInternalFlags>(
  4432. kCEDRescanning + kCEDForceTags);
  4433. // Recursive call for rescan of half of remaining
  4434. Encoding mid_enc = InternalDetectEncoding(
  4435. newflags,
  4436. text + middle_offset,
  4437. text_length - middle_offset,
  4438. url_hint,
  4439. http_charset_hint,
  4440. meta_charset_hint,
  4441. encoding_hint,
  4442. language_hint, // User interface lang
  4443. corpus_type,
  4444. ignore_7bit_mail_encodings,
  4445. &mid_bytes_consumed,
  4446. &mid_is_reliable,
  4447. &mid_second_best_enc);
  4448. destatep->reliable = mid_is_reliable;
  4449. empty_rescan = (mid_enc == ASCII_7BIT);
  4450. // Not the right decision if, e.g. enc=Greek, mid=ASCII7, one=KSC
  4451. // hence the !empty_rescan term
  4452. if (!empty_rescan && CompatibleEnc(one_hint, mid_enc)) {
  4453. // Encoding we just found is compatible with the
  4454. // single hint (if any); return superset
  4455. new_enc = SupersetEnc(one_hint, mid_enc);
  4456. }
  4457. // If original and mid are compatible, and both reliable,
  4458. // return new_enc = SupersetEnc(enc, mid_enc)
  4459. //
  4460. // This avoids too much weight on a bogus hint causing a RobustScan
  4461. // that gets the wrong answer
  4462. if (!empty_rescan && mid_is_reliable && enc_is_reliable &&
  4463. CompatibleEnc(enc, mid_enc)) {
  4464. new_enc = SupersetEnc(enc, mid_enc);
  4465. return new_enc;
  4466. }
  4467. // if mid unreliable, robustscan
  4468. // if mid empty, robustscan
  4469. // if original and mid not compatible, robustscan
  4470. // if mid and one_hint not compatible, robustscan
  4471. // If we found conflicting data, drop back and do a robust scan of a big
  4472. // chunk of the input over a set of candidate encodings
  4473. //
  4474. if (!mid_is_reliable ||
  4475. empty_rescan ||
  4476. !CompatibleEnc(enc, mid_enc) ||
  4477. !CompatibleEnc(one_hint, mid_enc)) {
  4478. int robust_renc_list_len; // Number of active encodings
  4479. int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings
  4480. int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs
  4481. robust_renc_list_len = 0;
  4482. AddToSet(enc, &robust_renc_list_len, robust_renc_list);
  4483. AddToSet(second_best_enc, &robust_renc_list_len, robust_renc_list);
  4484. AddToSet(mid_enc, &robust_renc_list_len, robust_renc_list);
  4485. AddToSet(mid_second_best_enc, &robust_renc_list_len, robust_renc_list);
  4486. if (destatep->http_hint != UNKNOWN_ENCODING) {
  4487. AddToSet(destatep->http_hint, &robust_renc_list_len, robust_renc_list);
  4488. }
  4489. if (destatep->meta_hint != UNKNOWN_ENCODING) {
  4490. AddToSet(destatep->meta_hint, &robust_renc_list_len, robust_renc_list);
  4491. }
  4492. if (destatep->bom_hint != UNKNOWN_ENCODING) {
  4493. AddToSet(destatep->bom_hint, &robust_renc_list_len, robust_renc_list);
  4494. }
  4495. if (destatep->tld_hint != UNKNOWN_ENCODING) {
  4496. AddToSet(destatep->tld_hint, &robust_renc_list_len, robust_renc_list);
  4497. }
  4498. // Separate simple scan
  4499. // =====================
  4500. if (destatep->debug_data != NULL) {
  4501. SetDetailsEncLabel(destatep, ">> RobustScan");
  4502. // Print the current chart before recursive call
  4503. DumpDetail(destatep);
  4504. char buff[32];
  4505. snprintf(buff, sizeof(buff), ">> RobustScan[0..%d]", text_length);
  4506. PsRecurse(buff);
  4507. }
  4508. int bigram_count = RobustScan(text, text_length,
  4509. robust_renc_list_len, robust_renc_list, robust_renc_probs);
  4510. // Default to new_enc and update if something better was found
  4511. int best_prob = -1;
  4512. // TEMP print
  4513. for (int i = 0; i < robust_renc_list_len; ++i) {
  4514. if (best_prob < robust_renc_probs[i]) {
  4515. best_prob = robust_renc_probs[i];
  4516. new_enc = kMapToEncoding[robust_renc_list[i]];
  4517. }
  4518. }
  4519. if (destatep->debug_data != NULL) {
  4520. char buff[32];
  4521. snprintf(buff, sizeof(buff), "=Robust[%d] %s",
  4522. bigram_count, MyEncodingName(new_enc));
  4523. SetDetailsEncProb(destatep,
  4524. 0,
  4525. CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
  4526. buff);
  4527. }
  4528. }
  4529. } // End if enough bytes
  4530. return new_enc;
  4531. }
  4532. // With no hints at all, and perhaps on rescan, we relax our pickiness
  4533. // and go ahead and accept the top multibyte encodings, even though
  4534. // strictly their web pages should have declared an explicit encoding to
  4535. // avoid the HTML standard's default ISO-8859-1.
  4536. bool NoHintsCloseEnoughCompatible(Encoding top_enc) {
  4537. // First test accepts degenerate cases plus UTF8 and UTF8UTF8
  4538. if (CompatibleEnc(UTF8, top_enc)) {return true;}
  4539. // The rest look for exact match of base encoding
  4540. Encoding base_enc = kMapEncToBaseEncoding[top_enc];
  4541. if (base_enc == JAPANESE_EUC_JP) {return true;}
  4542. if (base_enc == JAPANESE_SHIFT_JIS) {return true;}
  4543. if (base_enc == CHINESE_BIG5) {return true;}
  4544. if (base_enc == CHINESE_GB) {return true;}
  4545. if (base_enc == KOREAN_EUC_KR) {return true;}
  4546. return false;
  4547. }
  4548. // Scan raw bytes and detect most likely encoding
  4549. // Design goals:
  4550. // Skip over big initial stretches of seven-bit ASCII bytes very quickly
  4551. // Thread safe
  4552. // Works equally well on
  4553. // 50-byte queries,
  4554. // 5000-byte email and
  4555. // 50000-byte web pages
  4556. // Length 0 input returns ISO_8859_1 (ASCII) encoding
  4557. // Setting ignore_7bit_mail_encodings effectively turns off detection of
  4558. // UTF-7, HZ, and ISO-2022-xx
  4559. Encoding InternalDetectEncoding(
  4560. CEDInternalFlags flags, const char* text, int text_length,
  4561. const char* url_hint, const char* http_charset_hint,
  4562. const char* meta_charset_hint, const int encoding_hint,
  4563. const Language language_hint, // User interface lang
  4564. const CompactEncDet::TextCorpusType corpus_type,
  4565. bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
  4566. Encoding* second_best_enc) {
  4567. *bytes_consumed = 0;
  4568. *is_reliable = false;
  4569. *second_best_enc = ASCII_7BIT;
  4570. if (text_length == 0) {
  4571. // Follow the spec. Text might be NULL.
  4572. *is_reliable = true;
  4573. return ISO_8859_1;
  4574. }
  4575. // For very short (20-50 byte) input strings that are highly likely to be
  4576. // all printable ASCII, our startup overhead might dominate. We have to do the
  4577. // full detection if the ISO-2022-xx, HZ, or UTF-7 encodings are possible.
  4578. // Otherwise, we can do a quick scan for printable ASCII.
  4579. if ((text_length <= 500) && ignore_7bit_mail_encodings &&
  4580. QuickPrintableAsciiScan(text, text_length)) {
  4581. *is_reliable = true;
  4582. return ASCII_7BIT;
  4583. }
  4584. // Go for the full boat detection
  4585. DetectEncodingState destate;
  4586. InitDetectEncodingState(&destate);
  4587. std::unique_ptr<DetailEntry[]> scoped_debug_data;
  4588. if (FLAGS_enc_detect_detail) {
  4589. // Allocate max 10 details per bigram
  4590. scoped_debug_data.reset(new DetailEntry[kMaxPairs * 10]);
  4591. destate.debug_data = scoped_debug_data.get();
  4592. // NOTE: destate and scoped_debug_data have exactly the same scope
  4593. // All other FLAGS_enc_detect_detail tests use destate.debug_data != NULL
  4594. }
  4595. // Get text length limits
  4596. // Typically, we scan the first 16KB looking for all encodings, then
  4597. // scan the rest (up to 256KB) a bit faster by no longer looking for
  4598. // interesting bytes below 0x80. This allows us to skip over runs of
  4599. // 7-bit-ASCII much more quickly.
  4600. int slow_len = minint(text_length, (FLAGS_enc_detect_slow_max_kb << 10));
  4601. int fast_len = minint(text_length, (FLAGS_enc_detect_fast_max_kb << 10));
  4602. // Initialize pointers.
  4603. // In general, we do not look at last 3 bytes of input in the fast scan
  4604. // We do, however want to look at the last byte or so in the slow scan,
  4605. // especilly in the case of a very short text whose only interesting
  4606. // information is a 3-byte UTF-8 character in the last three bytes.
  4607. // If necessary, we fake a last bigram with 0x20 space as a pad byte.
  4608. const uint8* isrc = reinterpret_cast<const uint8*>(text);
  4609. const uint8* src = isrc;
  4610. const uint8* srctextlimit = isrc + text_length;
  4611. const uint8* srclimitslow2 = isrc + slow_len - 1;
  4612. const uint8* srclimitfast2 = isrc + fast_len - 1;
  4613. const uint8* srclimitfast4 = isrc + fast_len - 3;
  4614. if (srclimitslow2 > srclimitfast2) {
  4615. srclimitslow2 = srclimitfast2;
  4616. }
  4617. destate.initial_src = isrc;
  4618. destate.limit_src = srclimitfast2 + 1; // May include last byte
  4619. destate.prior_src = isrc;
  4620. destate.last_pair = isrc - 2;
  4621. const char* scan_table = kTestPrintableAsciiTildePlus;
  4622. if (ignore_7bit_mail_encodings) {
  4623. // Caller wants to ignore UTF-7, HZ, ISO-2022-xx
  4624. // Don't stop on + (for UTF-7), nor on ~ (for HZ)
  4625. scan_table = kTestPrintableAscii;
  4626. }
  4627. int exit_reason = 0;
  4628. if (destate.debug_data != NULL) {
  4629. BeginDetail(&destate);
  4630. // Take any incoming watch encoding name and backmap to the corresponding
  4631. // ranked enum value
  4632. watch1_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch1);
  4633. if (watch1_rankedenc >= 0) {
  4634. fprintf(stderr, "/track-me %d def\n", watch1_rankedenc);
  4635. }
  4636. watch2_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch2);
  4637. if (watch2_rankedenc >= 0) {
  4638. fprintf(stderr, "/track-me2 %d def\n", watch2_rankedenc);
  4639. }
  4640. fprintf(stderr, "%% kDerateHintsBelow = %d\n", kDerateHintsBelow);
  4641. }
  4642. if (FLAGS_enc_detect_source) {
  4643. PsSourceInit(kPsSourceWidth);
  4644. PsSource(src, isrc, srctextlimit);
  4645. PsMark(src, 4, isrc, 0);
  4646. }
  4647. // Apply hints, if any, to probabilities
  4648. // NOTE: Encoding probabilites are all zero at this point
  4649. ApplyHints(url_hint,
  4650. http_charset_hint,
  4651. meta_charset_hint,
  4652. encoding_hint,
  4653. language_hint,
  4654. corpus_type,
  4655. &destate);
  4656. // NOTE: probabilities up to this point are subject to derating for
  4657. // small numbers of bigrams.
  4658. // Probability changes after this point are not derated.
  4659. // Do first 4 bytes to pick off strong markers
  4660. InitialBytesBoost(isrc, text_length, &destate);
  4661. bool ignored_some_tag_text = false;
  4662. int tag_text_bigram_count = 0;
  4663. // Slower loop, approx 500 MB/sec (2.8 GHz P4)
  4664. // ASSERT(srclimitslow2 <= srclimitfast2);
  4665. //====================================
  4666. DoMoreSlowLoop:
  4667. while (src < srclimitslow2) {
  4668. // Skip to next interesting byte (this is the slower part)
  4669. while (src < srclimitslow2) {
  4670. uint8 uc = *src++;
  4671. if (scan_table[uc] != 0) {exit_reason = scan_table[uc]; src--; break;}
  4672. }
  4673. if (src < srclimitslow2) {
  4674. if (FLAGS_enc_detect_source) {
  4675. PsSource(src, isrc, srctextlimit); // don't mark yet
  4676. }
  4677. int weightshift = 0;
  4678. // In the first 16KB, derate new text run inside <title>...</title> and
  4679. // inside <!-- ... -->
  4680. if (////((destate.last_pair + 6) <= src) && // if beyond last one
  4681. ////(tag_text_bigram_count < kMaxBigramsTagTitleText) &&
  4682. (corpus_type == CompactEncDet::WEB_CORPUS) && // and web page
  4683. !CEDFlagForceTags(flags)) { // and OK to skip
  4684. ////if (TextInsideTag(destate.last_pair + 2, src, srclimitslow2)) {
  4685. if (TextInsideTag(isrc, src, srclimitslow2)) {
  4686. if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
  4687. ignored_some_tag_text = true;
  4688. src = SkipToTagEnd(src, srclimitslow2);
  4689. continue;
  4690. } else {
  4691. weightshift = kWeightshiftForTagTitleText;
  4692. ++tag_text_bigram_count;
  4693. }
  4694. }
  4695. }
  4696. if (FLAGS_enc_detect_source) {
  4697. PsMark(src, 2, isrc, weightshift);
  4698. }
  4699. // Saves byte pair and offset
  4700. bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
  4701. &destate, weightshift, exit_reason);
  4702. // Advance; if inside tag, advance to end of tag
  4703. if (weightshift == 0) {
  4704. src += exit_reason; // 1 Ascii, 2 other
  4705. } else {
  4706. src += exit_reason; // 1 Ascii, 2 other
  4707. //// src = SkipToTagEnd(src, srclimitslow2);
  4708. }
  4709. if (pruned) {
  4710. // Scoring and active encodings have been updated
  4711. if (destate.done) {break;}
  4712. // Check if all the reasons for the slow loop have been pruned
  4713. // If so, go to fast loop
  4714. if (!SevenBitActive(&destate)) {break;}
  4715. }
  4716. }
  4717. }
  4718. //====================================
  4719. // We reached the end of a slow scan, possibly because no more SevenBitActive,
  4720. // or possibly are at end of source.
  4721. // If we are exactly at the end of the source, make sure we look at the very
  4722. // last byte.
  4723. bool very_last_byte_incremented = false;
  4724. if (src == (srctextlimit - 1)) {
  4725. exit_reason = scan_table[*src];
  4726. if (exit_reason != 0) {
  4727. // The very last byte is an interesting byte
  4728. // Saves byte pair and offset
  4729. //printf("Interesting very last slow byte = 0x%02x\n", *src);
  4730. IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
  4731. very_last_byte_incremented = true;
  4732. }
  4733. }
  4734. if (FLAGS_enc_detect_source) {
  4735. PsSource(src, isrc, srctextlimit);
  4736. PsMark(src, 2, isrc, 0);
  4737. }
  4738. // Force a pruning based on whatever we have
  4739. // Delete the seven-bit encodings if there is no evidence of them so far
  4740. BoostPrune(src, &destate, PRUNE_SLOWEND);
  4741. if (!destate.done) {
  4742. // If not clear yet on 7-bit-encodings and more bytes, do more slow
  4743. if (SevenBitActive(&destate) && (src < srclimitfast2)) {
  4744. // Increment limit by another xxxK
  4745. slow_len += (FLAGS_enc_detect_slow_max_kb << 10);
  4746. srclimitslow2 = isrc + slow_len - 1;
  4747. if (srclimitslow2 > srclimitfast2) {
  4748. srclimitslow2 = srclimitfast2;
  4749. }
  4750. if (!UTF7OrHzActive(&destate)) {
  4751. // We can switch to table that does not stop on + ~
  4752. scan_table = kTestPrintableAscii;
  4753. }
  4754. goto DoMoreSlowLoop;
  4755. }
  4756. exit_reason = 2;
  4757. // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
  4758. //====================================
  4759. while (src < srclimitfast2) {
  4760. // Skip to next interesting byte (this is the faster part)
  4761. while (src < srclimitfast4) {
  4762. if (((src[0] | src[1] | src[2] | src[3]) & 0x80) != 0) break;
  4763. src += 4;
  4764. }
  4765. while (src < srclimitfast2) {
  4766. if ((src[0] & 0x80) != 0) break;
  4767. src++;
  4768. }
  4769. if (src < srclimitfast2) {
  4770. if (FLAGS_enc_detect_source) {
  4771. PsSource(src, isrc, srctextlimit);
  4772. PsMark(src, 2, isrc, 0);
  4773. }
  4774. // saves byte pair and offset
  4775. bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
  4776. &destate, 0, exit_reason);
  4777. src += exit_reason; // 1 Ascii, 2 other
  4778. if (pruned) {
  4779. // Scoring and active encodings have been updated
  4780. if (destate.done) {break;}
  4781. }
  4782. }
  4783. }
  4784. //====================================
  4785. // We reached the end of fast scan
  4786. // If we are exactly at the end of the source, make sure we look at the very
  4787. // last byte.
  4788. if (src == (srctextlimit - 1) && !very_last_byte_incremented) {
  4789. exit_reason = scan_table[*src];
  4790. if (exit_reason != 0) {
  4791. // The very last byte is an interesting byte
  4792. // Saves byte pair and offset
  4793. //printf("Interesting very last fast byte = 0x%02x\n", *src);
  4794. IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
  4795. very_last_byte_incremented = true;
  4796. }
  4797. }
  4798. } // End if !done
  4799. if (FLAGS_enc_detect_source) {
  4800. PsSource(src, isrc, srctextlimit);
  4801. PsMark(src, 2, isrc, 0);
  4802. }
  4803. // Force a pruning based on whatever we have
  4804. BoostPrune(src, &destate, PRUNE_FINAL);
  4805. if (FLAGS_enc_detect_summary) {
  4806. DumpSummary(&destate, AsciiPair, 32);
  4807. DumpSummary(&destate, OtherPair, 32);
  4808. }
  4809. if (FLAGS_enc_detect_source) {
  4810. PsSourceFinish();
  4811. }
  4812. if (destate.debug_data != NULL) {
  4813. //// DumpDetail(&destate);
  4814. }
  4815. if (ignored_some_tag_text &&
  4816. (kMapToEncoding[destate.top_rankedencoding] == ASCII_7BIT)) {
  4817. // There were some interesting bytes, but only in tag text.
  4818. // Recursive call to reprocess looking at the tags this time.
  4819. if (destate.debug_data != NULL) {
  4820. SetDetailsEncLabel(&destate, ">> Recurse/tags");
  4821. // Print the current chart before recursive call
  4822. DumpDetail(&destate);
  4823. char buff[32];
  4824. snprintf(buff, sizeof(buff), ">> Recurse for tags");
  4825. PsRecurse(buff);
  4826. }
  4827. // Recursive call for high bytes in tags [no longer used, 1/16 tag score]
  4828. Encoding enc2 = InternalDetectEncoding(
  4829. kCEDForceTags, // force
  4830. text,
  4831. text_length,
  4832. url_hint,
  4833. http_charset_hint,
  4834. meta_charset_hint,
  4835. encoding_hint,
  4836. language_hint,
  4837. corpus_type,
  4838. ignore_7bit_mail_encodings,
  4839. bytes_consumed,
  4840. is_reliable,
  4841. second_best_enc);
  4842. if (destate.debug_data != NULL) {
  4843. // Show winning encoding and dump PostScript
  4844. char buff[32];
  4845. snprintf(buff, sizeof(buff), "=2 %s", MyEncodingName(enc2));
  4846. SetDetailsEncProb(&destate,
  4847. 0,
  4848. CompactEncDet::BackmapEncodingToRankedEncoding(enc2),
  4849. buff);
  4850. DumpDetail(&destate);
  4851. }
  4852. return enc2;
  4853. }
  4854. // If the detected encoding does not match default/hints, or if the hints
  4855. // conflict with each other, mark as unreliable. This can be used to trigger
  4856. // further scoring.
  4857. // Three buckets of input documents;
  4858. // ~19% of the web no hints, and top == 7bit, Latin1, or CP1252
  4859. // ~79% of the web one or more hints, all same encoding X and top == X
  4860. // ~ 2% of the web one or more hints that are inconsistent
  4861. Encoding top_enc = kMapToEncoding[destate.top_rankedencoding];
  4862. Encoding one_hint = destate.http_hint;
  4863. if ((one_hint == UNKNOWN_ENCODING) &&
  4864. (destate.meta_hint != UNKNOWN_ENCODING)) {
  4865. one_hint = destate.meta_hint;
  4866. }
  4867. if ((one_hint == UNKNOWN_ENCODING) &&
  4868. (destate.bom_hint != UNKNOWN_ENCODING)) {
  4869. one_hint = destate.bom_hint;
  4870. }
  4871. bool found_compatible_encoding = true;
  4872. if (one_hint == UNKNOWN_ENCODING) {
  4873. // [~14% of the web] No hints, and top == 7bit, Latin1, or CP1252
  4874. if (!CompatibleEnc(ISO_8859_1, top_enc)) {
  4875. found_compatible_encoding = false;
  4876. // If there is nothing but a TLD hint and its top encoding matches, OK
  4877. if ((destate.tld_hint != UNKNOWN_ENCODING) &&
  4878. CompatibleEnc(destate.tld_hint, top_enc)) {
  4879. found_compatible_encoding = true;
  4880. }
  4881. }
  4882. } else if (CompatibleEnc(one_hint, destate.http_hint) &&
  4883. CompatibleEnc(one_hint, destate.meta_hint) &&
  4884. CompatibleEnc(one_hint, destate.bom_hint)) {
  4885. // [~83% of the web] One or more hints, all same encoding X and top == X
  4886. if (!CompatibleEnc(one_hint, top_enc)) {
  4887. // [~ 2% of the web] Oops, not the declared encoding
  4888. found_compatible_encoding = false;
  4889. }
  4890. } else {
  4891. // [~ 3% of the web] Two or more hints that are inconsistent
  4892. one_hint = UNKNOWN_ENCODING;
  4893. found_compatible_encoding = false;
  4894. }
  4895. // If we turned Latin1 into Latin2 or 7 via trigrams, don't fail it here
  4896. if (destate.do_latin_trigrams) {
  4897. if (CompatibleEnc(kMapToEncoding[F_Latin1], top_enc) ||
  4898. CompatibleEnc(kMapToEncoding[F_Latin2], top_enc) ||
  4899. CompatibleEnc(kMapToEncoding[F_CP1250], top_enc) ||
  4900. CompatibleEnc(kMapToEncoding[F_ISO_8859_13], top_enc)) {
  4901. found_compatible_encoding = true;
  4902. destate.reliable = true;
  4903. }
  4904. }
  4905. // If top encoding is not compatible with the hints, but it is reliably
  4906. // UTF-8, accept it anyway.
  4907. // This will perform badly with mixed UTF-8 prefix plus another encoding in
  4908. // the body if done too early, so we want to be rescanning.
  4909. if (!found_compatible_encoding &&
  4910. destate.reliable &&
  4911. NoHintsCloseEnoughCompatible(top_enc) &&
  4912. (destate.next_interesting_pair[OtherPair] >= kStrongPairs) &&
  4913. CEDFlagRescanning(flags)) {
  4914. found_compatible_encoding = true;
  4915. }
  4916. // Hold off on this so Rescan() can see if the original encoding was reliable
  4917. //if (!found_compatible_encoding) {
  4918. // destate.reliable = false;
  4919. //}
  4920. // If unreliable, try rescoring to separate some encodings
  4921. if (!destate.reliable || !found_compatible_encoding) {
  4922. top_enc = Rescore(top_enc, isrc, srctextlimit, &destate);
  4923. }
  4924. *second_best_enc = kMapToEncoding[destate.second_top_rankedencoding];
  4925. // If unreliable, and not already rescanning,
  4926. // rescan middle of document to see if we can get a better
  4927. // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
  4928. // since the detector takes as much as 96 bytes of bigrams to decide.
  4929. //
  4930. // CANNOT retry ISO-2022-xx HZ etc. because no declaration escape at the front
  4931. // or we may land in the middle of some partial state. Skip them all.
  4932. //
  4933. if ((!destate.reliable || !found_compatible_encoding) &&
  4934. !CEDFlagRescanning(flags) &&
  4935. !SevenBitEncoding(top_enc)) {
  4936. top_enc = Rescan(top_enc,
  4937. isrc,
  4938. src,
  4939. srctextlimit,
  4940. url_hint,
  4941. http_charset_hint,
  4942. meta_charset_hint,
  4943. encoding_hint,
  4944. language_hint,
  4945. corpus_type,
  4946. ignore_7bit_mail_encodings,
  4947. &destate);
  4948. } else {
  4949. if (!found_compatible_encoding) {
  4950. destate.reliable = false;
  4951. }
  4952. }
  4953. if (destate.debug_data != NULL) {
  4954. // Dump PostScript
  4955. DumpDetail(&destate);
  4956. }
  4957. *bytes_consumed = src - isrc + 1; // We looked 1 byte beyond src
  4958. *is_reliable = destate.reliable;
  4959. return top_enc;
  4960. }
  4961. Encoding CompactEncDet::DetectEncoding(
  4962. const char* text, int text_length, const char* url_hint,
  4963. const char* http_charset_hint, const char* meta_charset_hint,
  4964. const int encoding_hint,
  4965. const Language language_hint, // User interface lang
  4966. const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
  4967. int* bytes_consumed, bool* is_reliable) {
  4968. if (FLAGS_ced_echo_input) {
  4969. string temp(text, text_length);
  4970. fprintf(stderr, "CompactEncDet::DetectEncoding()\n%s\n\n", temp.c_str());
  4971. }
  4972. if (FLAGS_counts) {
  4973. encdet_used = 0;
  4974. rescore_used = 0;
  4975. rescan_used = 0;
  4976. robust_used = 0;
  4977. looking_used = 0;
  4978. doing_used = 0;
  4979. ++encdet_used;
  4980. }
  4981. if (FLAGS_dirtsimple) {
  4982. // Just count first 64KB bigram encoding probabilities for each encoding
  4983. int robust_renc_list_len; // Number of active encodings
  4984. int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings
  4985. int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs
  4986. for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
  4987. robust_renc_list[i] = i;
  4988. }
  4989. robust_renc_list_len = NUM_RANKEDENCODING;
  4990. RobustScan(text, text_length,
  4991. robust_renc_list_len, robust_renc_list, robust_renc_probs);
  4992. // Pick off best encoding
  4993. int best_prob = -1;
  4994. Encoding enc = UNKNOWN_ENCODING;
  4995. for (int i = 0; i < robust_renc_list_len; ++i) {
  4996. if (best_prob < robust_renc_probs[i]) {
  4997. best_prob = robust_renc_probs[i];
  4998. enc = kMapToEncoding[robust_renc_list[i]];
  4999. }
  5000. }
  5001. *bytes_consumed = minint(text_length, (kMaxKBToRobustScan << 10));
  5002. *is_reliable = true;
  5003. if (FLAGS_counts) {
  5004. printf("CEDcounts ");
  5005. while (encdet_used--) {printf("encdet ");}
  5006. while (rescore_used--) {printf("rescore ");}
  5007. while (rescan_used--) {printf("rescan ");}
  5008. while (robust_used--) {printf("robust ");}
  5009. while (looking_used--) {printf("looking ");}
  5010. while (doing_used--) {printf("doing ");}
  5011. printf("\n");
  5012. }
  5013. return enc;
  5014. }
  5015. Encoding second_best_enc;
  5016. Encoding enc = InternalDetectEncoding(kCEDNone,
  5017. text,
  5018. text_length,
  5019. url_hint,
  5020. http_charset_hint,
  5021. meta_charset_hint,
  5022. encoding_hint,
  5023. language_hint, // User interface lang
  5024. corpus_type,
  5025. ignore_7bit_mail_encodings,
  5026. bytes_consumed,
  5027. is_reliable,
  5028. &second_best_enc);
  5029. if (FLAGS_counts) {
  5030. printf("CEDcounts ");
  5031. while (encdet_used--) {printf("encdet ");}
  5032. while (rescore_used--) {printf("rescore ");}
  5033. while (rescan_used--) {printf("rescan ");}
  5034. while (robust_used--) {printf("robust ");}
  5035. while (looking_used--) {printf("looking ");}
  5036. while (doing_used--) {printf("doing ");}
  5037. printf("\n");
  5038. }
  5039. #if defined(HTML5_MODE)
  5040. // Map all the Shift-JIS variants to Shift-JIS when used in Japanese locale.
  5041. if (language_hint == JAPANESE && IsShiftJisOrVariant(enc)) {
  5042. enc = JAPANESE_SHIFT_JIS;
  5043. }
  5044. // 7-bit encodings (except ISO-2022-JP), and some obscure encodings not
  5045. // supported in WHATWG encoding standard are marked as ASCII to keep the raw
  5046. // bytes intact.
  5047. switch (enc) {
  5048. case ISO_2022_KR:
  5049. case ISO_2022_CN:
  5050. case HZ_GB_2312:
  5051. case UTF7:
  5052. case UTF16LE:
  5053. case UTF16BE:
  5054. case CHINESE_EUC_DEC:
  5055. case CHINESE_CNS:
  5056. case CHINESE_BIG5_CP950:
  5057. case JAPANESE_CP932:
  5058. case MSFT_CP874:
  5059. case TSCII:
  5060. case TAMIL_MONO:
  5061. case TAMIL_BI:
  5062. case JAGRAN:
  5063. case BHASKAR:
  5064. case HTCHANAKYA:
  5065. case BINARYENC:
  5066. case UTF8UTF8:
  5067. case TAM_ELANGO:
  5068. case TAM_LTTMBARANI:
  5069. case TAM_SHREE:
  5070. case TAM_TBOOMIS:
  5071. case TAM_TMNEWS:
  5072. case TAM_WEBTAMIL:
  5073. case KDDI_SHIFT_JIS:
  5074. case DOCOMO_SHIFT_JIS:
  5075. case SOFTBANK_SHIFT_JIS:
  5076. case KDDI_ISO_2022_JP:
  5077. case SOFTBANK_ISO_2022_JP:
  5078. enc = ASCII_7BIT;
  5079. break;
  5080. default:
  5081. break;
  5082. }
  5083. #endif
  5084. return enc;
  5085. }
  5086. // Return top encoding hint for given string
  5087. Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) {
  5088. string normalized_lang = MakeChar8(string(name));
  5089. int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
  5090. normalized_lang.c_str());
  5091. if (n < 0) {return UNKNOWN_ENCODING;}
  5092. // Charset is eight bytes, probability table is eight bytes
  5093. int toprankenc =
  5094. TopCompressedProb((const char *)&kLangHintProbs[n].key_prob[kMaxLangKey],
  5095. kMaxLangVector);
  5096. return kMapToEncoding[toprankenc];
  5097. }
  5098. // Return top encoding hint for given string
  5099. Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) {
  5100. string normalized_tld = MakeChar4(string(name));
  5101. int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
  5102. normalized_tld.c_str());
  5103. if (n < 0) {return UNKNOWN_ENCODING;}
  5104. // TLD is four bytes, probability table is 12 bytes
  5105. int toprankenc =
  5106. TopCompressedProb((const char *)&kTLDHintProbs[n].key_prob[kMaxTldKey],
  5107. kMaxTldVector);
  5108. return kMapToEncoding[toprankenc];
  5109. }
  5110. // Return top encoding hint for given string
  5111. Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) {
  5112. string normalized_charset = MakeChar44(string(name));
  5113. int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
  5114. normalized_charset.c_str());
  5115. if (n < 0) {return UNKNOWN_ENCODING;}
  5116. // Charset is eight bytes, probability table is eight bytes
  5117. int toprankenc =
  5118. TopCompressedProb((const char *)&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
  5119. kMaxCharsetVector);
  5120. return kMapToEncoding[toprankenc];
  5121. }
  5122. const char* CompactEncDet::Version(void) {
  5123. return kVersion;
  5124. }