您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

content_type.c 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "libmime/content_type.h"
  17. #include "smtp_parsers.h"
  18. #include "utlist.h"
  19. void
  20. rspamd_content_type_add_param (rspamd_mempool_t *pool,
  21. struct rspamd_content_type *ct,
  22. gchar *name_start, gchar *name_end,
  23. gchar *value_start, gchar *value_end)
  24. {
  25. rspamd_ftok_t srch;
  26. struct rspamd_content_type_param *found = NULL, *nparam;
  27. g_assert (ct != NULL);
  28. nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
  29. nparam->name.begin = name_start;
  30. nparam->name.len = name_end - name_start;
  31. rspamd_str_lc (name_start, name_end - name_start);
  32. nparam->value.begin = value_start;
  33. nparam->value.len = value_end - value_start;
  34. RSPAMD_FTOK_ASSIGN (&srch, "charset");
  35. if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
  36. /* Adjust charset */
  37. found = nparam;
  38. ct->charset.begin = nparam->value.begin;
  39. ct->charset.len = nparam->value.len;
  40. }
  41. RSPAMD_FTOK_ASSIGN (&srch, "boundary");
  42. if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
  43. found = nparam;
  44. gchar *lc_boundary;
  45. /* Adjust boundary */
  46. lc_boundary = rspamd_mempool_alloc (pool, nparam->value.len);
  47. memcpy (lc_boundary, nparam->value.begin, nparam->value.len);
  48. rspamd_str_lc (lc_boundary, nparam->value.len);
  49. ct->boundary.begin = lc_boundary;
  50. ct->boundary.len = nparam->value.len;
  51. /* Preserve original (case sensitive) boundary */
  52. ct->orig_boundary.begin = nparam->value.begin;
  53. ct->orig_boundary.len = nparam->value.len;
  54. }
  55. if (!found) {
  56. srch.begin = nparam->name.begin;
  57. srch.len = nparam->name.len;
  58. rspamd_str_lc (value_start, value_end - value_start);
  59. if (ct->attrs) {
  60. found = g_hash_table_lookup (ct->attrs, &srch);
  61. } else {
  62. ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
  63. rspamd_ftok_icase_equal);
  64. }
  65. if (!found) {
  66. DL_APPEND (found, nparam);
  67. g_hash_table_insert (ct->attrs, &nparam->name, nparam);
  68. }
  69. else {
  70. DL_APPEND (found, nparam);
  71. }
  72. }
  73. }
  74. static struct rspamd_content_type *
  75. rspamd_content_type_parser (gchar *in, gsize len, rspamd_mempool_t *pool)
  76. {
  77. guint obraces = 0, ebraces = 0, qlen = 0;
  78. gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL;
  79. struct rspamd_content_type *res = NULL, val;
  80. gboolean eqsign_seen = FALSE;
  81. enum {
  82. parse_type,
  83. parse_subtype,
  84. parse_after_subtype,
  85. parse_param_name,
  86. parse_param_after_name,
  87. parse_param_value,
  88. parse_param_value_after_quote,
  89. parse_space,
  90. parse_quoted,
  91. parse_comment,
  92. } state = parse_space, next_state = parse_type;
  93. p = in;
  94. c = p;
  95. end = p + len;
  96. memset (&val, 0, sizeof (val));
  97. val.cpy = in;
  98. while (p < end) {
  99. switch (state) {
  100. case parse_type:
  101. if (g_ascii_isspace (*p) || *p == ';') {
  102. /* We have type without subtype */
  103. val.type.begin = c;
  104. val.type.len = p - c;
  105. state = parse_after_subtype;
  106. } else if (*p == '/') {
  107. val.type.begin = c;
  108. val.type.len = p - c;
  109. state = parse_space;
  110. next_state = parse_subtype;
  111. p++;
  112. } else {
  113. p++;
  114. }
  115. break;
  116. case parse_subtype:
  117. if (g_ascii_isspace (*p) || *p == ';') {
  118. val.subtype.begin = c;
  119. val.subtype.len = p - c;
  120. state = parse_after_subtype;
  121. } else {
  122. p++;
  123. }
  124. break;
  125. case parse_after_subtype:
  126. if (*p == ';' || g_ascii_isspace (*p)) {
  127. p++;
  128. } else if (*p == '(') {
  129. c = p;
  130. state = parse_comment;
  131. next_state = parse_param_name;
  132. obraces = 1;
  133. ebraces = 0;
  134. pname_start = NULL;
  135. pname_end = NULL;
  136. eqsign_seen = FALSE;
  137. p++;
  138. } else {
  139. c = p;
  140. state = parse_param_name;
  141. pname_start = NULL;
  142. pname_end = NULL;
  143. eqsign_seen = FALSE;
  144. }
  145. break;
  146. case parse_param_name:
  147. if (*p == '=') {
  148. pname_start = c;
  149. pname_end = p;
  150. state = parse_param_after_name;
  151. eqsign_seen = TRUE;
  152. p++;
  153. } else if (g_ascii_isspace (*p)) {
  154. pname_start = c;
  155. pname_end = p;
  156. state = parse_param_after_name;
  157. } else {
  158. p++;
  159. }
  160. break;
  161. case parse_param_after_name:
  162. if (g_ascii_isspace (*p)) {
  163. p++;
  164. } else if (*p == '=') {
  165. if (eqsign_seen) {
  166. /* Treat as value start */
  167. c = p;
  168. eqsign_seen = FALSE;
  169. state = parse_space;
  170. next_state = parse_param_value;
  171. p++;
  172. } else {
  173. eqsign_seen = TRUE;
  174. p++;
  175. }
  176. } else {
  177. if (eqsign_seen) {
  178. state = parse_param_value;
  179. c = p;
  180. } else {
  181. /* Invalid parameter without value */
  182. c = p;
  183. state = parse_param_name;
  184. pname_start = NULL;
  185. pname_end = NULL;
  186. }
  187. }
  188. break;
  189. case parse_param_value:
  190. if (*p == '"') {
  191. p++;
  192. c = p;
  193. state = parse_quoted;
  194. next_state = parse_param_value_after_quote;
  195. } else if (g_ascii_isspace (*p)) {
  196. if (pname_start && pname_end && pname_end > pname_start) {
  197. rspamd_content_type_add_param (pool, &val, pname_start,
  198. pname_end, c, p);
  199. }
  200. state = parse_space;
  201. next_state = parse_param_name;
  202. pname_start = NULL;
  203. pname_end = NULL;
  204. } else if (*p == '(') {
  205. if (pname_start && pname_end && pname_end > pname_start) {
  206. rspamd_content_type_add_param (pool, &val, pname_start,
  207. pname_end, c, p);
  208. }
  209. obraces = 1;
  210. ebraces = 0;
  211. p++;
  212. state = parse_comment;
  213. next_state = parse_param_name;
  214. pname_start = NULL;
  215. pname_end = NULL;
  216. }
  217. else if (*p == ';') {
  218. if (pname_start && pname_end && pname_end > pname_start) {
  219. rspamd_content_type_add_param (pool, &val, pname_start,
  220. pname_end, c, p);
  221. }
  222. p ++;
  223. state = parse_space;
  224. next_state = parse_param_name;
  225. pname_start = NULL;
  226. pname_end = NULL;
  227. }
  228. else {
  229. p++;
  230. }
  231. break;
  232. case parse_param_value_after_quote:
  233. if (pname_start && pname_end && pname_end > pname_start) {
  234. rspamd_content_type_add_param (pool, &val, pname_start,
  235. pname_end, c, c + qlen);
  236. }
  237. if (g_ascii_isspace (*p)) {
  238. state = parse_space;
  239. next_state = parse_param_name;
  240. pname_start = NULL;
  241. pname_end = NULL;
  242. } else if (*p == '(') {
  243. obraces = 1;
  244. ebraces = 0;
  245. p++;
  246. state = parse_comment;
  247. next_state = parse_param_name;
  248. pname_start = NULL;
  249. pname_end = NULL;
  250. } else {
  251. state = parse_param_name;
  252. pname_start = NULL;
  253. pname_end = NULL;
  254. c = p;
  255. }
  256. break;
  257. case parse_quoted:
  258. if (*p == '\\') {
  259. /* Quoted pair */
  260. if (p + 1 < end) {
  261. p += 2;
  262. } else {
  263. p++;
  264. }
  265. } else if (*p == '"') {
  266. qlen = p - c;
  267. state = next_state;
  268. } else {
  269. p++;
  270. }
  271. break;
  272. case parse_comment:
  273. if (*p == '(') {
  274. obraces++;
  275. p++;
  276. } else if (*p == ')') {
  277. ebraces++;
  278. p++;
  279. if (ebraces == obraces && p < end) {
  280. if (g_ascii_isspace (*p)) {
  281. state = parse_space;
  282. } else {
  283. c = p;
  284. state = next_state;
  285. }
  286. }
  287. } else {
  288. p++;
  289. }
  290. break;
  291. case parse_space:
  292. if (g_ascii_isspace (*p)) {
  293. p++;
  294. } else if (*p == '(') {
  295. obraces = 1;
  296. ebraces = 0;
  297. p++;
  298. state = parse_comment;
  299. } else {
  300. c = p;
  301. state = next_state;
  302. }
  303. break;
  304. }
  305. }
  306. /* Process leftover */
  307. switch (state) {
  308. case parse_type:
  309. val.type.begin = c;
  310. val.type.len = p - c;
  311. break;
  312. case parse_subtype:
  313. val.subtype.begin = c;
  314. val.subtype.len = p - c;
  315. break;
  316. case parse_param_value:
  317. if (pname_start && pname_end && pname_end > pname_start) {
  318. if (p > c && *(p - 1) == ';') {
  319. p --;
  320. }
  321. rspamd_content_type_add_param (pool, &val, pname_start,
  322. pname_end, c, p);
  323. }
  324. break;
  325. case parse_param_value_after_quote:
  326. if (pname_start && pname_end && pname_end > pname_start) {
  327. rspamd_content_type_add_param (pool, &val, pname_start,
  328. pname_end, c, c + qlen);
  329. }
  330. break;
  331. default:
  332. break;
  333. }
  334. if (val.type.len > 0) {
  335. res = rspamd_mempool_alloc (pool, sizeof (val));
  336. memcpy (res, &val, sizeof (val));
  337. /* Lowercase common thingies */
  338. }
  339. return res;
  340. }
  341. struct rspamd_content_type *
  342. rspamd_content_type_parse (const gchar *in,
  343. gsize len, rspamd_mempool_t *pool)
  344. {
  345. struct rspamd_content_type *res = NULL;
  346. rspamd_ftok_t srch;
  347. gchar *lc_data;
  348. lc_data = rspamd_mempool_alloc (pool, len + 1);
  349. rspamd_strlcpy (lc_data, in, len + 1);
  350. if ((res = rspamd_content_type_parser (lc_data, len, pool)) != NULL) {
  351. if (res->attrs) {
  352. rspamd_mempool_add_destructor (pool,
  353. (rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
  354. }
  355. /* Now do some hacks to work with broken content types */
  356. if (res->subtype.len == 0) {
  357. res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  358. RSPAMD_FTOK_ASSIGN (&srch, "text");
  359. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  360. /* Workaround for Content-Type: text */
  361. /* Assume text/plain */
  362. RSPAMD_FTOK_ASSIGN (&srch, "plain");
  363. }
  364. else {
  365. RSPAMD_FTOK_ASSIGN (&srch, "html");
  366. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  367. /* Workaround for Content-Type: html */
  368. RSPAMD_FTOK_ASSIGN (&res->type, "text");
  369. RSPAMD_FTOK_ASSIGN (&res->subtype, "html");
  370. }
  371. else {
  372. RSPAMD_FTOK_ASSIGN (&srch, "application");
  373. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  374. RSPAMD_FTOK_ASSIGN (&res->subtype, "octet-stream");
  375. }
  376. }
  377. }
  378. }
  379. else {
  380. /* Common mistake done by retards */
  381. RSPAMD_FTOK_ASSIGN (&srch, "alternate");
  382. if (rspamd_ftok_cmp (&res->subtype, &srch) == 0) {
  383. res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  384. RSPAMD_FTOK_ASSIGN (&res->subtype, "alternative");
  385. }
  386. }
  387. RSPAMD_FTOK_ASSIGN (&srch, "multipart");
  388. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  389. res->flags |= RSPAMD_CONTENT_TYPE_MULTIPART;
  390. }
  391. else {
  392. RSPAMD_FTOK_ASSIGN (&srch, "text");
  393. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  394. res->flags |= RSPAMD_CONTENT_TYPE_TEXT;
  395. }
  396. else {
  397. RSPAMD_FTOK_ASSIGN (&srch, "message");
  398. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  399. RSPAMD_FTOK_ASSIGN (&srch, "delivery-status");
  400. if (rspamd_ftok_cmp (&res->subtype, &srch) == 0) {
  401. res->flags |= RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_DSN;
  402. }
  403. else {
  404. RSPAMD_FTOK_ASSIGN (&srch, "notification");
  405. if (rspamd_substring_search_caseless (res->subtype.begin,
  406. res->subtype.len, srch.begin, srch.len) != -1) {
  407. res->flags |= RSPAMD_CONTENT_TYPE_TEXT|
  408. RSPAMD_CONTENT_TYPE_DSN;
  409. }
  410. else {
  411. res->flags |= RSPAMD_CONTENT_TYPE_MESSAGE;
  412. }
  413. }
  414. }
  415. }
  416. }
  417. }
  418. else {
  419. msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
  420. }
  421. return res;
  422. }
  423. void
  424. rspamd_content_disposition_add_param (rspamd_mempool_t *pool,
  425. struct rspamd_content_disposition *cd,
  426. const gchar *name_start, const gchar *name_end,
  427. const gchar *value_start, const gchar *value_end)
  428. {
  429. rspamd_ftok_t srch;
  430. gchar *decoded;
  431. struct rspamd_content_type_param *found = NULL, *nparam;
  432. g_assert (cd != NULL);
  433. srch.begin = name_start;
  434. srch.len = name_end - name_start;
  435. if (cd->attrs) {
  436. found = g_hash_table_lookup (cd->attrs, &srch);
  437. }
  438. else {
  439. cd->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
  440. rspamd_ftok_icase_equal);
  441. rspamd_mempool_add_destructor (pool,
  442. (rspamd_mempool_destruct_t)g_hash_table_unref, cd->attrs);
  443. }
  444. nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
  445. nparam->name.begin = name_start;
  446. nparam->name.len = name_end - name_start;
  447. decoded = rspamd_mime_header_decode (pool, value_start,
  448. value_end - value_start, NULL);
  449. RSPAMD_FTOK_FROM_STR (&nparam->value, decoded);
  450. if (!found) {
  451. g_hash_table_insert (cd->attrs, &nparam->name, nparam);
  452. }
  453. DL_APPEND (found, nparam);
  454. srch.begin = "filename";
  455. srch.len = 8;
  456. if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
  457. /* Adjust filename */
  458. cd->filename.begin = nparam->value.begin;
  459. cd->filename.len = nparam->value.len;
  460. }
  461. }
  462. struct rspamd_content_disposition *
  463. rspamd_content_disposition_parse (const gchar *in,
  464. gsize len, rspamd_mempool_t *pool)
  465. {
  466. struct rspamd_content_disposition *res = NULL, val;
  467. if (rspamd_content_disposition_parser (in, len, &val, pool)) {
  468. res = rspamd_mempool_alloc (pool, sizeof (val));
  469. memcpy (res, &val, sizeof (val));
  470. res->lc_data = rspamd_mempool_alloc (pool, len + 1);
  471. rspamd_strlcpy (res->lc_data, in, len + 1);
  472. rspamd_str_lc (res->lc_data, len);
  473. }
  474. else {
  475. msg_warn_pool ("cannot parse content disposition: %*s",
  476. (gint)len, val.lc_data);
  477. }
  478. return res;
  479. }