You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

content_type.c 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "libmime/content_type.h"
  17. #include "smtp_parsers.h"
  18. #include "utlist.h"
  19. void
  20. rspamd_content_type_add_param (rspamd_mempool_t *pool,
  21. struct rspamd_content_type *ct,
  22. const gchar *name_start, const gchar *name_end,
  23. const gchar *value_start, const gchar *value_end)
  24. {
  25. rspamd_ftok_t srch;
  26. struct rspamd_content_type_param *found = NULL, *nparam;
  27. g_assert (ct != NULL);
  28. srch.begin = name_start;
  29. srch.len = name_end - name_start;
  30. if (ct->attrs) {
  31. found = g_hash_table_lookup (ct->attrs, &srch);
  32. }
  33. else {
  34. ct->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
  35. rspamd_ftok_icase_equal);
  36. }
  37. nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
  38. nparam->name.begin = name_start;
  39. nparam->name.len = name_end - name_start;
  40. nparam->value.begin = value_start;
  41. nparam->value.len = value_end - value_start;
  42. if (!found) {
  43. DL_APPEND (found, nparam);
  44. g_hash_table_insert (ct->attrs, &nparam->name, nparam);
  45. }
  46. else {
  47. DL_APPEND (found, nparam);
  48. }
  49. RSPAMD_FTOK_ASSIGN (&srch, "charset");
  50. if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
  51. /* Adjust charset */
  52. ct->charset.begin = nparam->value.begin;
  53. ct->charset.len = nparam->value.len;
  54. }
  55. RSPAMD_FTOK_ASSIGN (&srch, "boundary");
  56. if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
  57. /* Adjust boundary */
  58. ct->boundary.begin = nparam->value.begin;
  59. ct->boundary.len = nparam->value.len;
  60. }
  61. }
  62. static struct rspamd_content_type *
  63. rspamd_content_type_parser (const gchar *in, gsize len, rspamd_mempool_t *pool)
  64. {
  65. guint obraces = 0, ebraces = 0, qlen = 0;
  66. const gchar *p, *c, *end, *pname_start = NULL, *pname_end = NULL;
  67. struct rspamd_content_type *res = NULL, val;
  68. gboolean eqsign_seen = FALSE;
  69. enum {
  70. parse_type,
  71. parse_subtype,
  72. parse_after_subtype,
  73. parse_param_name,
  74. parse_param_after_name,
  75. parse_param_value,
  76. parse_param_value_after_quote,
  77. parse_space,
  78. parse_quoted,
  79. parse_comment,
  80. } state = parse_space, next_state = parse_type;
  81. p = in;
  82. c = p;
  83. end = p + len;
  84. memset (&val, 0, sizeof (val));
  85. val.lc_data = (gchar *)in;
  86. while (p < end) {
  87. switch (state) {
  88. case parse_type:
  89. if (g_ascii_isspace (*p) || *p == ';') {
  90. /* We have type without subtype */
  91. val.type.begin = c;
  92. val.type.len = p - c;
  93. state = parse_after_subtype;
  94. } else if (*p == '/') {
  95. val.type.begin = c;
  96. val.type.len = p - c;
  97. state = parse_space;
  98. next_state = parse_subtype;
  99. p++;
  100. } else {
  101. p++;
  102. }
  103. break;
  104. case parse_subtype:
  105. if (g_ascii_isspace (*p) || *p == ';') {
  106. val.subtype.begin = c;
  107. val.subtype.len = p - c;
  108. state = parse_after_subtype;
  109. } else {
  110. p++;
  111. }
  112. break;
  113. case parse_after_subtype:
  114. if (*p == ';' || g_ascii_isspace (*p)) {
  115. p++;
  116. } else if (*p == '(') {
  117. c = p;
  118. state = parse_comment;
  119. next_state = parse_param_name;
  120. obraces = 1;
  121. ebraces = 0;
  122. pname_start = NULL;
  123. pname_end = NULL;
  124. eqsign_seen = FALSE;
  125. p++;
  126. } else {
  127. c = p;
  128. state = parse_param_name;
  129. pname_start = NULL;
  130. pname_end = NULL;
  131. eqsign_seen = FALSE;
  132. }
  133. break;
  134. case parse_param_name:
  135. if (*p == '=') {
  136. pname_start = c;
  137. pname_end = p;
  138. state = parse_param_after_name;
  139. eqsign_seen = TRUE;
  140. p++;
  141. } else if (g_ascii_isspace (*p)) {
  142. pname_start = c;
  143. pname_end = p;
  144. state = parse_param_after_name;
  145. } else {
  146. p++;
  147. }
  148. break;
  149. case parse_param_after_name:
  150. if (g_ascii_isspace (*p)) {
  151. p++;
  152. } else if (*p == '=') {
  153. if (eqsign_seen) {
  154. /* Treat as value start */
  155. c = p;
  156. eqsign_seen = FALSE;
  157. state = parse_space;
  158. next_state = parse_param_value;
  159. p++;
  160. } else {
  161. eqsign_seen = TRUE;
  162. p++;
  163. }
  164. } else {
  165. if (eqsign_seen) {
  166. state = parse_param_value;
  167. c = p;
  168. } else {
  169. /* Invalid parameter without value */
  170. c = p;
  171. state = parse_param_name;
  172. pname_start = NULL;
  173. pname_end = NULL;
  174. }
  175. }
  176. break;
  177. case parse_param_value:
  178. if (*p == '"') {
  179. p++;
  180. c = p;
  181. state = parse_quoted;
  182. next_state = parse_param_value_after_quote;
  183. } else if (g_ascii_isspace (*p)) {
  184. if (pname_start && pname_end && pname_end > pname_start) {
  185. rspamd_content_type_add_param (pool, &val, pname_start,
  186. pname_end, c, p);
  187. }
  188. state = parse_space;
  189. next_state = parse_param_name;
  190. pname_start = NULL;
  191. pname_end = NULL;
  192. } else if (*p == '(') {
  193. if (pname_start && pname_end && pname_end > pname_start) {
  194. rspamd_content_type_add_param (pool, &val, pname_start,
  195. pname_end, c, p);
  196. }
  197. obraces = 1;
  198. ebraces = 0;
  199. p++;
  200. state = parse_comment;
  201. next_state = parse_param_name;
  202. pname_start = NULL;
  203. pname_end = NULL;
  204. } else {
  205. p++;
  206. }
  207. break;
  208. case parse_param_value_after_quote:
  209. if (pname_start && pname_end && pname_end > pname_start) {
  210. rspamd_content_type_add_param (pool, &val, pname_start,
  211. pname_end, c, c + qlen);
  212. }
  213. if (g_ascii_isspace (*p)) {
  214. state = parse_space;
  215. next_state = parse_param_name;
  216. pname_start = NULL;
  217. pname_end = NULL;
  218. } else if (*p == '(') {
  219. obraces = 1;
  220. ebraces = 0;
  221. p++;
  222. state = parse_comment;
  223. next_state = parse_param_name;
  224. pname_start = NULL;
  225. pname_end = NULL;
  226. } else {
  227. state = parse_param_name;
  228. pname_start = NULL;
  229. pname_end = NULL;
  230. c = p;
  231. }
  232. break;
  233. case parse_quoted:
  234. if (*p == '\\') {
  235. /* Quoted pair */
  236. if (p + 1 < end) {
  237. p += 2;
  238. } else {
  239. p++;
  240. }
  241. } else if (*p == '"') {
  242. qlen = p - c;
  243. state = next_state;
  244. } else {
  245. p++;
  246. }
  247. break;
  248. case parse_comment:
  249. if (*p == '(') {
  250. obraces++;
  251. p++;
  252. } else if (*p == ')') {
  253. ebraces++;
  254. p++;
  255. if (ebraces == obraces && p < end) {
  256. if (g_ascii_isspace (*p)) {
  257. state = parse_space;
  258. } else {
  259. c = p;
  260. state = next_state;
  261. }
  262. }
  263. } else {
  264. p++;
  265. }
  266. break;
  267. case parse_space:
  268. if (g_ascii_isspace (*p)) {
  269. p++;
  270. } else if (*p == '(') {
  271. obraces = 1;
  272. ebraces = 0;
  273. p++;
  274. state = parse_comment;
  275. } else {
  276. c = p;
  277. state = next_state;
  278. }
  279. break;
  280. }
  281. }
  282. /* Process leftover */
  283. switch (state) {
  284. case parse_type:
  285. val.type.begin = c;
  286. val.type.len = p - c;
  287. break;
  288. case parse_subtype:
  289. val.subtype.begin = c;
  290. val.subtype.len = p - c;
  291. break;
  292. case parse_param_value:
  293. if (pname_start && pname_end && pname_end > pname_start) {
  294. rspamd_content_type_add_param (pool, &val, pname_start,
  295. pname_end, c, p);
  296. }
  297. case parse_param_value_after_quote:
  298. if (pname_start && pname_end && pname_end > pname_start) {
  299. rspamd_content_type_add_param (pool, &val, pname_start,
  300. pname_end, c, c + qlen);
  301. }
  302. break;
  303. default:
  304. break;
  305. }
  306. if (val.type.len > 0) {
  307. res = rspamd_mempool_alloc (pool, sizeof (val));
  308. memcpy (res, &val, sizeof (val));
  309. }
  310. return res;
  311. }
  312. struct rspamd_content_type *
  313. rspamd_content_type_parse (const gchar *in,
  314. gsize len, rspamd_mempool_t *pool)
  315. {
  316. struct rspamd_content_type *res = NULL;
  317. rspamd_ftok_t srch;
  318. gchar *lc_data;
  319. lc_data = rspamd_mempool_alloc (pool, len);
  320. memcpy (lc_data, in, len);
  321. rspamd_str_lc (lc_data, len);
  322. if ((res = rspamd_content_type_parser (lc_data, len, pool)) != NULL) {
  323. if (res->attrs) {
  324. rspamd_mempool_add_destructor (pool,
  325. (rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
  326. }
  327. /* Now do some hacks to work with broken content types */
  328. if (res->subtype.len == 0) {
  329. res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  330. RSPAMD_FTOK_ASSIGN (&srch, "text");
  331. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  332. /* Workaround for Content-Type: text */
  333. /* Assume text/plain */
  334. RSPAMD_FTOK_ASSIGN (&srch, "plain");
  335. }
  336. else {
  337. RSPAMD_FTOK_ASSIGN (&srch, "html");
  338. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  339. /* Workaround for Content-Type: html */
  340. RSPAMD_FTOK_ASSIGN (&res->type, "text");
  341. RSPAMD_FTOK_ASSIGN (&res->subtype, "html");
  342. }
  343. else {
  344. RSPAMD_FTOK_ASSIGN (&srch, "application");
  345. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  346. RSPAMD_FTOK_ASSIGN (&res->subtype, "octet-stream");
  347. }
  348. }
  349. }
  350. }
  351. else {
  352. /* Common mistake done by retards */
  353. RSPAMD_FTOK_ASSIGN (&srch, "alternate");
  354. if (rspamd_ftok_cmp (&res->subtype, &srch) == 0) {
  355. res->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
  356. RSPAMD_FTOK_ASSIGN (&res->subtype, "alternative");
  357. }
  358. }
  359. RSPAMD_FTOK_ASSIGN (&srch, "multipart");
  360. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  361. res->flags |= RSPAMD_CONTENT_TYPE_MULTIPART;
  362. }
  363. else {
  364. RSPAMD_FTOK_ASSIGN (&srch, "text");
  365. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  366. res->flags |= RSPAMD_CONTENT_TYPE_TEXT;
  367. }
  368. else {
  369. RSPAMD_FTOK_ASSIGN (&srch, "message");
  370. if (rspamd_ftok_cmp (&res->type, &srch) == 0) {
  371. RSPAMD_FTOK_ASSIGN (&srch, "delivery-status");
  372. if (rspamd_ftok_cmp (&res->subtype, &srch) == 0) {
  373. res->flags |= RSPAMD_CONTENT_TYPE_TEXT|RSPAMD_CONTENT_TYPE_DSN;
  374. }
  375. else {
  376. res->flags |= RSPAMD_CONTENT_TYPE_MESSAGE;
  377. }
  378. }
  379. }
  380. }
  381. }
  382. else {
  383. msg_warn_pool ("cannot parse content type: %*s", (gint)len, lc_data);
  384. }
  385. return res;
  386. }
  387. void
  388. rspamd_content_disposition_add_param (rspamd_mempool_t *pool,
  389. struct rspamd_content_disposition *cd,
  390. const gchar *name_start, const gchar *name_end,
  391. const gchar *value_start, const gchar *value_end)
  392. {
  393. rspamd_ftok_t srch;
  394. gchar *decoded;
  395. struct rspamd_content_type_param *found = NULL, *nparam;
  396. g_assert (cd != NULL);
  397. srch.begin = name_start;
  398. srch.len = name_end - name_start;
  399. if (cd->attrs) {
  400. found = g_hash_table_lookup (cd->attrs, &srch);
  401. }
  402. else {
  403. cd->attrs = g_hash_table_new (rspamd_ftok_icase_hash,
  404. rspamd_ftok_icase_equal);
  405. }
  406. nparam = rspamd_mempool_alloc (pool, sizeof (*nparam));
  407. nparam->name.begin = name_start;
  408. nparam->name.len = name_end - name_start;
  409. decoded = rspamd_mime_header_decode (pool, value_start, value_end - value_start);
  410. RSPAMD_FTOK_FROM_STR (&nparam->value, decoded);
  411. if (!found) {
  412. g_hash_table_insert (cd->attrs, &nparam->name, nparam);
  413. }
  414. DL_APPEND (found, nparam);
  415. srch.begin = "filename";
  416. srch.len = 8;
  417. if (rspamd_ftok_cmp (&nparam->name, &srch) == 0) {
  418. /* Adjust filename */
  419. cd->filename.begin = nparam->value.begin;
  420. cd->filename.len = nparam->value.len;
  421. }
  422. }
  423. struct rspamd_content_disposition *
  424. rspamd_content_disposition_parse (const gchar *in,
  425. gsize len, rspamd_mempool_t *pool)
  426. {
  427. struct rspamd_content_disposition *res = NULL, val;
  428. val.lc_data = rspamd_mempool_alloc (pool, len);
  429. memcpy (val.lc_data, in, len);
  430. rspamd_str_lc (val.lc_data, len);
  431. if (rspamd_content_disposition_parser (in, len, &val, pool)) {
  432. res = rspamd_mempool_alloc (pool, sizeof (val));
  433. memcpy (res, &val, sizeof (val));
  434. if (res->attrs) {
  435. rspamd_mempool_add_destructor (pool,
  436. (rspamd_mempool_destruct_t)g_hash_table_unref, res->attrs);
  437. }
  438. }
  439. else {
  440. msg_warn_pool ("cannot parse content disposition: %*s",
  441. (gint)len, val.lc_data);
  442. }
  443. return res;
  444. }