summaryrefslogtreecommitdiffstats
path: root/src/main/java/com/gitblit/wicket/pages/RepositoryPage.java
blob: 2b97bc1657cc9f6fb494268644221c48536181ad (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
/*
 * Copyright 2011 gitblit.com.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.gitblit.wicket.pages;

import java.io.Serializable;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.wicket.Component;
import org.apache.wicket.PageParameters;
import org.apache.wicket.RestartResponseException;
import org.apache.wicket.behavior.SimpleAttributeModifier;
import org.apache.wicket.markup.html.basic.Label;
import org.apache.wicket.markup.html.form.DropDownChoice;
import org.apache.wicket.markup.html.form.TextField;
import org.apache.wicket.markup.html.link.ExternalLink;
import org.apache.wicket.markup.html.panel.Fragment;
import org.apache.wicket.model.IModel;
import org.apache.wicket.model.Model;
import org.apache.wicket.request.target.basic.RedirectRequestTarget;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.PersonIdent;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gitblit.Constants;
import com.gitblit.GitBlitException;
import com.gitblit.Keys;
import com.gitblit.models.ProjectModel;
import com.gitblit.models.RefModel;
import com.gitblit.models.RepositoryModel;
import com.gitblit.models.SubmoduleModel;
import com.gitblit.models.UserModel;
import com.gitblit.models.UserRepositoryPreferences;
import com.gitblit.servlet.PagesServlet;
import com.gitblit.servlet.SyndicationServlet;
import com.gitblit.tickets.TicketIndexer.Lucene;
import com.gitblit.utils.ArrayUtils;
import com.gitblit.utils.BugtraqProcessor;
import com.gitblit.utils.DeepCopier;
import com.gitblit.utils.JGitUtils;
import com.gitblit.utils.RefLogUtils;
import com.gitblit.utils.StringUtils;
import com.gitblit.wicket.CacheControl;
import com.gitblit.wicket.GitBlitWebSession;
import com.gitblit.wicket.PageRegistration;
import com.gitblit.wicket.PageRegistration.OtherPageLink;
import com.gitblit.wicket.SessionlessForm;
import com.gitblit.wicket.WicketUtils;
import com.gitblit.wicket.panels.LinkPanel;
import com.gitblit.wicket.panels.NavigationPanel;
import com.gitblit.wicket.panels.RefsPanel;

public abstract class RepositoryPage extends RootPage {

	protected final Logger logger = LoggerFactory.getLogger(getClass());

	private final String PARAM_STAR = "star";

	protected final String projectName;
	protected final String repositoryName;
	protected final String objectId;

	private transient Repository r;

	private RepositoryModel m;

	private Map<String, SubmoduleModel> submodules;

	private final Map<String, PageRegistration> registeredPages;
	private boolean showAdmin;
	private boolean isOwner;

	public RepositoryPage(PageParameters params) {
		super(params);
		repositoryName = WicketUtils.getRepositoryName(params);
		String root = StringUtils.getFirstPathElement(repositoryName);
		if (StringUtils.isEmpty(root)) {
			projectName = app().settings().getString(Keys.web.repositoryRootGroupName, "main");
		} else {
			projectName = root;
		}
		objectId = WicketUtils.getObject(params);

		if (StringUtils.isEmpty(repositoryName)) {
			error(MessageFormat.format(getString("gb.repositoryNotSpecifiedFor"), getPageName()), true);
		}

		if (!getRepositoryModel().hasCommits) {
			throw new RestartResponseException(EmptyRepositoryPage.class, params);
		}

		if (getRepositoryModel().isCollectingGarbage) {
			error(MessageFormat.format(getString("gb.busyCollectingGarbage"), getRepositoryModel().name), true);
		}

		if (objectId != null) {
			RefModel branch = null;
			if ((branch = JGitUtils.getBranch(getRepository(), objectId)) != null) {
				UserModel user = GitBlitWebSession.get().getUser();
				if (user == null) {
					// workaround until get().getUser() is reviewed throughout the app
					user = UserModel.ANONYMOUS;
				}
				boolean canAccess = user.canView(getRepositoryModel(),
								branch.reference.getName());
				if (!canAccess) {
					error(getString("gb.accessDenied"), true);
				}
			}
		}

		if (params.containsKey(PARAM_STAR)) {
			// set starred state
			boolean star = params.getBoolean(PARAM_STAR);
			UserModel user = GitBlitWebSession.get().getUser();
			if (user != null && user.isAuthenticated) {
				UserRepositoryPreferences prefs = user.getPreferences().getRepositoryPreferences(getRepositoryModel().name);
				prefs.starred = star;
				try {
					app().gitblit().reviseUser(user.username, user);
				} catch (GitBlitException e) {
					logger.error("Failed to update user " + user.username, e);
					error(getString("gb.failedToUpdateUser"), false);
				}
			}
		}

		// register the available page links for this page and user
		registeredPages = registerPages();

		// standard page links
		List<PageRegistration> pages = new ArrayList<PageRegistration>(registeredPages.values());
		NavigationPanel navigationPanel = new NavigationPanel("repositoryNavPanel", getRepoNavPageClass(), pages);
		add(navigationPanel);

		add(new ExternalLink("syndication", SyndicationServlet.asLink(getRequest()
				.getRelativePathPrefixToContextRoot(), repositoryName, null, 0)));

		// add floating search form
		SearchForm searchForm = new SearchForm("searchForm", repositoryName);
		add(searchForm);
		searchForm.setTranslatedAttributes();

		// set stateless page preference
		setStatelessHint(true);
	}

	@Override
	protected Class<? extends BasePage> getRootNavPageClass() {
		return RepositoriesPage.class;
	}

	protected Class<? extends BasePage> getRepoNavPageClass() {
		return getClass();
	}

	protected BugtraqProcessor bugtraqProcessor() {
		return new BugtraqProcessor(app().settings());
	}

	private Map<String, PageRegistration> registerPages() {
		PageParameters params = null;
		if (!StringUtils.isEmpty(repositoryName)) {
			params = WicketUtils.newRepositoryParameter(repositoryName);
		}
		Map<String, PageRegistration> pages = new LinkedHashMap<String, PageRegistration>();

		Repository r = getRepository();
		RepositoryModel model = getRepositoryModel();

		// standard links
		if (RefLogUtils.getRefLogBranch(r) == null) {
			pages.put("summary", new PageRegistration("gb.summary", SummaryPage.class, params));
		} else {
			pages.put("summary", new PageRegistration("gb.summary", SummaryPage.class, params));
//			pages.put("overview", new PageRegistration("gb.overview", OverviewPage.class, params));
			pages.put("reflog", new PageRegistration("gb.reflog", ReflogPage.class, params));
		}
		pages.put("commits", new PageRegistration("gb.commits", LogPage.class, params));
		pages.put("tree", new PageRegistration("gb.tree", TreePage.class, params));
		if (app().tickets().isReady() && (app().tickets().isAcceptingNewTickets(getRepositoryModel()) || app().tickets().hasTickets(getRepositoryModel()))) {
			PageParameters tParams = new PageParameters(params);
			for (String state : TicketsPage.openStatii) {
				tParams.add(Lucene.status.name(), state);
			}
			pages.put("tickets", new PageRegistration("gb.tickets", TicketsPage.class, tParams));
		}
		pages.put("docs", new PageRegistration("gb.docs", DocsPage.class, params, true));
		if (app().settings().getBoolean(Keys.web.allowForking, true)) {
			pages.put("forks", new PageRegistration("gb.forks", ForksPage.class, params, true));
		}
		pages.put("compare", new PageRegistration("gb.compare", ComparePage.class, params, true));

		// conditional links
		// per-repository extra page links
		if (JGitUtils.getPagesBranch(r) != null) {
			OtherPageLink pagesLink = new OtherPageLink("gb.pages", PagesServlet.asLink(
					getRequest().getRelativePathPrefixToContextRoot(), repositoryName, null), true);
			pages.put("pages", pagesLink);
		}

		// Conditionally add edit link
		showAdmin = false;
		if (app().settings().getBoolean(Keys.web.authenticateAdminPages, true)) {
			boolean allowAdmin = app().settings().getBoolean(Keys.web.allowAdministration, false);
			showAdmin = allowAdmin && GitBlitWebSession.get().canAdmin();
		} else {
			showAdmin = app().settings().getBoolean(Keys.web.allowAdministration, false);
		}
		isOwner = GitBlitWebSession.get().isLoggedIn()
				&& (model.isOwner(GitBlitWebSession.get()
						.getUsername()));
		return pages;
	}

	protected boolean allowForkControls() {
		return app().settings().getBoolean(Keys.web.allowForking, true);
	}

	@Override
	protected void setupPage(String repositoryName, String pageName) {
		String projectName = StringUtils.getFirstPathElement(repositoryName);
		ProjectModel project = app().projects().getProjectModel(projectName);
		if (project.isUserProject()) {
			// user-as-project
			add(new LinkPanel("projectTitle", null, project.getDisplayName(),
					UserPage.class, WicketUtils.newUsernameParameter(project.name.substring(1))));
		} else {
			// project
			add(new LinkPanel("projectTitle", null, project.name,
					ProjectPage.class, WicketUtils.newProjectParameter(project.name)));
		}

		String name = StringUtils.stripDotGit(repositoryName);
		if (!StringUtils.isEmpty(projectName) && name.startsWith(projectName)) {
			name = name.substring(projectName.length() + 1);
		}
		add(new LinkPanel("repositoryName", null, name, SummaryPage.class,
				WicketUtils.newRepositoryParameter(repositoryName)));

		UserModel user = GitBlitWebSession.get().getUser();
		if (user == null) {
			user = UserModel.ANONYMOUS;
		}

		// indicate origin repository
		RepositoryModel model = getRepositoryModel();
		if (StringUtils.isEmpty(model.originRepository)) {
			if (model.isMirror) {
				Fragment mirrorFrag = new Fragment("originRepository", "mirrorFragment", this);
				Label lbl = new Label("originRepository", MessageFormat.format(getString("gb.mirrorOf"), "<b>" + model.origin + "</b>"));
				mirrorFrag.add(lbl.setEscapeModelStrings(false));
				add(mirrorFrag);
			} else {
				add(new Label("originRepository").setVisible(false));
			}
		} else {
			RepositoryModel origin = app().repositories().getRepositoryModel(model.originRepository);
			if (origin == null) {
				// no origin repository
				add(new Label("originRepository").setVisible(false));
			} else if (!user.canView(origin)) {
				// show origin repository without link
				Fragment forkFrag = new Fragment("originRepository", "originFragment", this);
				forkFrag.add(new Label("originRepository", StringUtils.stripDotGit(model.originRepository)));
				add(forkFrag);
			} else {
				// link to origin repository
				Fragment forkFrag = new Fragment("originRepository", "originFragment", this);
				forkFrag.add(new LinkPanel("originRepository", null, StringUtils.stripDotGit(model.originRepository),
						SummaryPage.class, WicketUtils.newRepositoryParameter(model.originRepository)));
				add(forkFrag);
			}
		}

		// new ticket button
		if (user.isAuthenticated && app().tickets().isAcceptingNewTickets(getRepositoryModel())) {
			String newTicketUrl = getRequestCycle().urlFor(NewTicketPage.class, WicketUtils.newRepositoryParameter(repositoryName)).toString();
			addToolbarButton("newTicketLink", "fa fa-ticket", getString("gb.new"), newTicketUrl);
		} else {
			add(new Label("newTicketLink").setVisible(false));
		}

		// (un)star link allows a user to star a repository
		if (user.isAuthenticated) {
			PageParameters starParams = DeepCopier.copy(getPageParameters());
			starParams.put(PARAM_STAR, !user.getPreferences().isStarredRepository(model.name));
			String toggleStarUrl = getRequestCycle().urlFor(getClass(), starParams).toString();
			if (user.getPreferences().isStarredRepository(model.name)) {
				// show unstar button
				add(new Label("starLink").setVisible(false));
				addToolbarButton("unstarLink", "icon-star-empty", getString("gb.unstar"), toggleStarUrl);
			} else {
				// show star button
				addToolbarButton("starLink", "icon-star", getString("gb.star"), toggleStarUrl);
				add(new Label("unstarLink").setVisible(false));
			}
		} else {
			// anonymous user
			add(new Label("starLink").setVisible(false));
			add(new Label("unstarLink").setVisible(false));
		}

		// fork controls
		if (!allowForkControls() || !user.isAuthenticated) {
			// must be logged-in to fork, hide all fork controls
			add(new ExternalLink("forkLink", "").setVisible(false));
			add(new ExternalLink("myForkLink", "").setVisible(false));
		} else {
			String fork = app().repositories().getFork(user.username, model.name);
			boolean hasFork = fork != null;
			boolean canFork = user.canFork(model);

			if (hasFork || !canFork) {
				// user not allowed to fork or fork already exists or repo forbids forking
				add(new ExternalLink("forkLink", "").setVisible(false));

				if (hasFork && !fork.equals(model.name)) {
					// user has fork, view my fork link
					String url = getRequestCycle().urlFor(SummaryPage.class, WicketUtils.newRepositoryParameter(fork)).toString();
					add(new ExternalLink("myForkLink", url));
				} else {
					// no fork, hide view my fork link
					add(new ExternalLink("myForkLink", "").setVisible(false));
				}
			} else if (canFork) {
				// can fork and we do not have one
				add(new ExternalLink("myForkLink", "").setVisible(false));
				String url = getRequestCycle().urlFor(ForkPage.class, WicketUtils.newRepositoryParameter(model.name)).toString();
				add(new ExternalLink("forkLink", url));
			}
		}

		if (showAdmin || isOwner) {
			String url = getRequestCycle().urlFor(EditRepositoryPage.class, WicketUtils.newRepositoryParameter(model.name)).toString();
			add(new ExternalLink("editLink", url));
		} else {
			add(new Label("editLink").setVisible(false));
		}

		super.setupPage(repositoryName, pageName);
	}

	protected void addToolbarButton(String wicketId, String iconClass, String label, String url) {
		Fragment button = new Fragment(wicketId, "toolbarLinkFragment", this);
		Label icon = new Label("icon");
		WicketUtils.setCssClass(icon, iconClass);
		button.add(icon);
		button.add(new Label("label", label));
		button.add(new SimpleAttributeModifier("href", url));
		add(button);
	}

	protected void addSyndicationDiscoveryLink() {
		add(WicketUtils.syndicationDiscoveryLink(SyndicationServlet.getTitle(repositoryName,
				objectId), SyndicationServlet.asLink(getRequest()
				.getRelativePathPrefixToContextRoot(), repositoryName, objectId, 0)));
	}

	protected Repository getRepository() {
		if (r == null) {
			Repository r = app().repositories().getRepository(repositoryName);
			if (r == null) {
				error(getString("gb.canNotLoadRepository") + " " + repositoryName, true);
				return null;
			}
			this.r = r;
		}
		return r;
	}

	protected RepositoryModel getRepositoryModel() {
		if (m == null) {
			RepositoryModel model = app().repositories().getRepositoryModel(
					GitBlitWebSession.get().getUser(), repositoryName);
			if (model == null) {
				if (app().repositories().hasRepository(repositoryName, true)) {
					// has repository, but unauthorized
					authenticationError(getString("gb.unauthorizedAccessForRepository") + " " + repositoryName);
				} else {
					// does not have repository
					error(getString("gb.canNotLoadRepository") + " " + repositoryName, true);
				}
				return null;
			}
			m = model;
		}
		return m;
	}

	protected RevCommit getCommit() {
		RevCommit commit = JGitUtils.getCommit(r, objectId);
		if (commit == null) {
			error(MessageFormat.format(getString("gb.failedToFindCommit"),
					objectId, repositoryName, getPageName()), null, LogPage.class,
					WicketUtils.newRepositoryParameter(repositoryName));
		}
		getSubmodules(commit);
		return commit;
	}

	protected String getBestCommitId(RevCommit commit) {
		String head = null;
		try {
			head = r.resolve(getRepositoryModel().HEAD).getName();
		} catch (Exception e) {
		}

		String id = commit.getName();
		if (!StringUtils.isEmpty(head) && head.equals(id)) {
			// match default branch
			return Repository.shortenRefName(getRepositoryModel().HEAD);
		}

		// find first branch match
		for (RefModel ref : JGitUtils.getLocalBranches(r, false, -1)) {
			if (ref.getObjectId().getName().equals(id)) {
				return ref.getName();
			}
		}

		// return sha
		return id;
	}

	protected Map<String, SubmoduleModel> getSubmodules(RevCommit commit) {
		if (submodules == null) {
			submodules = new HashMap<String, SubmoduleModel>();
			for (SubmoduleModel model : JGitUtils.getSubmodules(r, commit.getTree())) {
				submodules.put(model.path, model);
			}
		}
		return submodules;
	}

	protected SubmoduleModel getSubmodule(String path) {
		SubmoduleModel model = null;
		if (submodules != null) {
			model = submodules.get(path);
		}
		if (model == null) {
			// undefined submodule?!
			model = new SubmoduleModel(path.substring(path.lastIndexOf('/') + 1), path, path);
			model.hasSubmodule = false;
			model.gitblitPath = model.name;
			return model;
		} else {
			// extract the repository name from the clone url
			List<String> patterns = app().settings().getStrings(Keys.git.submoduleUrlPatterns);
			String submoduleName = StringUtils.extractRepositoryPath(model.url, patterns.toArray(new String[0]));

			// determine the current path for constructing paths relative
			// to the current repository
			String currentPath = "";
			if (repositoryName.indexOf('/') > -1) {
				currentPath = repositoryName.substring(0, repositoryName.lastIndexOf('/') + 1);
			}

			// try to locate the submodule repository
			// prefer bare to non-bare names
			List<String> candidates = new ArrayList<String>();

			// relative
			candidates.add(currentPath + StringUtils.stripDotGit(submoduleName));
			candidates.add(candidates.get(candidates.size() - 1) + ".git");

			// relative, no subfolder
			if (submoduleName.lastIndexOf('/') > -1) {
				String name = submoduleName.substring(submoduleName.lastIndexOf('/') + 1);
				candidates.add(currentPath + StringUtils.stripDotGit(name));
				candidates.add(candidates.get(candidates.size() - 1) + ".git");
			}

			// absolute
			candidates.add(StringUtils.stripDotGit(submoduleName));
			candidates.add(candidates.get(candidates.size() - 1) + ".git");

			// absolute, no subfolder
			if (submoduleName.lastIndexOf('/') > -1) {
				String name = submoduleName.substring(submoduleName.lastIndexOf('/') + 1);
				candidates.add(StringUtils.stripDotGit(name));
				candidates.add(candidates.get(candidates.size() - 1) + ".git");
			}

			// create a unique, ordered set of candidate paths
			Set<String> paths = new LinkedHashSet<String>(candidates);
			for (String candidate : paths) {
				if (app().repositories().hasRepository(candidate)) {
					model.hasSubmodule = true;
					model.gitblitPath = candidate;
					return model;
				}
			}

			// we do not have a copy of the submodule, but we need a path
			model.gitblitPath = candidates.get(0);
			return model;
		}
	}

	protected String getShortObjectId(String objectId) {
		return objectId.substring(0, app().settings().getInteger(Keys.web.shortCommitIdLength, 6));
	}

	protected void addRefs(Repository r, RevCommit c) {
		add(new RefsPanel("refsPanel", repositoryName, c, JGitUtils.getAllRefs(r, getRepositoryModel().showRemoteBranches)));
	}

	protected void addFullText(String wicketId, String text) {
		RepositoryModel model = getRepositoryModel();
		String content = bugtraqProcessor().processCommitMessage(r, model, text);
		String html;
		switch (model.commitMessageRenderer) {
		case MARKDOWN:
			html = MessageFormat.format("<div class='commit_message'>{0}</div>", content);
			break;
		default:
			html = MessageFormat.format("<pre class='commit_message'>{0}</pre>", content);
			break;
		}
		add(new Label(wicketId, html).setEscapeModelStrings(false));
	}

	protected abstract String getPageName();

	protected Component createPersonPanel(String wicketId, PersonIdent identity,
			Constants.SearchType searchType) {
		String name = identity == null ? "" : identity.getName();
		String address = identity == null ? "" : identity.getEmailAddress();
		name = StringUtils.removeNewlines(name);
		address = StringUtils.removeNewlines(address);
		boolean showEmail = app().settings().getBoolean(Keys.web.showEmailAddresses, false);
		if (!showEmail || StringUtils.isEmpty(name) || StringUtils.isEmpty(address)) {
			String value = name;
			if (StringUtils.isEmpty(value)) {
				if (showEmail) {
					value = address;
				} else {
					value = getString("gb.missingUsername");
				}
			}
			Fragment partial = new Fragment(wicketId, "partialPersonIdent", this);
			LinkPanel link = new LinkPanel("personName", "list", value, GitSearchPage.class,
					WicketUtils.newSearchParameter(repositoryName, objectId, value, searchType));
			setPersonSearchTooltip(link, value, searchType);
			partial.add(link);
			return partial;
		} else {
			Fragment fullPerson = new Fragment(wicketId, "fullPersonIdent", this);
			LinkPanel nameLink = new LinkPanel("personName", "list", name, GitSearchPage.class,
					WicketUtils.newSearchParameter(repositoryName, objectId, name, searchType));
			setPersonSearchTooltip(nameLink, name, searchType);
			fullPerson.add(nameLink);

			LinkPanel addressLink = new LinkPanel("personAddress", "hidden-phone list", "<" + address + ">",
					GitSearchPage.class, WicketUtils.newSearchParameter(repositoryName, objectId,
							address, searchType));
			setPersonSearchTooltip(addressLink, address, searchType);
			fullPerson.add(addressLink);
			return fullPerson;
		}
	}

	protected void setPersonSearchTooltip(Component component, String value,
			Constants.SearchType searchType) {
		if (searchType.equals(Constants.SearchType.AUTHOR)) {
			WicketUtils.setHtmlTooltip(component, getString("gb.searchForAuthor") + " " + value);
		} else if (searchType.equals(Constants.SearchType.COMMITTER)) {
			WicketUtils.setHtmlTooltip(component, getString("gb.searchForCommitter") + " " + value);
		}
	}

	protected void setChangeTypeTooltip(Component container, ChangeType type) {
		switch (type) {
		case ADD:
			WicketUtils.setHtmlTooltip(container, getString("gb.addition"));
			break;
		case COPY:
		case RENAME:
			WicketUtils.setHtmlTooltip(container, getString("gb.rename"));
			break;
		case DELETE:
			WicketUtils.setHtmlTooltip(container, getString("gb.deletion"));
			break;
		case MODIFY:
			WicketUtils.setHtmlTooltip(container, getString("gb.modification"));
			break;
		}
	}

	@Override
	protected void onBeforeRender() {
		// dispose of repository object
		if (r != null) {
			r.close();
			r = null;
		}
		// setup page header and footer
		setupPage(repositoryName, "/ " + getPageName());
		super.onBeforeRender();
	}

	@Override
	protected void setLastModified() {
		if (getClass().isAnnotationPresent(CacheControl.class)) {
			CacheControl cacheControl = getClass().getAnnotation(CacheControl.class);
			switch (cacheControl.value()) {
			case REPOSITORY:
				RepositoryModel repository = getRepositoryModel();
				if (repository != null) {
					setLastModified(repository.lastChange);
				}
				break;
			case COMMIT:
				RevCommit commit = getCommit();
				if (commit != null) {
					Date commitDate = JGitUtils.getCommitDate(commit);
					setLastModified(commitDate);
				}
				break;
			default:
				super.setLastModified();
			}
		}
	}

	protected PageParameters newRepositoryParameter() {
		return WicketUtils.newRepositoryParameter(repositoryName);
	}

	protected PageParameters newCommitParameter() {
		return WicketUtils.newObjectParameter(repositoryName, objectId);
	}

	protected PageParameters newCommitParameter(String commitId) {
		return WicketUtils.newObjectParameter(repositoryName, commitId);
	}

	public boolean isShowAdmin() {
		return showAdmin;
	}

	public boolean isOwner() {
		return isOwner;
	}

	private class SearchForm extends SessionlessForm<Void> implements Serializable {
		private static final long serialVersionUID = 1L;

		private final String repositoryName;

		private final IModel<String> searchBoxModel = new Model<String>("");

		private final IModel<Constants.SearchType> searchTypeModel = new Model<Constants.SearchType>(
				Constants.SearchType.COMMIT);

		public SearchForm(String id, String repositoryName) {
			super(id, RepositoryPage.this.getClass(), RepositoryPage.this.getPageParameters());
			this.repositoryName = repositoryName;
			DropDownChoice<Constants.SearchType> searchType = new DropDownChoice<Constants.SearchType>(
					"searchType", Arrays.asList(Constants.SearchType.values()));
			searchType.setModel(searchTypeModel);
			add(searchType.setVisible(app().settings().getBoolean(Keys.web.showSearchTypeSelection, false)));
			TextField<String> searchBox = new TextField<String>("searchBox", searchBoxModel);
			add(searchBox);
		}

		void setTranslatedAttributes() {
			WicketUtils.setHtmlTooltip(get("searchType"), getString("gb.searchTypeTooltip"));
			WicketUtils.setHtmlTooltip(get("searchBox"),
					MessageFormat.format(getString("gb.searchTooltip"), repositoryName));
			WicketUtils.setInputPlaceholder(get("searchBox"), getString("gb.search"));
		}

		@Override
		public void onSubmit() {
			Constants.SearchType searchType = searchTypeModel.getObject();
			String searchString = searchBoxModel.getObject();
			if (StringUtils.isEmpty(searchString)) {
				// redirect to self to avoid wicket page update bug
				String absoluteUrl = getCanonicalUrl();
				getRequestCycle().setRequestTarget(new RedirectRequestTarget(absoluteUrl));
				return;
			}
			for (Constants.SearchType type : Constants.SearchType.values()) {
				if (searchString.toLowerCase().startsWith(type.name().toLowerCase() + ":")) {
					searchType = type;
					searchString = searchString.substring(type.name().toLowerCase().length() + 1)
							.trim();
					break;
				}
			}
			Class<? extends BasePage> searchPageClass = GitSearchPage.class;
			RepositoryModel model = app().repositories().getRepositoryModel(repositoryName);
			if (app().settings().getBoolean(Keys.web.allowLuceneIndexing, true)
					&& !ArrayUtils.isEmpty(model.indexedBranches)) {
				// this repository is Lucene-indexed
				searchPageClass = LuceneSearchPage.class;
			}
			// use an absolute url to workaround Wicket-Tomcat problems with
			// mounted url parameters (issue-111)
			PageParameters params = WicketUtils.newSearchParameter(repositoryName, null, searchString, searchType);
			String absoluteUrl = getCanonicalUrl(searchPageClass, params);
			getRequestCycle().setRequestTarget(new RedirectRequestTarget(absoluteUrl));
		}
	}
}
url_web_start, url_web_end, 0}, /* Likely emails */ {"@", "mailto://", url_email_start, url_email_end, 0}}; struct rspamd_url_flag_name { const gchar *name; gint flag; gint hash; } url_flag_names[] = { {"phished", RSPAMD_URL_FLAG_PHISHED, -1}, {"numeric", RSPAMD_URL_FLAG_NUMERIC, -1}, {"obscured", RSPAMD_URL_FLAG_OBSCURED, -1}, {"redirected", RSPAMD_URL_FLAG_REDIRECTED, -1}, {"html_displayed", RSPAMD_URL_FLAG_HTML_DISPLAYED, -1}, {"text", RSPAMD_URL_FLAG_FROM_TEXT, -1}, {"subject", RSPAMD_URL_FLAG_SUBJECT, -1}, {"host_encoded", RSPAMD_URL_FLAG_HOSTENCODED, -1}, {"schema_encoded", RSPAMD_URL_FLAG_SCHEMAENCODED, -1}, {"path_encoded", RSPAMD_URL_FLAG_PATHENCODED, -1}, {"query_encoded", RSPAMD_URL_FLAG_QUERYENCODED, -1}, {"missing_slashes", RSPAMD_URL_FLAG_MISSINGSLASHES, -1}, {"idn", RSPAMD_URL_FLAG_IDN, -1}, {"has_port", RSPAMD_URL_FLAG_HAS_PORT, -1}, {"has_user", RSPAMD_URL_FLAG_HAS_USER, -1}, {"schemaless", RSPAMD_URL_FLAG_SCHEMALESS, -1}, {"unnormalised", RSPAMD_URL_FLAG_UNNORMALISED, -1}, {"zw_spaces", RSPAMD_URL_FLAG_ZW_SPACES, -1}, {"url_displayed", RSPAMD_URL_FLAG_DISPLAY_URL, -1}, {"image", RSPAMD_URL_FLAG_IMAGE, -1}, {"query", RSPAMD_URL_FLAG_QUERY, -1}, {"content", RSPAMD_URL_FLAG_CONTENT, -1}, {"no_tld", RSPAMD_URL_FLAG_NO_TLD, -1}, {"truncated", RSPAMD_URL_FLAG_TRUNCATED, -1}, {"redirect_target", RSPAMD_URL_FLAG_REDIRECT_TARGET, -1}, {"invisible", RSPAMD_URL_FLAG_INVISIBLE, -1}, {"special", RSPAMD_URL_FLAG_SPECIAL, -1}, }; static inline khint_t rspamd_url_hash(struct rspamd_url *u); static inline khint_t rspamd_url_host_hash(struct rspamd_url *u); static inline bool rspamd_urls_cmp(struct rspamd_url *a, struct rspamd_url *b); static inline bool rspamd_urls_host_cmp(struct rspamd_url *a, struct rspamd_url *b); /* Hash table implementation */ __KHASH_IMPL(rspamd_url_hash, kh_inline, struct rspamd_url *, char, false, rspamd_url_hash, rspamd_urls_cmp); __KHASH_IMPL(rspamd_url_host_hash, kh_inline, struct rspamd_url *, char, false, rspamd_url_host_hash, rspamd_urls_host_cmp); struct url_callback_data { const gchar *begin; gchar *url_str; rspamd_mempool_t *pool; gint len; enum rspamd_url_find_type how; gboolean prefix_added; guint newline_idx; GArray *matchers; GPtrArray *newlines; const gchar *start; const gchar *fin; const gchar *end; const gchar *last_at; url_insert_function func; void *funcd; }; struct url_match_scanner { GArray *matchers_full; GArray *matchers_strict; struct rspamd_multipattern *search_trie_full; struct rspamd_multipattern *search_trie_strict; bool has_tld_file; }; struct url_match_scanner *url_scanner = NULL; enum { IS_LWSP = (1 << 0), IS_DOMAIN = (1 << 1), IS_URLSAFE = (1 << 2), IS_MAILSAFE = (1 << 3), IS_DOMAIN_END = (1 << 4) }; static const unsigned int url_scanner_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP /* */, IS_MAILSAFE /* ! */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* " */, IS_MAILSAFE /* # */, IS_MAILSAFE /* $ */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* % */, 0 /* & */, IS_MAILSAFE /* ' */, 0 /* ( */, 0 /* ) */, IS_MAILSAFE /* * */, IS_MAILSAFE /* + */, IS_MAILSAFE /* , */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* - */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* . */, IS_DOMAIN_END | IS_MAILSAFE /* / */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 0 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 1 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 2 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 3 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 4 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 5 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 6 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 7 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 8 */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 9 */, IS_DOMAIN_END /* : */, 0 /* ; */, IS_URLSAFE | IS_DOMAIN_END /* < */, 0 /* = */, IS_URLSAFE | IS_DOMAIN_END /* > */, IS_DOMAIN_END /* ? */, 0 /* @ */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* A */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* B */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* C */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* D */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* E */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* F */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* G */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* H */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* I */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* J */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* K */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* L */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* M */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* N */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* O */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* P */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Q */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* R */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* S */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* T */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* U */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* V */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* W */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* X */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Y */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Z */, 0 /* [ */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* \ */, 0 /* ] */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* ^ */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* _ */, IS_URLSAFE | IS_DOMAIN_END /* ` */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* a */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* b */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* c */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* d */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* e */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* f */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* g */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* h */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* i */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* j */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* k */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* l */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* m */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* n */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* o */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* p */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* q */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* r */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* s */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* t */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* u */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* v */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* w */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* x */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* y */, IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* z */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* { */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* | */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* } */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* ~ */, 0, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN}; #define is_lwsp(x) ((url_scanner_table[(guchar) (x)] & IS_LWSP) != 0) #define is_mailsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_MAILSAFE)) != 0) #define is_domain(x) ((url_scanner_table[(guchar) (x)] & IS_DOMAIN) != 0) #define is_urlsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_URLSAFE)) != 0) const gchar * rspamd_url_strerror(int err) { switch (err) { case URI_ERRNO_OK: return "Parsing went well"; case URI_ERRNO_EMPTY: return "The URI string was empty"; case URI_ERRNO_INVALID_PROTOCOL: return "No protocol was found"; case URI_ERRNO_BAD_FORMAT: return "Bad URL format"; case URI_ERRNO_BAD_ENCODING: return "Invalid symbols encoded"; case URI_ERRNO_INVALID_PORT: return "Port number is bad"; case URI_ERRNO_TLD_MISSING: return "TLD part is not detected"; case URI_ERRNO_HOST_MISSING: return "Host part is missing"; case URI_ERRNO_TOO_LONG: return "URL is too long"; } return NULL; } static gboolean rspamd_url_parse_tld_file(const gchar *fname, struct url_match_scanner *scanner) { FILE *f; struct url_matcher m; gchar *linebuf = NULL, *p; gsize buflen = 0; gssize r; gint flags; f = fopen(fname, "r"); if (f == NULL) { msg_err("cannot open TLD file %s: %s", fname, strerror(errno)); return FALSE; } m.end = url_tld_end; m.start = url_tld_start; m.prefix = "http://"; while ((r = getline(&linebuf, &buflen, f)) > 0) { if (linebuf[0] == '/' || g_ascii_isspace(linebuf[0])) { /* Skip comment or empty line */ continue; } g_strchomp(linebuf); /* TODO: add support for ! patterns */ if (linebuf[0] == '!') { msg_debug("skip '!' patterns from parsing for now: %s", linebuf); continue; } flags = URL_FLAG_NOHTML | URL_FLAG_TLD_MATCH; if (linebuf[0] == '*') { flags |= URL_FLAG_STAR_MATCH; p = strchr(linebuf, '.'); if (p == NULL) { msg_err("got bad star line, skip it: %s", linebuf); continue; } p++; } else { p = linebuf; } m.flags = flags; rspamd_multipattern_add_pattern(url_scanner->search_trie_full, p, RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); m.pattern = rspamd_multipattern_get_pattern(url_scanner->search_trie_full, rspamd_multipattern_get_npatterns(url_scanner->search_trie_full) - 1); g_array_append_val(url_scanner->matchers_full, m); } free(linebuf); fclose(f); return TRUE; } static void rspamd_url_add_static_matchers(struct url_match_scanner *sc) { gint n = G_N_ELEMENTS(static_matchers), i; for (i = 0; i < n; i++) { if (static_matchers[i].flags & URL_FLAG_REGEXP) { rspamd_multipattern_add_pattern(url_scanner->search_trie_strict, static_matchers[i].pattern, RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | RSPAMD_MULTIPATTERN_RE); } else { rspamd_multipattern_add_pattern(url_scanner->search_trie_strict, static_matchers[i].pattern, RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); } } g_array_append_vals(sc->matchers_strict, static_matchers, n); if (sc->matchers_full) { for (i = 0; i < n; i++) { if (static_matchers[i].flags & URL_FLAG_REGEXP) { rspamd_multipattern_add_pattern(url_scanner->search_trie_full, static_matchers[i].pattern, RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | RSPAMD_MULTIPATTERN_RE); } else { rspamd_multipattern_add_pattern(url_scanner->search_trie_full, static_matchers[i].pattern, RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); } } g_array_append_vals(sc->matchers_full, static_matchers, n); } } void rspamd_url_deinit(void) { if (url_scanner != NULL) { if (url_scanner->search_trie_full) { rspamd_multipattern_destroy(url_scanner->search_trie_full); g_array_free(url_scanner->matchers_full, TRUE); } rspamd_multipattern_destroy(url_scanner->search_trie_strict); g_array_free(url_scanner->matchers_strict, TRUE); g_free(url_scanner); url_scanner = NULL; } } void rspamd_url_init(const gchar *tld_file) { GError *err = NULL; gboolean ret = TRUE; if (url_scanner != NULL) { rspamd_url_deinit(); } url_scanner = g_malloc(sizeof(struct url_match_scanner)); url_scanner->matchers_strict = g_array_sized_new(FALSE, TRUE, sizeof(struct url_matcher), G_N_ELEMENTS(static_matchers)); url_scanner->search_trie_strict = rspamd_multipattern_create_sized( G_N_ELEMENTS(static_matchers), RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); if (tld_file) { /* Reserve larger multipattern */ url_scanner->matchers_full = g_array_sized_new(FALSE, TRUE, sizeof(struct url_matcher), 13000); url_scanner->search_trie_full = rspamd_multipattern_create_sized(13000, RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); url_scanner->has_tld_file = true; } else { url_scanner->matchers_full = NULL; url_scanner->search_trie_full = NULL; url_scanner->has_tld_file = false; } rspamd_url_add_static_matchers(url_scanner); if (tld_file != NULL) { ret = rspamd_url_parse_tld_file(tld_file, url_scanner); } if (url_scanner->matchers_full && url_scanner->matchers_full->len > 1000) { msg_info("start compiling of %d TLD suffixes; it might take a long time", url_scanner->matchers_full->len); } if (!rspamd_multipattern_compile(url_scanner->search_trie_strict, &err)) { msg_err("cannot compile url matcher static patterns, fatal error: %e", err); abort(); } if (url_scanner->search_trie_full) { if (!rspamd_multipattern_compile(url_scanner->search_trie_full, &err)) { msg_err("cannot compile tld patterns, url matching will be " "incomplete: %e", err); g_error_free(err); ret = FALSE; } } if (tld_file != NULL) { if (ret) { msg_info("initialized %ud url match suffixes from '%s'", url_scanner->matchers_full->len - url_scanner->matchers_strict->len, tld_file); } else { msg_err("failed to initialize url tld suffixes from '%s', " "use %ud internal match suffixes", tld_file, url_scanner->matchers_strict->len); } } /* Generate hashes for flags */ for (gint i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { url_flag_names[i].hash = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, url_flag_names[i].name, strlen(url_flag_names[i].name), 0); } /* Ensure that we have no hashes collisions O(N^2) but this array is small */ for (gint i = 0; i < G_N_ELEMENTS(url_flag_names) - 1; i++) { for (gint j = i + 1; j < G_N_ELEMENTS(url_flag_names); j++) { if (url_flag_names[i].hash == url_flag_names[j].hash) { msg_err("collision: both %s and %s map to %d", url_flag_names[i].name, url_flag_names[j].name, url_flag_names[i].hash); abort(); } } } } #define SET_U(u, field) \ do { \ if ((u) != NULL) { \ (u)->field_set |= 1 << (field); \ (u)->field_data[(field)].len = p - c; \ (u)->field_data[(field)].off = c - str; \ } \ } while (0) static bool is_url_start(gchar c) { if (c == '(' || c == '{' || c == '[' || c == '<' || c == '\'') { return TRUE; } return FALSE; } static bool is_url_end(gchar c) { if (c == ')' || c == '}' || c == ']' || c == '>' || c == '\'') { return TRUE; } return FALSE; } static bool is_domain_start(int p) { if (g_ascii_isalnum(p) || p == '[' || p == '%' || p == '_' || (p & 0x80)) { return TRUE; } return FALSE; } static const guint max_domain_length = 253; static const guint max_dns_label = 63; static const guint max_email_user = 64; static gint rspamd_mailto_parse(struct http_parser_url *u, const gchar *str, gsize len, gchar const **end, enum rspamd_url_parse_flags parse_flags, guint *flags) { const gchar *p = str, *c = str, *last = str + len; gchar t; gint ret = 1; enum { parse_mailto, parse_slash, parse_slash_slash, parse_semicolon, parse_prefix_question, parse_destination, parse_equal, parse_user, parse_at, parse_domain, parse_suffix_question, parse_query } st = parse_mailto; if (u != NULL) { memset(u, 0, sizeof(*u)); } while (p < last) { t = *p; if (p - str > max_email_user + max_domain_length + 1) { goto out; } switch (st) { case parse_mailto: if (t == ':') { st = parse_semicolon; SET_U(u, UF_SCHEMA); } p++; break; case parse_semicolon: if (t == '/' || t == '\\') { st = parse_slash; p++; } else { *flags |= RSPAMD_URL_FLAG_MISSINGSLASHES; st = parse_slash_slash; } break; case parse_slash: if (t == '/' || t == '\\') { st = parse_slash_slash; } else { goto out; } p++; break; case parse_slash_slash: if (t == '?') { st = parse_prefix_question; p++; } else if (t != '/' && t != '\\') { c = p; st = parse_user; } else { /* Skip multiple slashes */ p++; } break; case parse_prefix_question: if (t == 't') { /* XXX: accept only to= */ st = parse_destination; } else { goto out; } break; case parse_destination: if (t == '=') { st = parse_equal; } p++; break; case parse_equal: c = p; st = parse_user; break; case parse_user: if (t == '@') { if (p - c == 0) { goto out; } SET_U(u, UF_USERINFO); st = parse_at; } else if (!is_mailsafe(t)) { goto out; } else if (p - c > max_email_user) { goto out; } p++; break; case parse_at: c = p; st = parse_domain; break; case parse_domain: if (t == '?') { SET_U(u, UF_HOST); st = parse_suffix_question; } else if (!is_domain(t) && t != '.' && t != '_') { goto out; } else if (p - c > max_domain_length) { goto out; } p++; break; case parse_suffix_question: c = p; st = parse_query; break; case parse_query: if (t == '#') { if (p - c != 0) { SET_U(u, UF_QUERY); } c = p + 1; ret = 0; goto out; } else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { ret = 0; goto out; } else if (is_lwsp(t)) { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace(t)) { ret = 0; } goto out; } else { goto out; } } p++; break; } } if (st == parse_domain) { if (p - c != 0) { SET_U(u, UF_HOST); ret = 0; } } else if (st == parse_query) { if (p - c > 0) { SET_U(u, UF_QUERY); } ret = 0; } out: if (end != NULL) { *end = p; } if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { return 0; } return ret; } static gint rspamd_telephone_parse(struct http_parser_url *u, const gchar *str, gsize len, gchar const **end, enum rspamd_url_parse_flags parse_flags, guint *flags) { enum { parse_protocol, parse_semicolon, parse_slash, parse_slash_slash, parse_spaces, parse_plus, parse_phone_start, parse_phone, } st = parse_protocol; const gchar *p = str, *c = str, *last = str + len; gchar t; gint ret = 1, i; UChar32 uc; if (u != NULL) { memset(u, 0, sizeof(*u)); } while (p < last) { t = *p; if (p - str > max_email_user) { goto out; } switch (st) { case parse_protocol: if (t == ':') { st = parse_semicolon; SET_U(u, UF_SCHEMA); } p++; break; case parse_semicolon: if (t == '/' || t == '\\') { st = parse_slash; p++; } else { st = parse_slash_slash; } break; case parse_slash: if (t == '/' || t == '\\') { st = parse_slash_slash; } else { goto out; } p++; break; case parse_slash_slash: if (g_ascii_isspace(t)) { st = parse_spaces; p++; } else if (t == '+') { c = p; st = parse_plus; } else if (t == '/') { /* Skip multiple slashes */ p++; } else { st = parse_phone_start; c = p; } break; case parse_spaces: if (t == '+') { c = p; st = parse_plus; } else if (!g_ascii_isspace(t)) { st = parse_phone_start; c = p; } else { p++; } break; case parse_plus: c = p; p++; st = parse_phone_start; break; case parse_phone_start: if (*p == '%' || *p == '(' || g_ascii_isdigit(*p)) { st = parse_phone; p++; } else { goto out; } break; case parse_phone: i = p - str; U8_NEXT(str, i, len, uc); p = str + i; if (u_isdigit(uc) || uc == '(' || uc == ')' || uc == '[' || uc == ']' || u_isspace(uc) || uc == '%') { /* p is already incremented by U8_NEXT! */ } else if (uc <= 0 || is_url_end(uc)) { ret = 0; goto set; } break; } } set: if (st == parse_phone) { if (p - c != 0) { SET_U(u, UF_HOST); ret = 0; } } out: if (end != NULL) { *end = p; } if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { return 0; } return ret; } static gint rspamd_web_parse(struct http_parser_url *u, const gchar *str, gsize len, gchar const **end, enum rspamd_url_parse_flags parse_flags, guint *flags) { const gchar *p = str, *c = str, *last = str + len, *slash = NULL, *password_start = NULL, *user_start = NULL; gchar t = 0; UChar32 uc; glong pt; gint ret = 1; gboolean user_seen = FALSE; enum { parse_protocol, parse_slash, parse_slash_slash, parse_semicolon, parse_user, parse_at, parse_multiple_at, parse_password_start, parse_password, parse_domain_start, parse_domain, parse_ipv6, parse_port_password, parse_port, parse_suffix_slash, parse_path, parse_query, parse_part } st = parse_protocol; if (u != NULL) { memset(u, 0, sizeof(*u)); } while (p < last) { t = *p; switch (st) { case parse_protocol: if (t == ':') { st = parse_semicolon; SET_U(u, UF_SCHEMA); } else if (!g_ascii_isalnum(t) && t != '+' && t != '-') { if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) { /* We might have some domain, but no protocol */ st = parse_domain_start; p = c; slash = c; break; } else { goto out; } } p++; break; case parse_semicolon: if (t == '/' || t == '\\') { st = parse_slash; p++; } else { st = parse_slash_slash; *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES; } break; case parse_slash: if (t == '/' || t == '\\') { st = parse_slash_slash; } else { goto out; } p++; break; case parse_slash_slash: if (t != '/' && t != '\\') { c = p; slash = p; st = parse_domain_start; /* * Unfortunately, due to brain damage of the RFC 3986 authors, * we have to distinguish two possibilities here: * authority = [ userinfo "@" ] host [ ":" port ] * So if we have @ somewhere before hostname then we must process * with the username state. Otherwise, we have to process via * the hostname state. Unfortunately, there is no way to distinguish * them aside of running NFA or two DFA or performing lookahead. * Lookahead approach looks easier to implement. */ const char *tp = p; while (tp < last) { if (*tp == '@') { user_seen = TRUE; st = parse_user; break; } else if (*tp == '/' || *tp == '#' || *tp == '?') { st = parse_domain_start; break; } tp++; } if (st == parse_domain_start && *p == '[') { st = parse_ipv6; p++; c = p; } } else { /* Skip multiple slashes */ p++; } break; case parse_ipv6: if (t == ']') { if (p - c == 0) { goto out; } SET_U(u, UF_HOST); p++; if (*p == ':') { st = parse_port; c = p + 1; } else if (*p == '/' || *p == '\\') { st = parse_path; c = p + 1; } else if (*p == '?') { st = parse_query; c = p + 1; } else if (*p == '#') { st = parse_part; c = p + 1; } else if (p != last) { goto out; } } else if (!g_ascii_isxdigit(t) && t != ':' && t != '.') { goto out; } p++; break; case parse_user: if (t == ':') { if (p - c == 0) { goto out; } user_start = c; st = parse_password_start; } else if (t == '@') { /* No password */ if (p - c == 0) { /* We have multiple at in fact */ st = parse_multiple_at; user_seen = TRUE; *flags |= RSPAMD_URL_FLAG_OBSCURED; continue; } SET_U(u, UF_USERINFO); *flags |= RSPAMD_URL_FLAG_HAS_USER; st = parse_at; } else if (!g_ascii_isgraph(t)) { goto out; } else if (p - c > max_email_user) { goto out; } p++; break; case parse_multiple_at: if (t != '@') { if (p - c == 0) { goto out; } /* For now, we ignore all that stuff as it is bogus */ /* Off by one */ p--; SET_U(u, UF_USERINFO); p++; *flags |= RSPAMD_URL_FLAG_HAS_USER; st = parse_at; } else { p++; } break; case parse_password_start: if (t == '@') { /* Empty password */ SET_U(u, UF_USERINFO); if (u != NULL && u->field_data[UF_USERINFO].len > 0) { /* Eat semicolon */ u->field_data[UF_USERINFO].len--; } *flags |= RSPAMD_URL_FLAG_HAS_USER; st = parse_at; } else { c = p; password_start = p; st = parse_password; } p++; break; case parse_password: if (t == '@') { /* XXX: password is not stored */ if (u != NULL) { if (u->field_data[UF_USERINFO].len == 0 && password_start && user_start && password_start > user_start + 1) { *flags |= RSPAMD_URL_FLAG_HAS_USER; u->field_set |= 1u << (UF_USERINFO); u->field_data[UF_USERINFO].len = password_start - user_start - 1; u->field_data[UF_USERINFO].off = user_start - str; } } st = parse_at; } else if (!g_ascii_isgraph(t)) { goto out; } else if (p - c > max_domain_length) { goto out; } p++; break; case parse_at: c = p; if (t == '@') { *flags |= RSPAMD_URL_FLAG_OBSCURED; p++; } else if (t == '[') { st = parse_ipv6; p++; c = p; } else { st = parse_domain_start; } break; case parse_domain_start: if (is_domain_start(t)) { st = parse_domain; } else { goto out; } break; case parse_domain: if (p - c > max_domain_length) { /* Too large domain */ goto out; } if (t == '/' || t == '\\' || t == ':' || t == '?' || t == '#') { if (p - c == 0) { goto out; } if (t == '/' || t == '\\') { SET_U(u, UF_HOST); st = parse_suffix_slash; } else if (t == '?') { SET_U(u, UF_HOST); st = parse_query; c = p + 1; } else if (t == '#') { SET_U(u, UF_HOST); st = parse_part; c = p + 1; } else if (t == ':' && !user_seen) { /* * Here we can have both port and password, hence we need * to apply some heuristic here */ st = parse_port_password; } else { /* * We can go only for parsing port here */ SET_U(u, UF_HOST); st = parse_port; c = p + 1; } p++; } else { if (is_url_end(t) || is_url_start(t)) { goto set; } else if (*p == '@' && !user_seen) { /* We need to fallback and test user */ p = slash; user_seen = TRUE; st = parse_user; } else if (*p != '.' && *p != '-' && *p != '_' && *p != '%') { if (*p & 0x80) { guint i = 0; U8_NEXT(((const guchar *) p), i, last - p, uc); if (uc < 0) { /* Bad utf8 */ goto out; } if (!u_isalnum(uc)) { /* Bad symbol */ if (IS_ZERO_WIDTH_SPACE(uc)) { (*flags) |= RSPAMD_URL_FLAG_ZW_SPACES; } else { if (!u_isgraph(uc)) { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { goto out; } else { goto set; } } } } else { (*flags) |= RSPAMD_URL_FLAG_IDN; } p = p + i; } else if (is_urlsafe(*p)) { p++; } else { if (parse_flags & RSPAMD_URL_PARSE_HREF) { /* We have to use all shit we are given here */ p++; (*flags) |= RSPAMD_URL_FLAG_OBSCURED; } else { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { goto out; } else { goto set; } } } } else { p++; } } break; case parse_port_password: if (g_ascii_isdigit(t)) { const gchar *tmp = p; while (tmp < last) { if (!g_ascii_isdigit(*tmp)) { if (*tmp == '/' || *tmp == '#' || *tmp == '?' || is_url_end(*tmp) || g_ascii_isspace(*tmp)) { /* Port + something */ st = parse_port; c = slash; p--; SET_U(u, UF_HOST); p++; c = p; break; } else { /* Not a port, bad character at the end */ break; } } tmp++; } if (tmp == last) { /* Host + port only */ st = parse_port; c = slash; p--; SET_U(u, UF_HOST); p++; c = p; } if (st != parse_port) { /* Fallback to user:password */ p = slash; c = slash; user_seen = TRUE; st = parse_user; } } else { /* Rewind back */ p = slash; c = slash; user_seen = TRUE; st = parse_user; } break; case parse_port: if (t == '/' || t == '\\') { pt = strtoul(c, NULL, 10); if (pt == 0 || pt > 65535) { goto out; } if (u != NULL) { u->port = pt; *flags |= RSPAMD_URL_FLAG_HAS_PORT; } st = parse_suffix_slash; } else if (t == '?') { pt = strtoul(c, NULL, 10); if (pt == 0 || pt > 65535) { goto out; } if (u != NULL) { u->port = pt; *flags |= RSPAMD_URL_FLAG_HAS_PORT; } c = p + 1; st = parse_query; } else if (t == '#') { pt = strtoul(c, NULL, 10); if (pt == 0 || pt > 65535) { goto out; } if (u != NULL) { u->port = pt; *flags |= RSPAMD_URL_FLAG_HAS_PORT; } c = p + 1; st = parse_part; } else if (is_url_end(t)) { goto set; } else if (!g_ascii_isdigit(t)) { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) || !g_ascii_isspace(t)) { goto out; } else { goto set; } } p++; break; case parse_suffix_slash: if (t != '/' && t != '\\') { c = p; st = parse_path; } else { /* Skip extra slashes */ p++; } break; case parse_path: if (t == '?') { if (p - c != 0) { SET_U(u, UF_PATH); } c = p + 1; st = parse_query; } else if (t == '#') { /* No query, just fragment */ if (p - c != 0) { SET_U(u, UF_PATH); } c = p + 1; st = parse_part; } else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { goto set; } else if (is_lwsp(t)) { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace(t)) { goto set; } goto out; } else { goto set; } } p++; break; case parse_query: if (t == '#') { if (p - c != 0) { SET_U(u, UF_QUERY); } c = p + 1; st = parse_part; } else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { goto set; } else if (is_lwsp(t)) { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace(t)) { goto set; } goto out; } else { goto set; } } p++; break; case parse_part: if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { goto set; } else if (is_lwsp(t)) { if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { if (g_ascii_isspace(t)) { goto set; } goto out; } else { goto set; } } p++; break; } } set: /* Parse remaining */ switch (st) { case parse_domain: if (p - c == 0 || !is_domain(*(p - 1)) || !is_domain(*c)) { goto out; } SET_U(u, UF_HOST); ret = 0; break; case parse_port: pt = strtoul(c, NULL, 10); if (pt == 0 || pt > 65535) { goto out; } if (u != NULL) { u->port = pt; } ret = 0; break; case parse_suffix_slash: /* Url ends with '/' */ ret = 0; break; case parse_path: if (p - c > 0) { SET_U(u, UF_PATH); } ret = 0; break; case parse_query: if (p - c > 0) { SET_U(u, UF_QUERY); } ret = 0; break; case parse_part: if (p - c > 0) { SET_U(u, UF_FRAGMENT); } ret = 0; break; case parse_ipv6: if (t != ']') { ret = 1; } else { /* e.g. http://[::] */ ret = 0; } break; default: /* Error state */ ret = 1; break; } out: if (end != NULL) { *end = p; } return ret; } #undef SET_U static gint rspamd_tld_trie_callback(struct rspamd_multipattern *mp, guint strnum, gint match_start, gint match_pos, const gchar *text, gsize len, void *context) { struct url_matcher *matcher; const gchar *start, *pos, *p; struct rspamd_url *url = context; gint ndots; matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher, strnum); ndots = 1; if (matcher->flags & URL_FLAG_STAR_MATCH) { /* Skip one more tld component */ ndots++; } pos = text + match_start; p = pos - 1; start = rspamd_url_host_unsafe(url); if (*pos != '.' || match_pos != (gint) url->hostlen) { /* Something weird has been found */ if (match_pos == (gint) url->hostlen - 1) { pos = rspamd_url_host_unsafe(url) + match_pos; if (*pos == '.') { /* This is dot at the end of domain */ url->hostlen--; } else { return 0; } } else { return 0; } } /* Now we need to find top level domain */ pos = start; while (p >= start && ndots > 0) { if (*p == '.') { ndots--; pos = p + 1; } else { pos = p; } p--; } if ((ndots == 0 || p == start - 1) && url->tldlen < rspamd_url_host_unsafe(url) + url->hostlen - pos) { url->tldshift = (pos - url->string); url->tldlen = rspamd_url_host_unsafe(url) + url->hostlen - pos; } return 0; } static void rspamd_url_regen_from_inet_addr(struct rspamd_url *uri, const void *addr, int af, rspamd_mempool_t *pool) { gchar *strbuf, *p; const gchar *start_offset; gsize slen = uri->urllen - uri->hostlen; goffset r = 0; if (af == AF_INET) { slen += INET_ADDRSTRLEN; } else { slen += INET6_ADDRSTRLEN; } if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT) { slen += sizeof("65535") - 1; } /* Allocate new string to build it from IP */ strbuf = rspamd_mempool_alloc(pool, slen + 1); r += rspamd_snprintf(strbuf + r, slen - r, "%*s", (gint) (uri->hostshift), uri->string); uri->hostshift = r; uri->tldshift = r; start_offset = strbuf + r; inet_ntop(af, addr, strbuf + r, slen - r + 1); uri->hostlen = strlen(start_offset); r += uri->hostlen; uri->tldlen = uri->hostlen; uri->flags |= RSPAMD_URL_FLAG_NUMERIC; /* Reconstruct URL */ if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT && uri->ext) { p = strbuf + r; start_offset = p + 1; r += rspamd_snprintf(strbuf + r, slen - r, ":%ud", (unsigned int) uri->ext->port); } if (uri->datalen > 0) { p = strbuf + r; start_offset = p + 1; r += rspamd_snprintf(strbuf + r, slen - r, "/%*s", (gint) uri->datalen, rspamd_url_data_unsafe(uri)); uri->datashift = start_offset - strbuf; } else { /* Add trailing slash if needed */ if (uri->hostlen + uri->hostshift < uri->urllen && *(rspamd_url_host_unsafe(uri) + uri->hostlen) == '/') { r += rspamd_snprintf(strbuf + r, slen - r, "/"); } } if (uri->querylen > 0) { p = strbuf + r; start_offset = p + 1; r += rspamd_snprintf(strbuf + r, slen - r, "?%*s", (gint) uri->querylen, rspamd_url_query_unsafe(uri)); uri->queryshift = start_offset - strbuf; } if (uri->fragmentlen > 0) { p = strbuf + r; start_offset = p + 1; r += rspamd_snprintf(strbuf + r, slen - r, "#%*s", (gint) uri->fragmentlen, rspamd_url_fragment_unsafe(uri)); uri->fragmentshift = start_offset - strbuf; } uri->string = strbuf; uri->urllen = r; } static gboolean rspamd_url_is_ip(struct rspamd_url *uri, rspamd_mempool_t *pool) { const gchar *p, *end, *c; gchar *errstr; struct in_addr in4; struct in6_addr in6; gboolean ret = FALSE, check_num = TRUE; guint32 n, dots, t = 0, i = 0, shift, nshift; p = rspamd_url_host_unsafe(uri); end = p + uri->hostlen; if (*p == '[' && *(end - 1) == ']') { p++; end--; } while (*(end - 1) == '.' && end > p) { end--; } if (end - p == 0 || end - p > INET6_ADDRSTRLEN) { return FALSE; } if (rspamd_str_has_8bit(p, end - p)) { return FALSE; } if (rspamd_parse_inet_address_ip4(p, end - p, &in4)) { rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool); ret = TRUE; } else if (rspamd_parse_inet_address_ip6(p, end - p, &in6)) { rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool); ret = TRUE; } else { /* Heuristics for broken urls */ gchar buf[INET6_ADDRSTRLEN + 1]; /* Try also numeric notation */ c = p; n = 0; dots = 0; shift = 0; while (p <= end && check_num) { if (shift < 32 && ((*p == '.' && dots < 3) || (p == end && dots <= 3))) { if (p - c + 1 >= (gint) sizeof(buf)) { msg_debug_pool("invalid numeric url %*.s...: too long", INET6_ADDRSTRLEN, c); return FALSE; } rspamd_strlcpy(buf, c, p - c + 1); c = p + 1; if (p < end && *p == '.') { dots++; } glong long_n = strtol(buf, &errstr, 0); if ((errstr == NULL || *errstr == '\0') && long_n >= 0) { t = long_n; /* Truncate as windows does */ /* * Even if we have zero, we need to shift by 1 octet */ nshift = (t == 0 ? shift + 8 : shift); /* * Here we count number of octets encoded in this element */ for (i = 0; i < 4; i++) { if ((t >> (8 * i)) > 0) { nshift += 8; } else { break; } } /* * Here we need to find the proper shift of the previous * components, so we check possible cases: * 1) 1 octet - just use it applying shift * 2) 2 octets - convert to big endian 16 bit number * 3) 3 octets - convert to big endian 24 bit number * 4) 4 octets - convert to big endian 32 bit number */ switch (i) { case 4: t = GUINT32_TO_BE(t); break; case 3: t = (GUINT32_TO_BE(t & 0xFFFFFFU)) >> 8; break; case 2: t = GUINT16_TO_BE(t & 0xFFFFU); break; default: t = t & 0xFF; break; } if (p != end) { n |= t << shift; shift = nshift; } } else { check_num = FALSE; } } p++; } /* The last component should be last according to url normalization: * 192.168.1 -> 192.168.0.1 * 192 -> 0.0.0.192 * 192.168 -> 192.0.0.168 */ shift = 8 * (4 - i); if (shift < 32) { n |= t << shift; } if (check_num) { if (dots <= 4) { memcpy(&in4, &n, sizeof(in4)); rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool); uri->flags |= RSPAMD_URL_FLAG_OBSCURED; ret = TRUE; } else if (end - c > (gint) sizeof(buf) - 1) { rspamd_strlcpy(buf, c, end - c + 1); if (inet_pton(AF_INET6, buf, &in6) == 1) { rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool); uri->flags |= RSPAMD_URL_FLAG_OBSCURED; ret = TRUE; } } } } return ret; } static void rspamd_url_shift(struct rspamd_url *uri, gsize nlen, enum http_parser_url_fields field) { guint old_shift, shift = 0; gint remain; /* Shift remaining data */ switch (field) { case UF_SCHEMA: if (nlen >= uri->protocollen) { return; } else { shift = uri->protocollen - nlen; } old_shift = uri->protocollen; uri->protocollen -= shift; remain = uri->urllen - uri->protocollen; g_assert(remain >= 0); memmove(uri->string + uri->protocollen, uri->string + old_shift, remain); uri->urllen -= shift; uri->flags |= RSPAMD_URL_FLAG_SCHEMAENCODED; break; case UF_HOST: if (nlen >= uri->hostlen) { return; } else { shift = uri->hostlen - nlen; } old_shift = uri->hostlen; uri->hostlen -= shift; remain = (uri->urllen - (uri->hostshift)) - old_shift; g_assert(remain >= 0); memmove(rspamd_url_host_unsafe(uri) + uri->hostlen, rspamd_url_host_unsafe(uri) + old_shift, remain); uri->urllen -= shift; uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED; break; case UF_PATH: if (nlen >= uri->datalen) { return; } else { shift = uri->datalen - nlen; } old_shift = uri->datalen; uri->datalen -= shift; remain = (uri->urllen - (uri->datashift)) - old_shift; g_assert(remain >= 0); memmove(rspamd_url_data_unsafe(uri) + uri->datalen, rspamd_url_data_unsafe(uri) + old_shift, remain); uri->urllen -= shift; uri->flags |= RSPAMD_URL_FLAG_PATHENCODED; break; case UF_QUERY: if (nlen >= uri->querylen) { return; } else { shift = uri->querylen - nlen; } old_shift = uri->querylen; uri->querylen -= shift; remain = (uri->urllen - (uri->queryshift)) - old_shift; g_assert(remain >= 0); memmove(rspamd_url_query_unsafe(uri) + uri->querylen, rspamd_url_query_unsafe(uri) + old_shift, remain); uri->urllen -= shift; uri->flags |= RSPAMD_URL_FLAG_QUERYENCODED; break; case UF_FRAGMENT: if (nlen >= uri->fragmentlen) { return; } else { shift = uri->fragmentlen - nlen; } uri->fragmentlen -= shift; uri->urllen -= shift; break; default: break; } /* Now adjust lengths and offsets */ switch (field) { case UF_SCHEMA: if (uri->userlen > 0) { uri->usershift -= shift; } if (uri->hostlen > 0) { uri->hostshift -= shift; } /* Go forward */ /* FALLTHRU */ case UF_HOST: if (uri->datalen > 0) { uri->datashift -= shift; } /* Go forward */ /* FALLTHRU */ case UF_PATH: if (uri->querylen > 0) { uri->queryshift -= shift; } /* Go forward */ /* FALLTHRU */ case UF_QUERY: if (uri->fragmentlen > 0) { uri->fragmentshift -= shift; } /* Go forward */ /* FALLTHRU */ case UF_FRAGMENT: default: break; } } static void rspamd_telephone_normalise_inplace(struct rspamd_url *uri) { gchar *t, *h, *end; gint i = 0, w, orig_len; UChar32 uc; t = rspamd_url_host_unsafe(uri); h = t; end = t + uri->hostlen; orig_len = uri->hostlen; if (*h == '+') { h++; t++; } while (h < end) { i = 0; U8_NEXT(h, i, end - h, uc); if (u_isdigit(uc)) { w = 0; U8_APPEND_UNSAFE(t, w, uc); t += w; } h += i; } uri->hostlen = t - rspamd_url_host_unsafe(uri); uri->urllen -= (orig_len - uri->hostlen); } static inline bool is_idna_label_dot(UChar ch) { switch (ch) { case 0x3002: case 0xFF0E: case 0xFF61: return true; default: return false; } } /* * All credits for this investigation should go to * Dr. Hajime Shimada and Mr. Shirakura as they have revealed this case in their * research. */ /* * This function replaces unsafe IDNA dots in host labels. Unfortunately, * IDNA extends dot definition from '.' to multiple other characters that * should be treated equally. * This function replaces such dots and returns `true` if these dots are found. * In this case, it should be treated as obfuscation attempt. */ static bool rspamd_url_remove_dots(struct rspamd_url *uri) { const gchar *hstart = rspamd_url_host_unsafe(uri); gchar *t; UChar32 uc; gint i = 0, hlen; bool ret = false; if (uri->hostlen == 0) { return false; } hlen = uri->hostlen; t = rspamd_url_host_unsafe(uri); while (i < hlen) { gint prev_i = i; U8_NEXT(hstart, i, hlen, uc); if (is_idna_label_dot(uc)) { *t++ = '.'; ret = true; } else { if (ret) { /* We have to shift the remaining stuff */ while (prev_i < i) { *t++ = *(hstart + prev_i); prev_i++; } } else { t += (i - prev_i); } } } if (ret) { rspamd_url_shift(uri, t - hstart, UF_HOST); } return ret; } enum uri_errno rspamd_url_parse(struct rspamd_url *uri, gchar *uristring, gsize len, rspamd_mempool_t *pool, enum rspamd_url_parse_flags parse_flags) { struct http_parser_url u; gchar *p; const gchar *end; guint i, complen, ret, flags = 0; gsize unquoted_len = 0; memset(uri, 0, sizeof(*uri)); memset(&u, 0, sizeof(u)); uri->count = 1; /* Undefine order */ uri->order = -1; uri->part_order = -1; if (*uristring == '\0') { return URI_ERRNO_EMPTY; } if (len >= G_MAXUINT16 / 2) { flags |= RSPAMD_URL_FLAG_TRUNCATED; len = G_MAXUINT16 / 2; } p = uristring; uri->protocol = PROTOCOL_UNKNOWN; if (len > sizeof("mailto:") - 1) { /* For mailto: urls we also need to add slashes to make it a valid URL */ if (g_ascii_strncasecmp(p, "mailto:", sizeof("mailto:") - 1) == 0) { ret = rspamd_mailto_parse(&u, uristring, len, &end, parse_flags, &flags); } else if (g_ascii_strncasecmp(p, "tel:", sizeof("tel:") - 1) == 0 || g_ascii_strncasecmp(p, "callto:", sizeof("callto:") - 1) == 0) { ret = rspamd_telephone_parse(&u, uristring, len, &end, parse_flags, &flags); uri->protocol = PROTOCOL_TELEPHONE; } else { ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags); } } else { ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags); } if (ret != 0) { return URI_ERRNO_BAD_FORMAT; } if (end > uristring && (guint) (end - uristring) != len) { len = end - uristring; } uri->raw = p; uri->rawlen = len; if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) { len += 2; uri->string = rspamd_mempool_alloc(pool, len + 1); memcpy(uri->string, p, u.field_data[UF_SCHEMA].len); memcpy(uri->string + u.field_data[UF_SCHEMA].len, "://", 3); rspamd_strlcpy(uri->string + u.field_data[UF_SCHEMA].len + 3, p + u.field_data[UF_SCHEMA].len + 1, len - 2 - u.field_data[UF_SCHEMA].len); /* Compensate slashes added */ for (i = UF_SCHEMA + 1; i < UF_MAX; i++) { if (u.field_set & (1 << i)) { u.field_data[i].off += 2; } } } else { uri->string = rspamd_mempool_alloc(pool, len + 1); rspamd_strlcpy(uri->string, p, len + 1); } uri->urllen = len; uri->flags = flags; for (i = 0; i < UF_MAX; i++) { if (u.field_set & (1 << i)) { guint shift = u.field_data[i].off; complen = u.field_data[i].len; if (complen >= G_MAXUINT16) { /* Too large component length */ return URI_ERRNO_BAD_FORMAT; } switch (i) { case UF_SCHEMA: uri->protocollen = u.field_data[i].len; break; case UF_HOST: uri->hostshift = shift; uri->hostlen = complen; break; case UF_PATH: uri->datashift = shift; uri->datalen = complen; break; case UF_QUERY: uri->queryshift = shift; uri->querylen = complen; break; case UF_FRAGMENT: uri->fragmentshift = shift; uri->fragmentlen = complen; break; case UF_USERINFO: uri->usershift = shift; uri->userlen = complen; break; default: break; } } } /* Port is 'special' in case of url_parser as it is not a part of UF_* macro logic */ if (u.port != 0) { if (!uri->ext) { uri->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext); } uri->flags |= RSPAMD_URL_FLAG_HAS_PORT; uri->ext->port = u.port; } if (!uri->hostlen) { return URI_ERRNO_HOST_MISSING; } /* Now decode url symbols */ unquoted_len = rspamd_url_decode(uri->string, uri->string, uri->protocollen); rspamd_url_shift(uri, unquoted_len, UF_SCHEMA); unquoted_len = rspamd_url_decode(rspamd_url_host_unsafe(uri), rspamd_url_host_unsafe(uri), uri->hostlen); rspamd_url_normalise_propagate_flags(pool, rspamd_url_host_unsafe(uri), &unquoted_len, uri->flags); rspamd_url_shift(uri, unquoted_len, UF_HOST); if (rspamd_url_remove_dots(uri)) { uri->flags |= RSPAMD_URL_FLAG_OBSCURED; } if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | PROTOCOL_FILE)) { /* Ensure that hostname starts with something sane (exclude numeric urls) */ const gchar *host = rspamd_url_host_unsafe(uri); if (!(is_domain_start(host[0]) || host[0] == ':')) { return URI_ERRNO_BAD_FORMAT; } } /* Apply nameprep algorithm */ static UStringPrepProfile *nameprep = NULL; UErrorCode uc_err = U_ZERO_ERROR; if (nameprep == NULL) { /* Open and cache profile */ nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, &uc_err); g_assert(U_SUCCESS(uc_err)); } UChar *utf16_hostname, *norm_utf16; gint32 utf16_len, norm_utf16_len, norm_utf8_len; UParseError parse_error; utf16_hostname = rspamd_mempool_alloc(pool, uri->hostlen * sizeof(UChar)); struct UConverter *utf8_conv = rspamd_get_utf8_converter(); utf16_len = ucnv_toUChars(utf8_conv, utf16_hostname, uri->hostlen, rspamd_url_host_unsafe(uri), uri->hostlen, &uc_err); if (!U_SUCCESS(uc_err)) { return URI_ERRNO_BAD_FORMAT; } norm_utf16 = rspamd_mempool_alloc(pool, utf16_len * sizeof(UChar)); norm_utf16_len = usprep_prepare(nameprep, utf16_hostname, utf16_len, norm_utf16, utf16_len, USPREP_DEFAULT, &parse_error, &uc_err); if (!U_SUCCESS(uc_err)) { return URI_ERRNO_BAD_FORMAT; } /* Convert back to utf8, sigh... */ norm_utf8_len = ucnv_fromUChars(utf8_conv, rspamd_url_host_unsafe(uri), uri->hostlen, norm_utf16, norm_utf16_len, &uc_err); if (!U_SUCCESS(uc_err)) { return URI_ERRNO_BAD_FORMAT; } /* Final shift of lengths */ rspamd_url_shift(uri, norm_utf8_len, UF_HOST); /* Process data part */ if (uri->datalen) { unquoted_len = rspamd_url_decode(rspamd_url_data_unsafe(uri), rspamd_url_data_unsafe(uri), uri->datalen); rspamd_url_normalise_propagate_flags(pool, rspamd_url_data_unsafe(uri), &unquoted_len, uri->flags); rspamd_url_shift(uri, unquoted_len, UF_PATH); /* We now normalize path */ rspamd_normalize_path_inplace(rspamd_url_data_unsafe(uri), uri->datalen, &unquoted_len); rspamd_url_shift(uri, unquoted_len, UF_PATH); } if (uri->querylen) { unquoted_len = rspamd_url_decode(rspamd_url_query_unsafe(uri), rspamd_url_query_unsafe(uri), uri->querylen); rspamd_url_normalise_propagate_flags(pool, rspamd_url_query_unsafe(uri), &unquoted_len, uri->flags); rspamd_url_shift(uri, unquoted_len, UF_QUERY); } if (uri->fragmentlen) { unquoted_len = rspamd_url_decode(rspamd_url_fragment_unsafe(uri), rspamd_url_fragment_unsafe(uri), uri->fragmentlen); rspamd_url_normalise_propagate_flags(pool, rspamd_url_fragment_unsafe(uri), &unquoted_len, uri->flags); rspamd_url_shift(uri, unquoted_len, UF_FRAGMENT); } rspamd_str_lc(uri->string, uri->protocollen); unquoted_len = rspamd_str_lc_utf8(rspamd_url_host_unsafe(uri), uri->hostlen); rspamd_url_shift(uri, unquoted_len, UF_HOST); if (uri->protocol == PROTOCOL_UNKNOWN) { for (i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) { if (uri->protocollen == rspamd_url_protocols[i].len) { if (memcmp(uri->string, rspamd_url_protocols[i].name, uri->protocollen) == 0) { uri->protocol = rspamd_url_protocols[i].proto; break; } } } } if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | PROTOCOL_FILE)) { /* Find TLD part */ if (url_scanner->search_trie_full) { rspamd_multipattern_lookup(url_scanner->search_trie_full, rspamd_url_host_unsafe(uri), uri->hostlen, rspamd_tld_trie_callback, uri, NULL); } if (uri->tldlen == 0) { /* * If we have not detected eSLD, but there are no dots in the hostname, * then we should treat the whole hostname as eSLD - a rule of thumb */ if (uri->hostlen > 0 && memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen) == NULL) { uri->tldlen = uri->hostlen; uri->tldshift = uri->hostshift; } else { if (uri->protocol != PROTOCOL_MAILTO) { if (url_scanner->has_tld_file && !(parse_flags & RSPAMD_URL_PARSE_HREF)) { /* Ignore URL's without TLD if it is not a numeric URL */ if (!rspamd_url_is_ip(uri, pool)) { return URI_ERRNO_TLD_MISSING; } } else { if (!rspamd_url_is_ip(uri, pool)) { /* Assume tld equal to host */ uri->tldshift = uri->hostshift; uri->tldlen = uri->hostlen; } else if (uri->flags & RSPAMD_URL_FLAG_SCHEMALESS) { /* Ignore urls with both no schema and no tld */ return URI_ERRNO_TLD_MISSING; } uri->flags |= RSPAMD_URL_FLAG_NO_TLD; } } else { /* Ignore IP like domains for mailto, as it is really never supported */ return URI_ERRNO_TLD_MISSING; } } } /* Replace stupid '\' with '/' after schema */ if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP) && uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) { gchar *pos = &uri->string[uri->protocollen], *host_start = rspamd_url_host_unsafe(uri); while (pos < host_start) { if (*pos == '\\') { *pos = '/'; uri->flags |= RSPAMD_URL_FLAG_OBSCURED; } pos++; } } } else if (uri->protocol & PROTOCOL_TELEPHONE) { /* We need to normalise phone number: remove all spaces and braces */ rspamd_telephone_normalise_inplace(uri); if (rspamd_url_host_unsafe(uri)[0] == '+') { uri->tldshift = uri->hostshift + 1; uri->tldlen = uri->hostlen - 1; } else { uri->tldshift = uri->hostshift; uri->tldlen = uri->hostlen; } } if (uri->protocol == PROTOCOL_UNKNOWN) { if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) { return URI_ERRNO_INVALID_PROTOCOL; } else { /* Hack, hack, hack */ uri->protocol = PROTOCOL_UNKNOWN; } } return URI_ERRNO_OK; } struct tld_trie_cbdata { const gchar *begin; gsize len; rspamd_ftok_t *out; }; static gint rspamd_tld_trie_find_callback(struct rspamd_multipattern *mp, guint strnum, gint match_start, gint match_pos, const gchar *text, gsize len, void *context) { struct url_matcher *matcher; const gchar *start, *pos, *p; struct tld_trie_cbdata *cbdata = context; gint ndots = 1; matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher, strnum); if (matcher->flags & URL_FLAG_STAR_MATCH) { /* Skip one more tld component */ ndots = 2; } pos = text + match_start; p = pos - 1; start = text; if (*pos != '.' || match_pos != (gint) cbdata->len) { /* Something weird has been found */ if (match_pos != (gint) cbdata->len - 1) { /* Search more */ return 0; } } /* Now we need to find top level domain */ pos = start; while (p >= start && ndots > 0) { if (*p == '.') { ndots--; pos = p + 1; } else { pos = p; } p--; } if (ndots == 0 || p == start - 1) { if (cbdata->begin + cbdata->len - pos > cbdata->out->len) { cbdata->out->begin = pos; cbdata->out->len = cbdata->begin + cbdata->len - pos; } } return 0; } gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out) { struct tld_trie_cbdata cbdata; g_assert(in != NULL); g_assert(out != NULL); g_assert(url_scanner != NULL); cbdata.begin = in; cbdata.len = inlen; cbdata.out = out; out->len = 0; if (url_scanner->search_trie_full) { rspamd_multipattern_lookup(url_scanner->search_trie_full, in, inlen, rspamd_tld_trie_find_callback, &cbdata, NULL); } if (out->len > 0) { return TRUE; } return FALSE; } static const gchar url_braces[] = { '(', ')', '{', '}', '[', ']', '<', '>', '|', '|', '\'', '\''}; static gboolean url_file_start(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { match->m_begin = pos; if (pos > cb->begin) { match->st = *(pos - 1); } else { match->st = '\0'; } return TRUE; } static gboolean url_file_end(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { const gchar *p; gchar stop; guint i; p = pos + strlen(match->pattern); stop = *p; if (*p == '/') { p++; } for (i = 0; i < G_N_ELEMENTS(url_braces) / 2; i += 2) { if (*p == url_braces[i]) { stop = url_braces[i + 1]; break; } } while (p < cb->end && *p != stop && is_urlsafe(*p)) { p++; } if (p == cb->begin) { return FALSE; } match->m_len = p - match->m_begin; return TRUE; } static gboolean url_tld_start(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { const gchar *p = pos; guint processed = 0; static const guint max_shift = 253 + sizeof("https://"); /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */ while (p >= cb->begin) { if (!is_domain(*p) || g_ascii_isspace(*p) || is_url_start(*p) || p == match->prev_newline_pos) { if (!is_url_start(*p) && !g_ascii_isspace(*p) && p != match->prev_newline_pos) { return FALSE; } if (p != match->prev_newline_pos) { match->st = *p; p++; } else { match->st = '\n'; } if (!g_ascii_isalnum(*p)) { /* Urls cannot start with strange symbols */ return FALSE; } match->m_begin = p; return TRUE; } else if (p == cb->begin && p != pos) { match->st = '\0'; match->m_begin = p; return TRUE; } else if (*p == '.') { if (p == cb->begin) { /* Urls cannot start with a dot */ return FALSE; } if (!g_ascii_isalnum(p[1])) { /* Wrong we have an invalid character after dot */ return FALSE; } } else if (*p == '/') { /* Urls cannot contain '/' in their body */ return FALSE; } p--; processed++; if (processed > max_shift) { /* Too long */ return FALSE; } } return FALSE; } static gboolean url_tld_end(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { const gchar *p; gboolean ret = FALSE; p = pos + match->m_len; if (p == cb->end) { match->m_len = p - match->m_begin; return TRUE; } else if (*p == '/' || *p == ':' || is_url_end(*p) || is_lwsp(*p) || (match->st != '<' && p == match->newline_pos)) { /* Parse arguments, ports by normal way by url default function */ p = match->m_begin; /* Check common prefix */ if (g_ascii_strncasecmp(p, "http://", sizeof("http://") - 1) == 0) { ret = url_web_end(cb, match->m_begin + sizeof("http://") - 1, match); } else { ret = url_web_end(cb, match->m_begin, match); } } else if (*p == '.') { p++; if (p < cb->end) { if (g_ascii_isspace(*p) || *p == '/' || *p == '?' || *p == ':') { ret = url_web_end(cb, match->m_begin, match); } } } if (ret) { /* Check sanity of match found */ if (match->m_begin + match->m_len <= pos) { return FALSE; } } return ret; } static gboolean url_web_start(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { /* Check what we have found */ if (pos > cb->begin) { if (g_ascii_strncasecmp(pos, "www", 3) == 0) { if (!(is_url_start(*(pos - 1)) || g_ascii_isspace(*(pos - 1)) || pos - 1 == match->prev_newline_pos || (*(pos - 1) & 0x80))) { /* Chinese trick */ return FALSE; } } else { guchar prev = *(pos - 1); if (g_ascii_isalnum(prev)) { /* Part of another url */ return FALSE; } } } if (*pos == '.') { /* Urls cannot start with . */ return FALSE; } if (pos > cb->begin) { match->st = *(pos - 1); } else { match->st = '\0'; } match->m_begin = pos; return TRUE; } static gboolean url_web_end(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { const gchar *last = NULL; gint len = cb->end - pos; guint flags = 0; if (match->newline_pos && match->st != '<') { /* We should also limit our match end to the newline */ len = MIN(len, match->newline_pos - pos); } if (rspamd_web_parse(NULL, pos, len, &last, RSPAMD_URL_PARSE_CHECK, &flags) != 0) { return FALSE; } if (last < cb->end && (*last == '>' && last != match->newline_pos)) { /* We need to ensure that url also starts with '>' */ if (match->st != '<') { if (last + 1 < cb->end) { if (g_ascii_isspace(last[1])) { return FALSE; } } else { return FALSE; } } } match->m_len = (last - pos); cb->fin = last + 1; return TRUE; } static gboolean url_email_start(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { if (!match->prefix || match->prefix[0] == '\0') { /* We have mailto:// at the beginning */ match->m_begin = pos; if (pos >= cb->begin + 1) { match->st = *(pos - 1); } else { match->st = '\0'; } } else { /* Just '@' */ /* Check if this match is a part of the previous mailto: email */ if (cb->last_at != NULL && cb->last_at == pos) { cb->last_at = NULL; return FALSE; } else if (pos == cb->begin) { /* Just @ at the start of input */ return FALSE; } match->st = '\0'; } return TRUE; } static gboolean url_email_end(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { const gchar *last = NULL; struct http_parser_url u; gint len = cb->end - pos; guint flags = 0; if (match->newline_pos && match->st != '<') { /* We should also limit our match end to the newline */ len = MIN(len, match->newline_pos - pos); } if (!match->prefix || match->prefix[0] == '\0') { /* We have mailto:// at the beginning */ if (rspamd_mailto_parse(&u, pos, len, &last, RSPAMD_URL_PARSE_CHECK, &flags) != 0) { return FALSE; } if (!(u.field_set & (1 << UF_USERINFO))) { return FALSE; } cb->last_at = match->m_begin + u.field_data[UF_USERINFO].off + u.field_data[UF_USERINFO].len; g_assert(*cb->last_at == '@'); match->m_len = (last - pos); return TRUE; } else { const gchar *c, *p; /* * Here we have just '@', so we need to find both start and end of the * pattern */ g_assert(*pos == '@'); if (pos >= cb->end - 2 || pos < cb->begin + 1) { /* Boundary violation */ return FALSE; } /* Check the next character after `@` */ if (!g_ascii_isalnum(pos[1]) || !g_ascii_isalnum(*(pos - 1))) { return FALSE; } c = pos - 1; while (c > cb->begin) { if (!is_mailsafe(*c)) { break; } if (c == match->prev_newline_pos) { break; } c--; } /* Rewind to the first alphanumeric character */ while (c < pos && !g_ascii_isalnum(*c)) { c++; } /* Find the end of email */ p = pos + 1; while (p < cb->end && is_domain(*p)) { if (p == match->newline_pos) { break; } p++; } /* Rewind it again to avoid bad emails to be detected */ while (p > pos && p < cb->end && !g_ascii_isalnum(*p)) { p--; } if (p < cb->end && g_ascii_isalnum(*p) && (match->newline_pos == NULL || p < match->newline_pos)) { p++; } if (p > c) { match->m_begin = c; match->m_len = p - c; return TRUE; } } return FALSE; } static gboolean url_tel_start(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { match->m_begin = pos; if (pos >= cb->begin + 1) { match->st = *(pos - 1); } else { match->st = '\0'; } return TRUE; } static gboolean url_tel_end(struct url_callback_data *cb, const gchar *pos, url_match_t *match) { const gchar *last = NULL; struct http_parser_url u; gint len = cb->end - pos; guint flags = 0; if (match->newline_pos && match->st != '<') { /* We should also limit our match end to the newline */ len = MIN(len, match->newline_pos - pos); } if (rspamd_telephone_parse(&u, pos, len, &last, RSPAMD_URL_PARSE_CHECK, &flags) != 0) { return FALSE; } if (!(u.field_set & (1 << UF_HOST))) { return FALSE; } match->m_len = (last - pos); return TRUE; } static gboolean rspamd_url_trie_is_match(struct url_matcher *matcher, const gchar *pos, const gchar *end, const gchar *newline_pos) { if (matcher->flags & URL_FLAG_TLD_MATCH) { /* Immediately check pos for valid chars */ if (pos < end) { if (pos != newline_pos && !g_ascii_isspace(*pos) && *pos != '/' && *pos != '?' && *pos != ':' && !is_url_end(*pos)) { if (*pos == '.') { /* We allow . at the end of the domain however */ pos++; if (pos < end) { if (!g_ascii_isspace(*pos) && *pos != '/' && *pos != '?' && *pos != ':' && !is_url_end(*pos)) { return FALSE; } } } else { return FALSE; } } } } return TRUE; } static gint rspamd_url_trie_callback(struct rspamd_multipattern *mp, guint strnum, gint match_start, gint match_pos, const gchar *text, gsize len, void *context) { struct url_matcher *matcher; url_match_t m; const gchar *pos, *newline_pos = NULL; struct url_callback_data *cb = context; pos = text + match_pos; if (cb->fin > pos) { /* Already seen */ return 0; } matcher = &g_array_index(cb->matchers, struct url_matcher, strnum); if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { /* Do not try to match non-html like urls in html texts */ return 0; } memset(&m, 0, sizeof(m)); m.m_begin = text + match_start; m.m_len = match_pos - match_start; if (cb->newlines && cb->newlines->len > 0) { newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); while (pos > newline_pos && cb->newline_idx < cb->newlines->len) { cb->newline_idx++; newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); } if (pos > newline_pos) { newline_pos = NULL; } if (cb->newline_idx > 0) { m.prev_newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx - 1); } } if (!rspamd_url_trie_is_match(matcher, pos, cb->end, newline_pos)) { return 0; } m.pattern = matcher->pattern; m.prefix = matcher->prefix; m.add_prefix = FALSE; m.newline_pos = newline_pos; pos = cb->begin + match_start; if (matcher->start(cb, pos, &m) && matcher->end(cb, pos, &m)) { if (m.add_prefix || matcher->prefix[0] != '\0') { cb->len = m.m_len + strlen(matcher->prefix); cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1); cb->len = rspamd_snprintf(cb->url_str, cb->len + 1, "%s%*s", m.prefix, (gint) m.m_len, m.m_begin); cb->prefix_added = TRUE; } else { cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1); rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1); } cb->start = m.m_begin; if (pos > cb->fin) { cb->fin = pos; } return 1; } else { cb->url_str = NULL; } /* Continue search */ return 0; } gboolean rspamd_url_find(rspamd_mempool_t *pool, const gchar *begin, gsize len, gchar **url_str, enum rspamd_url_find_type how, goffset *url_pos, gboolean *prefix_added) { struct url_callback_data cb; gint ret; memset(&cb, 0, sizeof(cb)); cb.begin = begin; cb.end = begin + len; cb.how = how; cb.pool = pool; if (how == RSPAMD_URL_FIND_ALL) { if (url_scanner->search_trie_full) { cb.matchers = url_scanner->matchers_full; ret = rspamd_multipattern_lookup(url_scanner->search_trie_full, begin, len, rspamd_url_trie_callback, &cb, NULL); } else { cb.matchers = url_scanner->matchers_strict; ret = rspamd_multipattern_lookup(url_scanner->search_trie_strict, begin, len, rspamd_url_trie_callback, &cb, NULL); } } else { cb.matchers = url_scanner->matchers_strict; ret = rspamd_multipattern_lookup(url_scanner->search_trie_strict, begin, len, rspamd_url_trie_callback, &cb, NULL); } if (ret) { if (url_str) { *url_str = cb.url_str; } if (url_pos) { *url_pos = cb.start - begin; } if (prefix_added) { *prefix_added = cb.prefix_added; } return TRUE; } return FALSE; } static gint rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp, guint strnum, gint match_start, gint match_pos, const gchar *text, gsize len, void *context, gboolean multiple) { struct rspamd_url *url; struct url_matcher *matcher; url_match_t m; const gchar *pos, *newline_pos = NULL; struct url_callback_data *cb = context; gint rc; rspamd_mempool_t *pool; pos = text + match_pos; if (cb->fin > pos) { /* Already seen */ return 0; } matcher = &g_array_index(cb->matchers, struct url_matcher, strnum); pool = cb->pool; if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { /* Do not try to match non-html like urls in html texts, continue matching */ return 0; } memset(&m, 0, sizeof(m)); /* Find the next newline after our pos */ if (cb->newlines && cb->newlines->len > 0) { newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); while (pos > newline_pos && cb->newline_idx < cb->newlines->len - 1) { cb->newline_idx++; newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); } if (pos > newline_pos) { newline_pos = NULL; } if (cb->newline_idx > 0) { m.prev_newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx - 1); } } if (!rspamd_url_trie_is_match(matcher, pos, text + len, newline_pos)) { /* Mismatch, continue */ return 0; } pos = cb->begin + match_start; m.pattern = matcher->pattern; m.prefix = matcher->prefix; m.add_prefix = FALSE; m.m_begin = text + match_start; m.m_len = match_pos - match_start; m.newline_pos = newline_pos; if (matcher->start(cb, pos, &m) && matcher->end(cb, pos, &m)) { if (m.add_prefix || matcher->prefix[0] != '\0') { cb->len = m.m_len + strlen(matcher->prefix); cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1); cb->len = rspamd_snprintf(cb->url_str, cb->len + 1, "%s%*s", m.prefix, (gint) m.m_len, m.m_begin); cb->prefix_added = TRUE; } else { cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1); cb->len = rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1); } cb->start = m.m_begin; if (pos > cb->fin) { cb->fin = pos; } url = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_url)); g_strstrip(cb->url_str); rc = rspamd_url_parse(url, cb->url_str, strlen(cb->url_str), pool, RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && url->hostlen > 0) { if (cb->prefix_added) { url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; cb->prefix_added = FALSE; } if (cb->func) { if (!cb->func(url, cb->start - text, (m.m_begin + m.m_len) - text, cb->funcd)) { /* We need to stop here in any case! */ return -1; } } } else if (rc != URI_ERRNO_OK) { msg_debug_pool_check("extract of url '%s' failed: %s", cb->url_str, rspamd_url_strerror(rc)); } } else { cb->url_str = NULL; /* Continue search if no pattern has been found */ return 0; } /* Continue search if required (return 0 means continue) */ return !multiple; } static gint rspamd_url_trie_generic_callback_multiple(struct rspamd_multipattern *mp, guint strnum, gint match_start, gint match_pos, const gchar *text, gsize len, void *context) { return rspamd_url_trie_generic_callback_common(mp, strnum, match_start, match_pos, text, len, context, TRUE); } static gint rspamd_url_trie_generic_callback_single(struct rspamd_multipattern *mp, guint strnum, gint match_start, gint match_pos, const gchar *text, gsize len, void *context) { return rspamd_url_trie_generic_callback_common(mp, strnum, match_start, match_pos, text, len, context, FALSE); } struct rspamd_url_mimepart_cbdata { struct rspamd_task *task; struct rspamd_mime_text_part *part; gsize url_len; uint16_t *cur_url_order; /* Global ordering */ uint16_t cur_part_order; /* Per part ordering */ }; static gboolean rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset, gsize end_offset, gpointer ud) { struct rspamd_url_mimepart_cbdata *cbd = (struct rspamd_url_mimepart_cbdata *) ud; struct rspamd_task *task; task = cbd->task; if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen == 0) { return FALSE; } } /* Also check max urls */ if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { if (kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) { msg_err_task("part has too many URLs, we cannot process more: " "%d urls extracted ", (guint) kh_size(MESSAGE_FIELD(task, urls))); return FALSE; } } url->flags |= RSPAMD_URL_FLAG_QUERY; if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) { if (cbd->part && cbd->part->mime_part->urls) { g_ptr_array_add(cbd->part->mime_part->urls, url); } url->part_order = cbd->cur_part_order++; if (cbd->cur_url_order) { url->order = *(cbd->cur_url_order)++; } } return TRUE; } static gboolean rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset, gsize end_offset, gpointer ud) { struct rspamd_url_mimepart_cbdata *cbd = (struct rspamd_url_mimepart_cbdata *) ud; struct rspamd_process_exception *ex; struct rspamd_task *task; task = cbd->task; ex = rspamd_mempool_alloc0(task->task_pool, sizeof(struct rspamd_process_exception)); ex->pos = start_offset; ex->len = end_offset - start_offset; ex->type = RSPAMD_EXCEPTION_URL; ex->ptr = url; cbd->url_len += ex->len; if (cbd->part->utf_stripped_content && cbd->url_len > cbd->part->utf_stripped_content->len * 10) { /* Absurd case, stop here now */ msg_err_task("part has too many URLs, we cannot process more: %z url len; " "%d stripped content length", cbd->url_len, cbd->part->utf_stripped_content->len); return FALSE; } if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen == 0) { return FALSE; } } /* Also check max urls */ if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { if (kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) { msg_err_task("part has too many URLs, we cannot process more: " "%d urls extracted ", (guint) kh_size(MESSAGE_FIELD(task, urls))); return FALSE; } } url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) && cbd->part->mime_part->urls) { url->part_order = cbd->cur_part_order++; if (cbd->cur_url_order) { url->order = *(cbd->cur_url_order)++; } g_ptr_array_add(cbd->part->mime_part->urls, url); } cbd->part->exceptions = g_list_prepend( cbd->part->exceptions, ex); /* We also search the query for additional url inside */ if (url->querylen > 0) { rspamd_url_find_multiple(task->task_pool, rspamd_url_query_unsafe(url), url->querylen, RSPAMD_URL_FIND_ALL, NULL, rspamd_url_query_callback, cbd); } return TRUE; } void rspamd_url_text_extract(rspamd_mempool_t *pool, struct rspamd_task *task, struct rspamd_mime_text_part *part, uint16_t *cur_url_order, enum rspamd_url_find_type how) { struct rspamd_url_mimepart_cbdata mcbd; if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) { msg_warn_task("got empty text part"); return; } mcbd.task = task; mcbd.part = part; mcbd.url_len = 0; mcbd.cur_url_order = cur_url_order; mcbd.cur_part_order = 0; rspamd_url_find_multiple(task->task_pool, part->utf_stripped_content->data, part->utf_stripped_content->len, how, part->newlines, rspamd_url_text_part_callback, &mcbd); } void rspamd_url_find_multiple(rspamd_mempool_t *pool, const gchar *in, gsize inlen, enum rspamd_url_find_type how, GPtrArray *nlines, url_insert_function func, gpointer ud) { struct url_callback_data cb; g_assert(in != NULL); if (inlen == 0) { inlen = strlen(in); } memset(&cb, 0, sizeof(cb)); cb.begin = in; cb.end = in + inlen; cb.how = how; cb.pool = pool; cb.funcd = ud; cb.func = func; cb.newlines = nlines; if (how == RSPAMD_URL_FIND_ALL) { if (url_scanner->search_trie_full) { cb.matchers = url_scanner->matchers_full; rspamd_multipattern_lookup(url_scanner->search_trie_full, in, inlen, rspamd_url_trie_generic_callback_multiple, &cb, NULL); } else { cb.matchers = url_scanner->matchers_strict; rspamd_multipattern_lookup(url_scanner->search_trie_strict, in, inlen, rspamd_url_trie_generic_callback_multiple, &cb, NULL); } } else { cb.matchers = url_scanner->matchers_strict; rspamd_multipattern_lookup(url_scanner->search_trie_strict, in, inlen, rspamd_url_trie_generic_callback_multiple, &cb, NULL); } } void rspamd_url_find_single(rspamd_mempool_t *pool, const gchar *in, gsize inlen, enum rspamd_url_find_type how, url_insert_function func, gpointer ud) { struct url_callback_data cb; g_assert(in != NULL); if (inlen == 0) { inlen = strlen(in); } /* * We might have a situation when we need to parse URLs on config file * parsing, but there is no valid url_scanner loaded. Hence, we just load * some defaults and it should be fine... */ if (url_scanner == NULL) { rspamd_url_init(NULL); } memset(&cb, 0, sizeof(cb)); cb.begin = in; cb.end = in + inlen; cb.how = how; cb.pool = pool; cb.funcd = ud; cb.func = func; if (how == RSPAMD_URL_FIND_ALL) { if (url_scanner->search_trie_full) { cb.matchers = url_scanner->matchers_full; rspamd_multipattern_lookup(url_scanner->search_trie_full, in, inlen, rspamd_url_trie_generic_callback_single, &cb, NULL); } else { cb.matchers = url_scanner->matchers_strict; rspamd_multipattern_lookup(url_scanner->search_trie_strict, in, inlen, rspamd_url_trie_generic_callback_single, &cb, NULL); } } else { cb.matchers = url_scanner->matchers_strict; rspamd_multipattern_lookup(url_scanner->search_trie_strict, in, inlen, rspamd_url_trie_generic_callback_single, &cb, NULL); } } gboolean rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset, gsize end_offset, gpointer ud) { struct rspamd_task *task = ud; gchar *url_str = NULL; struct rspamd_url *query_url; gint rc; gboolean prefix_added; /* It is just a displayed URL, we should not check it for certain things */ url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED | RSPAMD_URL_FLAG_SUBJECT; if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen == 0) { return FALSE; } } rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false); /* We also search the query for additional url inside */ if (url->querylen > 0) { if (rspamd_url_find(task->task_pool, rspamd_url_query_unsafe(url), url->querylen, &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0(task->task_pool, sizeof(struct rspamd_url)); rc = rspamd_url_parse(query_url, url_str, strlen(url_str), task->task_pool, RSPAMD_URL_PARSE_TEXT); if (rc == URI_ERRNO_OK && url->hostlen > 0) { msg_debug_task("found url %s in query of url" " %*s", url_str, url->querylen, rspamd_url_query_unsafe(url)); if (prefix_added) { query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; } if (query_url->protocol == PROTOCOL_MAILTO) { if (query_url->userlen == 0) { return TRUE; } } rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), query_url, false); } } } return TRUE; } static inline khint_t rspamd_url_hash(struct rspamd_url *url) { if (url->urllen > 0) { return (khint_t) rspamd_cryptobox_fast_hash(url->string, url->urllen, rspamd_hash_seed()); } return 0; } static inline khint_t rspamd_url_host_hash(struct rspamd_url *url) { if (url->hostlen > 0) { return (khint_t) rspamd_cryptobox_fast_hash(rspamd_url_host_unsafe(url), url->hostlen, rspamd_hash_seed()); } return 0; } /* Compare two emails for building emails tree */ static inline bool rspamd_emails_cmp(struct rspamd_url *u1, struct rspamd_url *u2) { gint r; if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { return FALSE; } else { if ((r = rspamd_lc_cmp(rspamd_url_host_unsafe(u1), rspamd_url_host_unsafe(u2), u1->hostlen)) == 0) { if (u1->userlen != u2->userlen || u1->userlen == 0) { return FALSE; } else { return (rspamd_lc_cmp(rspamd_url_user_unsafe(u1), rspamd_url_user_unsafe(u2), u1->userlen) == 0); } } else { return r == 0; } } return FALSE; } static inline bool rspamd_urls_cmp(struct rspamd_url *u1, struct rspamd_url *u2) { int r = 0; if (u1->protocol != u2->protocol || u1->urllen != u2->urllen) { return false; } else { if (u1->protocol & PROTOCOL_MAILTO) { return rspamd_emails_cmp(u1, u2); } r = memcmp(u1->string, u2->string, u1->urllen); } return r == 0; } static inline bool rspamd_urls_host_cmp(struct rspamd_url *u1, struct rspamd_url *u2) { int r = 0; if (u1->hostlen != u2->hostlen) { return false; } else { r = memcmp(rspamd_url_host_unsafe(u1), rspamd_url_host_unsafe(u2), u1->hostlen); } return r == 0; } gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size) { gchar *d, ch, c, decoded; const gchar *s; enum { sw_usual = 0, sw_quoted, sw_quoted_second } state; d = dst; s = src; state = 0; decoded = 0; while (size--) { ch = *s++; switch (state) { case sw_usual: if (ch == '%') { state = sw_quoted; break; } else if (ch == '+') { *d++ = ' '; } else { *d++ = ch; } break; case sw_quoted: if (ch >= '0' && ch <= '9') { decoded = (ch - '0'); state = sw_quoted_second; break; } c = (ch | 0x20); if (c >= 'a' && c <= 'f') { decoded = (c - 'a' + 10); state = sw_quoted_second; break; } /* the invalid quoted character */ state = sw_usual; *d++ = ch; break; case sw_quoted_second: state = sw_usual; if (ch >= '0' && ch <= '9') { ch = ((decoded << 4) + ch - '0'); *d++ = ch; break; } c = (u_char) (ch | 0x20); if (c >= 'a' && c <= 'f') { ch = ((decoded << 4) + c - 'a' + 10); *d++ = ch; break; } /* the invalid quoted character */ break; } } return (d - dst); } enum rspamd_url_char_class { RSPAMD_URL_UNRESERVED = (1 << 0), RSPAMD_URL_SUBDELIM = (1 << 1), RSPAMD_URL_PATHSAFE = (1 << 2), RSPAMD_URL_QUERYSAFE = (1 << 3), RSPAMD_URL_FRAGMENTSAFE = (1 << 4), RSPAMD_URL_HOSTSAFE = (1 << 5), RSPAMD_URL_USERSAFE = (1 << 6), }; #define RSPAMD_URL_FLAGS_HOSTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_SUBDELIM) #define RSPAMD_URL_FLAGS_USERSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_USERSAFE | RSPAMD_URL_SUBDELIM) #define RSPAMD_URL_FLAGS_PATHSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_PATHSAFE | RSPAMD_URL_SUBDELIM) #define RSPAMD_URL_FLAGS_QUERYSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_SUBDELIM) #define RSPAMD_URL_FLAGS_FRAGMENTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_FRAGMENTSAFE | RSPAMD_URL_SUBDELIM) static const unsigned char rspamd_url_encoding_classes[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* */, RSPAMD_URL_SUBDELIM /* ! */, 0 /* " */, 0 /* # */, RSPAMD_URL_SUBDELIM /* $ */, 0 /* % */, RSPAMD_URL_SUBDELIM /* & */, RSPAMD_URL_SUBDELIM /* ' */, RSPAMD_URL_SUBDELIM /* ( */, RSPAMD_URL_SUBDELIM /* ) */, RSPAMD_URL_SUBDELIM /* * */, RSPAMD_URL_SUBDELIM /* + */, RSPAMD_URL_SUBDELIM /* , */, RSPAMD_URL_UNRESERVED /* - */, RSPAMD_URL_UNRESERVED /* . */, RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* / */, RSPAMD_URL_UNRESERVED /* 0 */, RSPAMD_URL_UNRESERVED /* 1 */, RSPAMD_URL_UNRESERVED /* 2 */, RSPAMD_URL_UNRESERVED /* 3 */, RSPAMD_URL_UNRESERVED /* 4 */, RSPAMD_URL_UNRESERVED /* 5 */, RSPAMD_URL_UNRESERVED /* 6 */, RSPAMD_URL_UNRESERVED /* 7 */, RSPAMD_URL_UNRESERVED /* 8 */, RSPAMD_URL_UNRESERVED /* 9 */, RSPAMD_URL_USERSAFE | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* : */, RSPAMD_URL_SUBDELIM /* ; */, 0 /* < */, RSPAMD_URL_SUBDELIM /* = */, 0 /* > */, RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* ? */, RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* @ */, RSPAMD_URL_UNRESERVED /* A */, RSPAMD_URL_UNRESERVED /* B */, RSPAMD_URL_UNRESERVED /* C */, RSPAMD_URL_UNRESERVED /* D */, RSPAMD_URL_UNRESERVED /* E */, RSPAMD_URL_UNRESERVED /* F */, RSPAMD_URL_UNRESERVED /* G */, RSPAMD_URL_UNRESERVED /* H */, RSPAMD_URL_UNRESERVED /* I */, RSPAMD_URL_UNRESERVED /* J */, RSPAMD_URL_UNRESERVED /* K */, RSPAMD_URL_UNRESERVED /* L */, RSPAMD_URL_UNRESERVED /* M */, RSPAMD_URL_UNRESERVED /* N */, RSPAMD_URL_UNRESERVED /* O */, RSPAMD_URL_UNRESERVED /* P */, RSPAMD_URL_UNRESERVED /* Q */, RSPAMD_URL_UNRESERVED /* R */, RSPAMD_URL_UNRESERVED /* S */, RSPAMD_URL_UNRESERVED /* T */, RSPAMD_URL_UNRESERVED /* U */, RSPAMD_URL_UNRESERVED /* V */, RSPAMD_URL_UNRESERVED /* W */, RSPAMD_URL_UNRESERVED /* X */, RSPAMD_URL_UNRESERVED /* Y */, RSPAMD_URL_UNRESERVED /* Z */, RSPAMD_URL_HOSTSAFE /* [ */, 0 /* \ */, RSPAMD_URL_HOSTSAFE /* ] */, 0 /* ^ */, RSPAMD_URL_UNRESERVED /* _ */, 0 /* ` */, RSPAMD_URL_UNRESERVED /* a */, RSPAMD_URL_UNRESERVED /* b */, RSPAMD_URL_UNRESERVED /* c */, RSPAMD_URL_UNRESERVED /* d */, RSPAMD_URL_UNRESERVED /* e */, RSPAMD_URL_UNRESERVED /* f */, RSPAMD_URL_UNRESERVED /* g */, RSPAMD_URL_UNRESERVED /* h */, RSPAMD_URL_UNRESERVED /* i */, RSPAMD_URL_UNRESERVED /* j */, RSPAMD_URL_UNRESERVED /* k */, RSPAMD_URL_UNRESERVED /* l */, RSPAMD_URL_UNRESERVED /* m */, RSPAMD_URL_UNRESERVED /* n */, RSPAMD_URL_UNRESERVED /* o */, RSPAMD_URL_UNRESERVED /* p */, RSPAMD_URL_UNRESERVED /* q */, RSPAMD_URL_UNRESERVED /* r */, RSPAMD_URL_UNRESERVED /* s */, RSPAMD_URL_UNRESERVED /* t */, RSPAMD_URL_UNRESERVED /* u */, RSPAMD_URL_UNRESERVED /* v */, RSPAMD_URL_UNRESERVED /* w */, RSPAMD_URL_UNRESERVED /* x */, RSPAMD_URL_UNRESERVED /* y */, RSPAMD_URL_UNRESERVED /* z */, 0 /* { */, 0 /* | */, 0 /* } */, RSPAMD_URL_UNRESERVED /* ~ */, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; #define CHECK_URL_COMPONENT(beg, len, flags) \ do { \ for (i = 0; i < (len); i++) { \ if ((rspamd_url_encoding_classes[(guchar) (beg)[i]] & (flags)) == 0) { \ dlen += 2; \ } \ } \ } while (0) #define ENCODE_URL_COMPONENT(beg, len, flags) \ do { \ for (i = 0; i < (len) && dend > d; i++) { \ if ((rspamd_url_encoding_classes[(guchar) (beg)[i]] & (flags)) == 0) { \ *d++ = '%'; \ *d++ = hexdigests[(guchar) ((beg)[i] >> 4) & 0xf]; \ *d++ = hexdigests[(guchar) (beg)[i] & 0xf]; \ } \ else { \ *d++ = (beg)[i]; \ } \ } \ } while (0) const gchar * rspamd_url_encode(struct rspamd_url *url, gsize *pdlen, rspamd_mempool_t *pool) { guchar *dest, *d, *dend; static const gchar hexdigests[16] = "0123456789ABCDEF"; guint i; gsize dlen = 0; g_assert(pdlen != NULL && url != NULL && pool != NULL); CHECK_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen, RSPAMD_URL_FLAGS_HOSTSAFE); CHECK_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen, RSPAMD_URL_FLAGS_USERSAFE); CHECK_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen, RSPAMD_URL_FLAGS_PATHSAFE); CHECK_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen, RSPAMD_URL_FLAGS_QUERYSAFE); CHECK_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen, RSPAMD_URL_FLAGS_FRAGMENTSAFE); if (dlen == 0) { *pdlen = url->urllen; return url->string; } /* Need to encode */ dlen += url->urllen + sizeof("telephone://"); /* Protocol hack */ dest = rspamd_mempool_alloc(pool, dlen + 1); d = dest; dend = d + dlen; if (url->protocollen > 0) { if (!(url->protocol & PROTOCOL_UNKNOWN)) { const gchar *known_proto = rspamd_url_protocol_name(url->protocol); d += rspamd_snprintf((gchar *) d, dend - d, "%s://", known_proto); } else { d += rspamd_snprintf((gchar *) d, dend - d, "%*s://", (gint) url->protocollen, url->string); } } else { d += rspamd_snprintf((gchar *) d, dend - d, "http://"); } if (url->userlen > 0) { ENCODE_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen, RSPAMD_URL_FLAGS_USERSAFE); *d++ = '@'; } ENCODE_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen, RSPAMD_URL_FLAGS_HOSTSAFE); if (url->datalen > 0) { *d++ = '/'; ENCODE_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen, RSPAMD_URL_FLAGS_PATHSAFE); } if (url->querylen > 0) { *d++ = '?'; ENCODE_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen, RSPAMD_URL_FLAGS_QUERYSAFE); } if (url->fragmentlen > 0) { *d++ = '#'; ENCODE_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen, RSPAMD_URL_FLAGS_FRAGMENTSAFE); } *pdlen = (d - dest); return (const gchar *) dest; } gboolean rspamd_url_is_domain(int c) { return is_domain((guchar) c); } const gchar * rspamd_url_protocol_name(enum rspamd_url_protocol proto) { const gchar *ret = "unknown"; switch (proto) { case PROTOCOL_HTTP: ret = "http"; break; case PROTOCOL_HTTPS: ret = "https"; break; case PROTOCOL_FTP: ret = "ftp"; break; case PROTOCOL_FILE: ret = "file"; break; case PROTOCOL_MAILTO: ret = "mailto"; break; case PROTOCOL_TELEPHONE: ret = "telephone"; break; default: break; } return ret; } enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str) { enum rspamd_url_protocol ret = PROTOCOL_UNKNOWN; if (strcmp(str, "http") == 0) { ret = PROTOCOL_HTTP; } else if (strcmp(str, "https") == 0) { ret = PROTOCOL_HTTPS; } else if (strcmp(str, "mailto") == 0) { ret = PROTOCOL_MAILTO; } else if (strcmp(str, "ftp") == 0) { ret = PROTOCOL_FTP; } else if (strcmp(str, "file") == 0) { ret = PROTOCOL_FILE; } else if (strcmp(str, "telephone") == 0) { ret = PROTOCOL_TELEPHONE; } return ret; } bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set, struct rspamd_url *u, bool enforce_replace) { khiter_t k; gint r; k = kh_get(rspamd_url_hash, set, u); if (k != kh_end(set)) { /* Existing url */ struct rspamd_url *ex = kh_key(set, k); #define SUSPICIOUS_URL_FLAGS (RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED | RSPAMD_URL_FLAG_ZW_SPACES) if (enforce_replace) { kh_key(set, k) = u; u->count++; } else { if (u->flags & SUSPICIOUS_URL_FLAGS) { if (!(ex->flags & SUSPICIOUS_URL_FLAGS)) { /* Propagate new url to an old one */ kh_key(set, k) = u; u->count++; } else { ex->count++; } } else { ex->count++; } } return false; } else { k = kh_put(rspamd_url_hash, set, u, &r); } return true; } struct rspamd_url * rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set, struct rspamd_url *u) { khiter_t k; gint r; if (set) { k = kh_get(rspamd_url_hash, set, u); if (k != kh_end(set)) { return kh_key(set, k); } else { k = kh_put(rspamd_url_hash, set, u, &r); return kh_key(set, k); } } return NULL; } bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u) { gint r; if (set) { kh_put(rspamd_url_host_hash, set, u, &r); if (r == 0) { return false; } return true; } return false; } bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u) { khiter_t k; if (set) { k = kh_get(rspamd_url_hash, set, u); if (k == kh_end(set)) { return false; } return true; } return false; } bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u) { khiter_t k; if (set) { k = kh_get(rspamd_url_host_hash, set, u); if (k == kh_end(set)) { return false; } return true; } return false; } bool rspamd_url_flag_from_string(const gchar *str, gint *flag) { gint h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, str, strlen(str), 0); for (int i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { if (url_flag_names[i].hash == h) { *flag |= url_flag_names[i].flag; return true; } } return false; } const gchar * rspamd_url_flag_to_string(int flag) { for (int i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { if (url_flag_names[i].flag & flag) { return url_flag_names[i].name; } } return NULL; } inline int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2) { int min_len = MIN(u1->urllen, u2->urllen); int r; if (u1->protocol != u2->protocol) { return u1->protocol - u2->protocol; } if (u1->protocol & PROTOCOL_MAILTO) { /* Emails specialisation (hosts must be compared in a case insensitive matter */ min_len = MIN(u1->hostlen, u2->hostlen); if ((r = rspamd_lc_cmp(rspamd_url_host_unsafe(u1), rspamd_url_host_unsafe(u2), min_len)) == 0) { if (u1->hostlen == u2->hostlen) { if (u1->userlen != u2->userlen || u1->userlen == 0) { r = (int) u1->userlen - (int) u2->userlen; } else { r = memcmp(rspamd_url_user_unsafe(u1), rspamd_url_user_unsafe(u2), u1->userlen); } } else { r = u1->hostlen - u2->hostlen; } } } else { if (u1->urllen != u2->urllen) { /* Different length, compare common part and then compare length */ r = memcmp(u1->string, u2->string, min_len); if (r == 0) { r = u1->urllen - u2->urllen; } } else { /* Equal length */ r = memcmp(u1->string, u2->string, u1->urllen); } } return r; } int rspamd_url_cmp_qsort(const void *_u1, const void *_u2) { const struct rspamd_url *u1 = *(struct rspamd_url **) _u1, *u2 = *(struct rspamd_url **) _u2; return rspamd_url_cmp(u1, u2); }