Update emoji regex (#11584)

When matching emoji, use a regex built from the data we have instead of a generic one based on Unicode ranges. A generic regex can't tell the difference between two separate emoji next to each other and a single emoji that is built out of two separate emoji next to each other. This means that emoji placed next to each other without a space in between will now be accurately spanned individually, with the proper title, etc.
author: mrsdizzie <info@mrsdizzie.com> 2020-05-29 12:08:36 -0400
committer: GitHub <noreply@github.com> 2020-05-29 17:08:36 +0100
commit: 4c1ff57f1a41197bb6f6797d33461c76378e354c (patch)
tree: 5f1fc2fa51ccfefe937353c036b76566712f0fd5 /modules/emoji
parent: 02fa329a7c2190d947cd5e02ea90d2d4406653be (diff)
download: gitea-4c1ff57f1a41197bb6f6797d33461c76378e354c.tar.gz
gitea-4c1ff57f1a41197bb6f6797d33461c76378e354c.zip
1 files changed, 27 insertions, 0 deletions
diff --git a/modules/emoji/emoji.go b/modules/emoji/emoji.go
index 2a51e61fcf..e4b9e5631d 100644
--- a/modules/emoji/emoji.go
+++ b/modules/emoji/emoji.go
@@ -6,8 +6,10 @@
 package emoji
 
 import (
+	"sort"
 	"strings"
 	"sync"
+	"unicode/utf8"
 )
 
 // Gemoji is a set of emoji data.
@@ -48,6 +50,12 @@ func loadMap() {
 		// process emoji codes and aliases
 		codePairs := make([]string, 0)
 		aliasPairs := make([]string, 0)
+
+		// sort from longest to shortest so that combined emoji are matched first
+		sort.Slice(GemojiData, func(i, j int) bool {
+			return len(GemojiData[i].Emoji) > len(GemojiData[j].Emoji)
+		})
+
 		for i, e := range GemojiData {
 			if e.Emoji == "" || len(e.Aliases) == 0 {
 				continue
@@ -72,6 +80,7 @@ func loadMap() {
 		codeReplacer = strings.NewReplacer(codePairs...)
 		aliasReplacer = strings.NewReplacer(aliasPairs...)
 	})
+
 }
 
 // FromCode retrieves the emoji data based on the provided unicode code (ie,
@@ -117,3 +126,21 @@ func ReplaceAliases(s string) string {
 	loadMap()
 	return aliasReplacer.Replace(s)
 }
+
+// FindEmojiSubmatchIndex returns the [start, end) byte offsets in s of the first GemojiData entry whose Emoji occurs in s, or nil when s holds only single-byte runes or nothing matches.
+func FindEmojiSubmatchIndex(s string) []int {
+	loadMap()
+
+	// fast path: if the rune count and byte length are the same, every rune is a single byte, so no emoji will be present
+	// per the original author: similar performance when there is unicode present but almost 200% faster when not
+	if utf8.RuneCountInString(s) == len(s) {
+		return nil
+	}
+	for j := range GemojiData { // slice order; loadMap sorts GemojiData longest-first so combined emoji are tried before their parts
+		i := strings.Index(s, GemojiData[j].Emoji) // byte offset of the first occurrence, or -1 if absent
+		if i != -1 {
+			return []int{i, i + len(GemojiData[j].Emoji)} // end offset is exclusive
+		}
+	}
+	return nil
+}
author	mrsdizzie <info@mrsdizzie.com>	2020-05-29 12:08:36 -0400
committer	GitHub <noreply@github.com>	2020-05-29 17:08:36 +0100
commit	4c1ff57f1a41197bb6f6797d33461c76378e354c (patch)
tree	5f1fc2fa51ccfefe937353c036b76566712f0fd5 /modules/emoji
parent	02fa329a7c2190d947cd5e02ea90d2d4406653be (diff)
download	gitea-4c1ff57f1a41197bb6f6797d33461c76378e354c.tar.gz gitea-4c1ff57f1a41197bb6f6797d33461c76378e354c.zip