aboutsummaryrefslogtreecommitdiffstats
path: root/modules/typesniffer
diff options
context:
space:
mode:
Diffstat (limited to 'modules/typesniffer')
-rw-r--r-- modules/typesniffer/typesniffer.go 65
-rw-r--r-- modules/typesniffer/typesniffer_test.go 16
2 files changed, 32 insertions, 49 deletions
diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go
index 8cb3d278ce..2e8d9c4a1e 100644
--- a/modules/typesniffer/typesniffer.go
+++ b/modules/typesniffer/typesniffer.go
@@ -6,18 +6,14 @@ package typesniffer
import (
"bytes"
"encoding/binary"
- "fmt"
- "io"
"net/http"
"regexp"
"slices"
"strings"
-
- "code.gitea.io/gitea/modules/util"
+ "sync"
)
-// Use at most this many bytes to determine Content Type.
-const sniffLen = 1024
+const SniffContentSize = 1024
const (
MimeTypeImageSvg = "image/svg+xml"
@@ -26,22 +22,30 @@ const (
MimeTypeApplicationOctetStream = "application/octet-stream"
)
-var (
- svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
- svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
- svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
-)
-
-// SniffedType contains information about a blobs type.
+var globalVars = sync.OnceValue(func() (ret struct {
+ svgComment, svgTagRegex, svgTagInXMLRegex *regexp.Regexp
+},
+) {
+ ret.svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
+ ret.svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
+ ret.svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
+ return ret
+})
+
+// SniffedType contains information about a blob's type.
type SniffedType struct {
contentType string
}
-// IsText etects if content format is plain text.
+// IsText detects if the content type belongs to the text family, including text/plain, text/html, text/css, etc.
func (ct SniffedType) IsText() bool {
return strings.Contains(ct.contentType, "text/")
}
+func (ct SniffedType) IsTextPlain() bool {
+ return strings.Contains(ct.contentType, "text/plain")
+}
+
// IsImage detects if data is an image format
func (ct SniffedType) IsImage() bool {
return strings.Contains(ct.contentType, "image/")
@@ -57,12 +61,12 @@ func (ct SniffedType) IsPDF() bool {
return strings.Contains(ct.contentType, "application/pdf")
}
-// IsVideo detects if data is an video format
+// IsVideo detects if data is a video format
func (ct SniffedType) IsVideo() bool {
return strings.Contains(ct.contentType, "video/")
}
-// IsAudio detects if data is an video format
+// IsAudio detects if data is an audio format
func (ct SniffedType) IsAudio() bool {
return strings.Contains(ct.contentType, "audio/")
}
@@ -103,33 +107,34 @@ func detectFileTypeBox(data []byte) (brands []string, found bool) {
return brands, true
}
-// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
+// DetectContentType extends http.DetectContentType with more content types. Defaults to text/plain if input is empty.
func DetectContentType(data []byte) SniffedType {
if len(data) == 0 {
- return SniffedType{"text/unknown"}
+ return SniffedType{"text/plain"}
}
ct := http.DetectContentType(data)
- if len(data) > sniffLen {
- data = data[:sniffLen]
+ if len(data) > SniffContentSize {
+ data = data[:SniffContentSize]
}
+ vars := globalVars()
// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
detectByXML := strings.Contains(ct, "text/xml")
if detectByHTML || detectByXML {
- dataProcessed := svgComment.ReplaceAll(data, nil)
+ dataProcessed := vars.svgComment.ReplaceAll(data, nil)
dataProcessed = bytes.TrimSpace(dataProcessed)
- if detectByHTML && svgTagRegex.Match(dataProcessed) ||
- detectByXML && svgTagInXMLRegex.Match(dataProcessed) {
+ if detectByHTML && vars.svgTagRegex.Match(dataProcessed) ||
+ detectByXML && vars.svgTagInXMLRegex.Match(dataProcessed) {
ct = MimeTypeImageSvg
}
}
if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
- // So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
+ // So remove the "ID3" prefix and detect again, then if the result is "text", it must be text content.
// This works especially because audio files contain many unprintable/invalid characters like `0x00`
ct2 := http.DetectContentType(data[3:])
if strings.HasPrefix(ct2, "text/") {
@@ -155,15 +160,3 @@ func DetectContentType(data []byte) SniffedType {
}
return SniffedType{ct}
}
-
-// DetectContentTypeFromReader guesses the content type contained in the reader.
-func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
- buf := make([]byte, sniffLen)
- n, err := util.ReadAtMost(r, buf)
- if err != nil {
- return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err)
- }
- buf = buf[:n]
-
- return DetectContentType(buf), nil
-}
diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go
index 3e5db3308b..a0c824b912 100644
--- a/modules/typesniffer/typesniffer_test.go
+++ b/modules/typesniffer/typesniffer_test.go
@@ -4,7 +4,6 @@
package typesniffer
import (
- "bytes"
"encoding/base64"
"encoding/hex"
"strings"
@@ -17,7 +16,7 @@ func TestDetectContentTypeLongerThanSniffLen(t *testing.T) {
// Pre-condition: Shorter than sniffLen detects SVG.
assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)).contentType)
// Longer than sniffLen detects something else.
- assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", sniffLen)+` --><svg></svg>`)).contentType)
+ assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", SniffContentSize)+` --><svg></svg>`)).contentType)
}
func TestIsTextFile(t *testing.T) {
@@ -116,22 +115,13 @@ func TestIsAudio(t *testing.T) {
assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
}
-func TestDetectContentTypeFromReader(t *testing.T) {
- mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
- st, err := DetectContentTypeFromReader(bytes.NewReader(mp3))
- assert.NoError(t, err)
- assert.True(t, st.IsAudio())
-}
-
func TestDetectContentTypeOgg(t *testing.T) {
oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000")
- st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio))
- assert.NoError(t, err)
+ st := DetectContentType(oggAudio)
assert.True(t, st.IsAudio())
oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001")
- st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo))
- assert.NoError(t, err)
+ st = DetectContentType(oggVideo)
assert.True(t, st.IsVideo())
}