diff options
Diffstat (limited to 'modules/typesniffer')
-rw-r--r-- | modules/typesniffer/typesniffer.go | 65 | ||||
-rw-r--r-- | modules/typesniffer/typesniffer_test.go | 16 |
2 files changed, 32 insertions, 49 deletions
diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go index 8cb3d278ce..2e8d9c4a1e 100644 --- a/modules/typesniffer/typesniffer.go +++ b/modules/typesniffer/typesniffer.go @@ -6,18 +6,14 @@ package typesniffer import ( "bytes" "encoding/binary" - "fmt" - "io" "net/http" "regexp" "slices" "strings" - - "code.gitea.io/gitea/modules/util" + "sync" ) -// Use at most this many bytes to determine Content Type. -const sniffLen = 1024 +const SniffContentSize = 1024 const ( MimeTypeImageSvg = "image/svg+xml" @@ -26,22 +22,30 @@ const ( MimeTypeApplicationOctetStream = "application/octet-stream" ) -var ( - svgComment = regexp.MustCompile(`(?s)<!--.*?-->`) - svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) - svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) -) - -// SniffedType contains information about a blobs type. +var globalVars = sync.OnceValue(func() (ret struct { + svgComment, svgTagRegex, svgTagInXMLRegex *regexp.Regexp +}, +) { + ret.svgComment = regexp.MustCompile(`(?s)<!--.*?-->`) + ret.svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) + ret.svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`) + return ret +}) + +// SniffedType contains information about a blob's type. type SniffedType struct { contentType string } -// IsText etects if content format is plain text. +// IsText detects if the content format is text family, including text/plain, text/html, text/css, etc. func (ct SniffedType) IsText() bool { return strings.Contains(ct.contentType, "text/") } +func (ct SniffedType) IsTextPlain() bool { + return strings.Contains(ct.contentType, "text/plain") +} + // IsImage detects if data is an image format func (ct SniffedType) IsImage() bool { return strings.Contains(ct.contentType, "image/") @@ -57,12 +61,12 @@ func (ct SniffedType) IsPDF() bool { return strings.Contains(ct.contentType, "application/pdf") } -// IsVideo detects if data is an video format +// IsVideo detects if data is a video format func (ct SniffedType) IsVideo() bool { return strings.Contains(ct.contentType, "video/") } -// IsAudio detects if data is an video format +// IsAudio detects if data is a video format func (ct SniffedType) IsAudio() bool { return strings.Contains(ct.contentType, "audio/") } @@ -103,33 +107,34 @@ func detectFileTypeBox(data []byte) (brands []string, found bool) { return brands, true } -// DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty. +// DetectContentType extends http.DetectContentType with more content types. Defaults to text/plain if input is empty. func DetectContentType(data []byte) SniffedType { if len(data) == 0 { - return SniffedType{"text/unknown"} + return SniffedType{"text/plain"} } ct := http.DetectContentType(data) - if len(data) > sniffLen { - data = data[:sniffLen] + if len(data) > SniffContentSize { + data = data[:SniffContentSize] } + vars := globalVars() // SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888 detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html") detectByXML := strings.Contains(ct, "text/xml") if detectByHTML || detectByXML { - dataProcessed := svgComment.ReplaceAll(data, nil) + dataProcessed := vars.svgComment.ReplaceAll(data, nil) dataProcessed = bytes.TrimSpace(dataProcessed) - if detectByHTML && svgTagRegex.Match(dataProcessed) || - detectByXML && svgTagInXMLRegex.Match(dataProcessed) { + if detectByHTML && vars.svgTagRegex.Match(dataProcessed) || + detectByXML && vars.svgTagInXMLRegex.Match(dataProcessed) { ct = MimeTypeImageSvg } } if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) { // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg". - // So remove the "ID3" prefix and detect again, if result is text, then it must be text content. + // So remove the "ID3" prefix and detect again, then if the result is "text", it must be text content. // This works especially because audio files contain many unprintable/invalid characters like `0x00` ct2 := http.DetectContentType(data[3:]) if strings.HasPrefix(ct2, "text/") { @@ -155,15 +160,3 @@ func DetectContentType(data []byte) SniffedType { } return SniffedType{ct} } - -// DetectContentTypeFromReader guesses the content type contained in the reader. -func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) { - buf := make([]byte, sniffLen) - n, err := util.ReadAtMost(r, buf) - if err != nil { - return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err) - } - buf = buf[:n] - - return DetectContentType(buf), nil -} diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go index 3e5db3308b..a0c824b912 100644 --- a/modules/typesniffer/typesniffer_test.go +++ b/modules/typesniffer/typesniffer_test.go @@ -4,7 +4,6 @@ package typesniffer import ( - "bytes" "encoding/base64" "encoding/hex" "strings" @@ -17,7 +16,7 @@ func TestDetectContentTypeLongerThanSniffLen(t *testing.T) { // Pre-condition: Shorter than sniffLen detects SVG. assert.Equal(t, "image/svg+xml", DetectContentType([]byte(`<!-- Comment --><svg></svg>`)).contentType) // Longer than sniffLen detects something else. - assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", sniffLen)+` --><svg></svg>`)).contentType) + assert.NotEqual(t, "image/svg+xml", DetectContentType([]byte(`<!-- `+strings.Repeat("x", SniffContentSize)+` --><svg></svg>`)).contentType) } func TestIsTextFile(t *testing.T) { @@ -116,22 +115,13 @@ func TestIsAudio(t *testing.T) { assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char } -func TestDetectContentTypeFromReader(t *testing.T) { - mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl") - st, err := DetectContentTypeFromReader(bytes.NewReader(mp3)) - assert.NoError(t, err) - assert.True(t, st.IsAudio()) -} - func TestDetectContentTypeOgg(t *testing.T) { oggAudio, _ := hex.DecodeString("4f67675300020000000000000000352f0000000000007dc39163011e01766f72626973000000000244ac0000000000000071020000000000b8014f6767530000") - st, err := DetectContentTypeFromReader(bytes.NewReader(oggAudio)) - assert.NoError(t, err) + st := DetectContentType(oggAudio) assert.True(t, st.IsAudio()) oggVideo, _ := hex.DecodeString("4f676753000200000000000000007d9747ef000000009b59daf3012a807468656f7261030201001e00110001e000010e00020000001e00000001000001000001") - st, err = DetectContentTypeFromReader(bytes.NewReader(oggVideo)) - assert.NoError(t, err) + st = DetectContentType(oggVideo) assert.True(t, st.IsVideo()) } |