]> source.dussan.org Git - gitea.git/commitdiff
Do not recognize text files as audio (#23355)
author wxiaoguang <wxiaoguang@gmail.com>
Wed, 8 Mar 2023 03:40:41 +0000 (11:40 +0800)
committer GitHub <noreply@github.com>
Wed, 8 Mar 2023 03:40:41 +0000 (22:40 -0500)
Close #17108

This PR uses a trick (removing the ID3 tag) to detect the content again
to see whether the content is actually a text type.

---------

Co-authored-by: delvh <dev.lh@web.de>
Co-authored-by: techknowlogick <techknowlogick@gitea.io>
Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
modules/typesniffer/typesniffer.go
modules/typesniffer/typesniffer_test.go

index 5b215496b80c8f152d6185b07a9365b3aae041dd..7887fd42b72efb039b492413360d9e8e5a2fe9b1 100644 (file)
@@ -106,6 +106,16 @@ func DetectContentType(data []byte) SniffedType {
                }
        }
 
+       if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
+               // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
+               // So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
+               // This works especially because audio files contain many unprintable/invalid characters like `0x00`
+               ct2 := http.DetectContentType(data[3:])
+               if strings.HasPrefix(ct2, "text/") {
+                       ct = ct2
+               }
+       }
+
        return SniffedType{ct}
 }
 
index 2bafdffd141c305c364368b282c773177498db19..6c6da34aa006afd404db7f66fb36e93bde57153c 100644 (file)
@@ -109,6 +109,10 @@ func TestIsAudio(t *testing.T) {
        mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
        assert.True(t, DetectContentType(mp3).IsAudio())
        assert.False(t, DetectContentType([]byte("plain text")).IsAudio())
+
+       assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio())
+       assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi ðŸŒž, ...")).IsText())          // test ID3 tag for plain text
+       assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi ðŸŒž, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
 }
 
 func TestDetectContentTypeFromReader(t *testing.T) {