Do not recognize text files as audio (#23355)

author wxiaoguang <wxiaoguang@gmail.com>

Wed, 8 Mar 2023 03:40:41 +0000 (11:40 +0800)

committer GitHub <noreply@github.com>

Wed, 8 Mar 2023 03:40:41 +0000 (22:40 -0500)
author wxiaoguang <wxiaoguang@gmail.com>
Wed, 8 Mar 2023 03:40:41 +0000 (11:40 +0800)
committer GitHub <noreply@github.com>
Wed, 8 Mar 2023 03:40:41 +0000 (22:40 -0500)
diff --git a/modules/typesniffer/typesniffer.go b/modules/typesniffer/typesniffer.go

index 5b215496b80c8f152d6185b07a9365b3aae041dd..7887fd42b72efb039b492413360d9e8e5a2fe9b1 100644 (file)
--- a/modules/typesniffer/typesniffer.go
+++ b/modules/typesniffer/typesniffer.go
@@ -106,6 +106,16 @@ func DetectContentType(data []byte) SniffedType {
                 }
         }
  
+       if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
+               // The MP3 detection is quite inaccurate: any content with an "ID3" prefix will result in "audio/mpeg".
+               // So remove the "ID3" prefix and detect again; if the result is text, then it must be text content.
+               // This works well because audio files contain many unprintable/invalid characters like `0x00`.
+               ct2 := http.DetectContentType(data[3:])
+               if strings.HasPrefix(ct2, "text/") {
+                       ct = ct2
+               }
+       }
+
         return SniffedType{ct}
  }
  
diff --git a/modules/typesniffer/typesniffer_test.go b/modules/typesniffer/typesniffer_test.go

index 2bafdffd141c305c364368b282c773177498db19..6c6da34aa006afd404db7f66fb36e93bde57153c 100644 (file)
--- a/modules/typesniffer/typesniffer_test.go
+++ b/modules/typesniffer/typesniffer_test.go
@@ -109,6 +109,10 @@ func TestIsAudio(t *testing.T) {
         mp3, _ := base64.StdEncoding.DecodeString("SUQzBAAAAAABAFRYWFgAAAASAAADbWFqb3JfYnJhbmQAbXA0MgBUWFhYAAAAEQAAA21pbm9yX3Zl")
         assert.True(t, DetectContentType(mp3).IsAudio())
         assert.False(t, DetectContentType([]byte("plain text")).IsAudio())
+
+       assert.True(t, DetectContentType([]byte("ID3Toy\000")).IsAudio())
+       assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ...")).IsText())          // test ID3 tag for plain text
+       assert.True(t, DetectContentType([]byte("ID3Toy\n====\t* hi 🌞, ..."+"🌛"[0:2])).IsText()) // test ID3 tag with incomplete UTF8 char
  }
  
  func TestDetectContentTypeFromReader(t *testing.T) {
author	wxiaoguang <wxiaoguang@gmail.com>
	Wed, 8 Mar 2023 03:40:41 +0000 (11:40 +0800)
committer	GitHub <noreply@github.com>
	Wed, 8 Mar 2023 03:40:41 +0000 (22:40 -0500)
modules/typesniffer/typesniffer.go		patch | blob | history
modules/typesniffer/typesniffer_test.go		patch | blob | history