您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

typesniffer.go 4.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. // Copyright 2021 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package typesniffer
  4. import (
  5. "bytes"
  6. "fmt"
  7. "io"
  8. "net/http"
  9. "regexp"
  10. "strings"
  11. "code.gitea.io/gitea/modules/util"
  12. )
  13. // Use at most this many bytes to determine Content Type.
  14. const sniffLen = 1024
  15. const (
  16. // SvgMimeType MIME type of SVG images.
  17. SvgMimeType = "image/svg+xml"
  18. // ApplicationOctetStream MIME type of binary files.
  19. ApplicationOctetStream = "application/octet-stream"
  20. )
  21. var (
  22. svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
  23. svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
  24. svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
  25. )
  26. // SniffedType contains information about a blobs type.
  27. type SniffedType struct {
  28. contentType string
  29. }
  30. // IsText etects if content format is plain text.
  31. func (ct SniffedType) IsText() bool {
  32. return strings.Contains(ct.contentType, "text/")
  33. }
  34. // IsImage detects if data is an image format
  35. func (ct SniffedType) IsImage() bool {
  36. return strings.Contains(ct.contentType, "image/")
  37. }
  38. // IsSvgImage detects if data is an SVG image format
  39. func (ct SniffedType) IsSvgImage() bool {
  40. return strings.Contains(ct.contentType, SvgMimeType)
  41. }
  42. // IsPDF detects if data is a PDF format
  43. func (ct SniffedType) IsPDF() bool {
  44. return strings.Contains(ct.contentType, "application/pdf")
  45. }
  46. // IsVideo detects if data is an video format
  47. func (ct SniffedType) IsVideo() bool {
  48. return strings.Contains(ct.contentType, "video/")
  49. }
  50. // IsAudio detects if data is an video format
  51. func (ct SniffedType) IsAudio() bool {
  52. return strings.Contains(ct.contentType, "audio/")
  53. }
  54. // IsRepresentableAsText returns true if file content can be represented as
  55. // plain text or is empty.
  56. func (ct SniffedType) IsRepresentableAsText() bool {
  57. return ct.IsText() || ct.IsSvgImage()
  58. }
  59. // IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser
  60. func (ct SniffedType) IsBrowsableBinaryType() bool {
  61. return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
  62. }
  63. // GetMimeType returns the mime type
  64. func (ct SniffedType) GetMimeType() string {
  65. return strings.SplitN(ct.contentType, ";", 2)[0]
  66. }
  67. // DetectContentType extends http.DetectContentType with more content types. Defaults to text/unknown if input is empty.
  68. func DetectContentType(data []byte) SniffedType {
  69. if len(data) == 0 {
  70. return SniffedType{"text/unknown"}
  71. }
  72. ct := http.DetectContentType(data)
  73. if len(data) > sniffLen {
  74. data = data[:sniffLen]
  75. }
  76. // SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
  77. detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
  78. detectByXML := strings.Contains(ct, "text/xml")
  79. if detectByHTML || detectByXML {
  80. dataProcessed := svgComment.ReplaceAll(data, nil)
  81. dataProcessed = bytes.TrimSpace(dataProcessed)
  82. if detectByHTML && svgTagRegex.Match(dataProcessed) ||
  83. detectByXML && svgTagInXMLRegex.Match(dataProcessed) {
  84. ct = SvgMimeType
  85. }
  86. }
  87. if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
  88. // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
  89. // So remove the "ID3" prefix and detect again, if result is text, then it must be text content.
  90. // This works especially because audio files contain many unprintable/invalid characters like `0x00`
  91. ct2 := http.DetectContentType(data[3:])
  92. if strings.HasPrefix(ct2, "text/") {
  93. ct = ct2
  94. }
  95. }
  96. if ct == "application/ogg" {
  97. dataHead := data
  98. if len(dataHead) > 256 {
  99. dataHead = dataHead[:256] // only need to do a quick check for the file header
  100. }
  101. if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) {
  102. ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
  103. } else {
  104. ct = "audio/ogg" // for most cases, it is used as an audio container
  105. }
  106. }
  107. return SniffedType{ct}
  108. }
  109. // DetectContentTypeFromReader guesses the content type contained in the reader.
  110. func DetectContentTypeFromReader(r io.Reader) (SniffedType, error) {
  111. buf := make([]byte, sniffLen)
  112. n, err := util.ReadAtMost(r, buf)
  113. if err != nil {
  114. return SniffedType{}, fmt.Errorf("DetectContentTypeFromReader io error: %w", err)
  115. }
  116. buf = buf[:n]
  117. return DetectContentType(buf), nil
  118. }