aboutsummaryrefslogtreecommitdiffstats
path: root/modules/typesniffer/typesniffer.go
blob: 2e8d9c4a1e7273c830093ea3c10cd9fbe6e1864b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package typesniffer

import (
	"bytes"
	"encoding/binary"
	"net/http"
	"regexp"
	"slices"
	"strings"
	"sync"
)

const SniffContentSize = 1024

const (
	MimeTypeImageSvg  = "image/svg+xml"
	MimeTypeImageAvif = "image/avif"

	MimeTypeApplicationOctetStream = "application/octet-stream"
)

var globalVars = sync.OnceValue(func() (ret struct {
	svgComment, svgTagRegex, svgTagInXMLRegex *regexp.Regexp
},
) {
	ret.svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
	ret.svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	ret.svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
	return ret
})

// SniffedType contains information about a blob's type.
type SniffedType struct {
	contentType string
}

// IsText detects if the content format is text family, including text/plain, text/html, text/css, etc.
func (ct SniffedType) IsText() bool {
	return strings.Contains(ct.contentType, "text/")
}

func (ct SniffedType) IsTextPlain() bool {
	return strings.Contains(ct.contentType, "text/plain")
}

// IsImage detects if data is an image format
func (ct SniffedType) IsImage() bool {
	return strings.Contains(ct.contentType, "image/")
}

// IsSvgImage detects if data is an SVG image format
func (ct SniffedType) IsSvgImage() bool {
	return strings.Contains(ct.contentType, MimeTypeImageSvg)
}

// IsPDF detects if data is a PDF format
func (ct SniffedType) IsPDF() bool {
	return strings.Contains(ct.contentType, "application/pdf")
}

// IsVideo detects if data is a video format
func (ct SniffedType) IsVideo() bool {
	return strings.Contains(ct.contentType, "video/")
}

// IsAudio detects if data is a video format
func (ct SniffedType) IsAudio() bool {
	return strings.Contains(ct.contentType, "audio/")
}

// IsRepresentableAsText returns true if file content can be represented as
// plain text or is empty.
func (ct SniffedType) IsRepresentableAsText() bool {
	return ct.IsText() || ct.IsSvgImage()
}

// IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser
func (ct SniffedType) IsBrowsableBinaryType() bool {
	return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
}

// GetMimeType returns the mime type
func (ct SniffedType) GetMimeType() string {
	return strings.SplitN(ct.contentType, ";", 2)[0]
}

// https://en.wikipedia.org/wiki/ISO_base_media_file_format#File_type_box
func detectFileTypeBox(data []byte) (brands []string, found bool) {
	if len(data) < 12 {
		return nil, false
	}
	boxSize := int(binary.BigEndian.Uint32(data[:4]))
	if boxSize < 12 || boxSize > len(data) {
		return nil, false
	}
	tag := string(data[4:8])
	if tag != "ftyp" {
		return nil, false
	}
	brands = append(brands, string(data[8:12]))
	for i := 16; i+4 <= boxSize; i += 4 {
		brands = append(brands, string(data[i:i+4]))
	}
	return brands, true
}

// DetectContentType extends http.DetectContentType with more content types. Defaults to text/plain if input is empty.
func DetectContentType(data []byte) SniffedType {
	if len(data) == 0 {
		return SniffedType{"text/plain"}
	}

	ct := http.DetectContentType(data)

	if len(data) > SniffContentSize {
		data = data[:SniffContentSize]
	}

	vars := globalVars()
	// SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
	detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
	detectByXML := strings.Contains(ct, "text/xml")
	if detectByHTML || detectByXML {
		dataProcessed := vars.svgComment.ReplaceAll(data, nil)
		dataProcessed = bytes.TrimSpace(dataProcessed)
		if detectByHTML && vars.svgTagRegex.Match(dataProcessed) ||
			detectByXML && vars.svgTagInXMLRegex.Match(dataProcessed) {
			ct = MimeTypeImageSvg
		}
	}

	if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
		// The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
		// So remove the "ID3" prefix and detect again, then if the result is "text", it must be text content.
		// This works especially because audio files contain many unprintable/invalid characters like `0x00`
		ct2 := http.DetectContentType(data[3:])
		if strings.HasPrefix(ct2, "text/") {
			ct = ct2
		}
	}

	fileTypeBrands, found := detectFileTypeBox(data)
	if found && slices.Contains(fileTypeBrands, "avif") {
		ct = MimeTypeImageAvif
	}

	if ct == "application/ogg" {
		dataHead := data
		if len(dataHead) > 256 {
			dataHead = dataHead[:256] // only need to do a quick check for the file header
		}
		if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) {
			ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
		} else {
			ct = "audio/ogg" // for most cases, it is used as an audio container
		}
	}
	return SniffedType{ct}
}