diff options
Diffstat (limited to 'modules/charset/charset.go')
-rw-r--r-- | modules/charset/charset.go | 43 |
1 file changed, 22 insertions, 21 deletions
diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 51152142a5..1855446a98 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -22,17 +22,21 @@ import ( // UTF8BOM is the utf-8 byte-order marker var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} +type ConvertOpts struct { + KeepBOM bool +} + // ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible -func ToUTF8WithFallbackReader(rd io.Reader) io.Reader { +func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { buf := make([]byte, 2048) n, err := util.ReadAtMost(rd, buf) if err != nil { - return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) + return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) } charsetLabel, err := DetectEncoding(buf[:n]) if err != nil || charsetLabel == "UTF-8" { - return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) + return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) } encoding, _ := charset.Lookup(charsetLabel) @@ -42,20 +46,20 @@ func ToUTF8WithFallbackReader(rd io.Reader) io.Reader { return transform.NewReader( io.MultiReader( - bytes.NewReader(RemoveBOMIfPresent(buf[:n])), + bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd, ), encoding.NewDecoder(), ) } -// ToUTF8WithErr converts content to UTF8 encoding -func ToUTF8WithErr(content []byte) (string, error) { +// ToUTF8 converts content to UTF8 encoding +func ToUTF8(content []byte, opts ConvertOpts) (string, error) { charsetLabel, err := DetectEncoding(content) if err != nil { return "", err } else if charsetLabel == "UTF-8" { - return string(RemoveBOMIfPresent(content)), nil + return string(MaybeRemoveBOM(content, opts)), nil } encoding, _ := charset.Lookup(charsetLabel) @@ -70,28 +74,22 @@ func ToUTF8WithErr(content []byte) (string, error) { result = append(result, content[n:]...) 
} - result = RemoveBOMIfPresent(result) + result = MaybeRemoveBOM(result, opts) return string(result), err } // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible -func ToUTF8WithFallback(content []byte) []byte { - bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content))) +func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte { + bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts)) return bs } -// ToUTF8 converts content to UTF8 encoding and ignore error -func ToUTF8(content string) string { - res, _ := ToUTF8WithErr([]byte(content)) - return res -} - // ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible -func ToUTF8DropErrors(content []byte) []byte { +func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte { charsetLabel, err := DetectEncoding(content) if err != nil || charsetLabel == "UTF-8" { - return RemoveBOMIfPresent(content) + return MaybeRemoveBOM(content, opts) } encoding, _ := charset.Lookup(charsetLabel) @@ -117,11 +115,14 @@ func ToUTF8DropErrors(content []byte) []byte { } } - return RemoveBOMIfPresent(decoded) + return MaybeRemoveBOM(decoded, opts) } -// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte -func RemoveBOMIfPresent(content []byte) []byte { +// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false +func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte { + if opts.KeepBOM { + return content + } if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) { return content[3:] } |