summaryrefslogtreecommitdiffstats
path: root/modules/charset
diff options
context:
space:
mode:
author Lunny Xiao <xiaolunwen@gmail.com> 2021-04-20 06:25:08 +0800
committer GitHub <noreply@github.com> 2021-04-19 18:25:08 -0400
commit 9d99f6ab19ac3f97af3ca126720e9075c127a652 (patch)
tree b817b4582a871f83b91ad7977fe772fc3501c1e8 /modules/charset
parent c9cc6698d2172625854cd063301e63602204a2a1 (diff)
download gitea-9d99f6ab19ac3f97af3ca126720e9075c127a652.tar.gz
gitea-9d99f6ab19ac3f97af3ca126720e9075c127a652.zip
Refactor renders (#15175)
* Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimization * Fix test * Fix test * Detect file encoding with reader * Improve optimization * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath <art27@cantab.net> * rename function * Take NormalEOF back Co-authored-by: zeripath <art27@cantab.net>
Diffstat (limited to 'modules/charset')
-rw-r--r-- modules/charset/charset.go 49
1 files changed, 31 insertions, 18 deletions
diff --git a/modules/charset/charset.go b/modules/charset/charset.go
index a7e427db99..3000864c2e 100644
--- a/modules/charset/charset.go
+++ b/modules/charset/charset.go
@@ -7,6 +7,8 @@ package charset
import (
"bytes"
"fmt"
+ "io"
+ "io/ioutil"
"strings"
"unicode/utf8"
@@ -21,6 +23,33 @@ import (
// UTF8BOM is the utf-8 byte-order marker
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}
+// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible. Detection uses only the bytes returned by a single rd.Read into a 2048-byte buffer; the returned reader replays that sample followed by the unread remainder of rd. If that first Read reports an error, rd is returned unchanged.
+func ToUTF8WithFallbackReader(rd io.Reader) io.Reader {
+ var buf = make([]byte, 2048) // detection sample buffer; at most 2048 bytes are ever inspected
+ n, err := rd.Read(buf) // one Read call only: n may be smaller than len(buf) even when more data follows
+ if err != nil { // NOTE(review): an io.Reader may return n > 0 together with err; those n bytes are not replayed on this path — confirm callers' readers never do that
+ return rd // hand the reader back as-is; no detection or decoding is applied
+ }
+
+ charsetLabel, err := DetectEncoding(buf[:n]) // detect from the sampled prefix only
+ if err != nil || charsetLabel == "UTF-8" { // detection failed or content is already UTF-8: no decoder is needed
+ return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) // sample (passed through RemoveBOMIfPresent) first, then the rest of rd
+ }
+
+ encoding, _ := charset.Lookup(charsetLabel) // second return value is discarded; only a nil encoding is checked
+ if encoding == nil { // no encoding found for the detected label
+ return io.MultiReader(bytes.NewReader(buf[:n]), rd) // pass the bytes through undecoded; RemoveBOMIfPresent is not applied on this path
+ }
+
+ return transform.NewReader( // wrap sample + remainder so the whole stream goes through the decoder
+ io.MultiReader(
+ bytes.NewReader(RemoveBOMIfPresent(buf[:n])), // sample is passed through RemoveBOMIfPresent before decoding
+ rd, // unread remainder of the original reader
+ ),
+ encoding.NewDecoder(), // decoder for the detected charset
+ )
+}
+
// ToUTF8WithErr converts content to UTF8 encoding
func ToUTF8WithErr(content []byte) (string, error) {
charsetLabel, err := DetectEncoding(content)
@@ -49,24 +78,8 @@ func ToUTF8WithErr(content []byte) (string, error) {
// ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
- charsetLabel, err := DetectEncoding(content)
- if err != nil || charsetLabel == "UTF-8" {
- return RemoveBOMIfPresent(content)
- }
-
- encoding, _ := charset.Lookup(charsetLabel)
- if encoding == nil {
- return content
- }
-
- // If there is an error, we concatenate the nicely decoded part and the
- // original left over. This way we won't lose data.
- result, n, err := transform.Bytes(encoding.NewDecoder(), content)
- if err != nil {
- return append(result, content[n:]...)
- }
-
- return RemoveBOMIfPresent(result)
+ bs, _ := ioutil.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content)))
+ return bs
}
// ToUTF8 converts content to UTF8 encoding and ignore error