From 9d99f6ab19ac3f97af3ca126720e9075c127a652 Mon Sep 17 00:00:00 2001 From: Lunny Xiao Date: Tue, 20 Apr 2021 06:25:08 +0800 Subject: Refactor renders (#15175) * Refactor renders * Some performance optimization * Fix comment * Transform reader * Fix csv test * Fix test * Fix tests * Improve optimization * Fix test * Fix test * Detect file encoding with reader * Improve optimization * reduce memory usage * improve code * fix build * Fix test * Fix for go1.15 * Fix render * Fix comment * Fix lint * Fix test * Don't use NormalEOF when unnecessary * revert change on util.go * Apply suggestions from code review Co-authored-by: zeripath * rename function * Take NormalEOF back Co-authored-by: zeripath --- modules/charset/charset.go | 49 +++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'modules/charset') diff --git a/modules/charset/charset.go b/modules/charset/charset.go index a7e427db99..3000864c2e 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -7,6 +7,8 @@ package charset import ( "bytes" "fmt" + "io" + "io/ioutil" "strings" "unicode/utf8" @@ -21,6 +23,33 @@ import ( // UTF8BOM is the utf-8 byte-order marker var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'} +// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible +func ToUTF8WithFallbackReader(rd io.Reader) io.Reader { + var buf = make([]byte, 2048) + n, err := rd.Read(buf) + if err != nil { + return rd + } + + charsetLabel, err := DetectEncoding(buf[:n]) + if err != nil || charsetLabel == "UTF-8" { + return io.MultiReader(bytes.NewReader(RemoveBOMIfPresent(buf[:n])), rd) + } + + encoding, _ := charset.Lookup(charsetLabel) + if encoding == nil { + return io.MultiReader(bytes.NewReader(buf[:n]), rd) + } + + return transform.NewReader( + io.MultiReader( + bytes.NewReader(RemoveBOMIfPresent(buf[:n])), + rd, + ), + encoding.NewDecoder(), + ) +} + // ToUTF8WithErr converts content to UTF8 
encoding func ToUTF8WithErr(content []byte) (string, error) { charsetLabel, err := DetectEncoding(content) @@ -49,24 +78,8 @@ func ToUTF8WithErr(content []byte) (string, error) { // ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible func ToUTF8WithFallback(content []byte) []byte { - charsetLabel, err := DetectEncoding(content) - if err != nil || charsetLabel == "UTF-8" { - return RemoveBOMIfPresent(content) - } - - encoding, _ := charset.Lookup(charsetLabel) - if encoding == nil { - return content - } - - // If there is an error, we concatenate the nicely decoded part and the - // original left over. This way we won't lose data. - result, n, err := transform.Bytes(encoding.NewDecoder(), content) - if err != nil { - return append(result, content[n:]...) - } - - return RemoveBOMIfPresent(result) + bs, _ := ioutil.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content))) + return bs } // ToUTF8 converts content to UTF8 encoding and ignore error -- cgit v1.2.3