summaryrefslogtreecommitdiffstats
path: root/modules/charset/charset.go
diff options
context:
space:
mode:
authorzeripath <art27@cantab.net>2020-06-02 23:20:19 +0100
committerGitHub <noreply@github.com>2020-06-02 19:20:19 -0300
commita1ad188326f9af633d2be0920a140275a4972bfe (patch)
treef8d3df4a5e43b9e4db91947e7948520f27a89a50 /modules/charset/charset.go
parentfe2cacf5ea2e371c4e74f003ee594767c16028fa (diff)
downloadgitea-a1ad188326f9af633d2be0920a140275a4972bfe.tar.gz
gitea-a1ad188326f9af633d2be0920a140275a4972bfe.zip
Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io>
Diffstat (limited to 'modules/charset/charset.go')
-rw-r--r--modules/charset/charset.go35
1 files changed, 31 insertions, 4 deletions
diff --git a/modules/charset/charset.go b/modules/charset/charset.go
index 3d3d1664f9..a7e427db99 100644
--- a/modules/charset/charset.go
+++ b/modules/charset/charset.go
@@ -7,6 +7,7 @@ package charset
import (
"bytes"
"fmt"
+ "strings"
"unicode/utf8"
"code.gitea.io/gitea/modules/log"
@@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
} else {
detectContent = content
}
- result, err := textDetector.DetectBest(detectContent)
+
+ // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
+ results, err := textDetector.DetectAll(detectContent)
if err != nil {
+ if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
+ log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
+ return setting.Repository.AnsiCharset, nil
+ }
return "", err
}
+
+ topConfidence := results[0].Confidence
+ topResult := results[0]
+ priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
+ for _, result := range results {
+ // As results are sorted in confidence order - if we have a different confidence
+ // we know it's less than the current confidence and can break out of the loop early
+ if result.Confidence != topConfidence {
+ break
+ }
+
+ // Otherwise check if this result is earlier in the DetectedCharsetOrder than our current top guess
+ resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
+ if resultHas && (!has || resultPriority < priority) {
+ topResult = result
+ priority = resultPriority
+ has = true
+ }
+ }
+
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
- if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
+ if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
return setting.Repository.AnsiCharset, err
}
- log.Debug("Detected encoding: %s", result.Charset)
- return result.Charset, err
+ log.Debug("Detected encoding: %s", topResult.Charset)
+ return topResult.Charset, err
}