diff options
author | zeripath <art27@cantab.net> | 2020-06-02 23:20:19 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-02 19:20:19 -0300 |
commit | a1ad188326f9af633d2be0920a140275a4972bfe (patch) | |
tree | f8d3df4a5e43b9e4db91947e7948520f27a89a50 /modules/charset/charset.go | |
parent | fe2cacf5ea2e371c4e74f003ee594767c16028fa (diff) | |
download | gitea-a1ad188326f9af633d2be0920a140275a4972bfe.tar.gz gitea-a1ad188326f9af633d2be0920a140275a4972bfe.zip |
Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option
Signed-off-by: Andrew Thornton <art27@cantab.net>
* minor fixes
Signed-off-by: Andrew Thornton <art27@cantab.net>
* remove log
Signed-off-by: Andrew Thornton <art27@cantab.net>
* remove log2
Signed-off-by: Andrew Thornton <art27@cantab.net>
* only iterate through top results
Signed-off-by: Andrew Thornton <art27@cantab.net>
* Update docs/content/doc/advanced/config-cheat-sheet.en-us.md
* slight restructure of for loop
Signed-off-by: Andrew Thornton <art27@cantab.net>
Co-authored-by: techknowlogick <techknowlogick@gitea.io>
Diffstat (limited to 'modules/charset/charset.go')
-rw-r--r-- | modules/charset/charset.go | 35 |
1 files changed, 31 insertions, 4 deletions
diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 3d3d1664f9..a7e427db99 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -7,6 +7,7 @@ package charset import ( "bytes" "fmt" + "strings" "unicode/utf8" "code.gitea.io/gitea/modules/log" @@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) { } else { detectContent = content } - result, err := textDetector.DetectBest(detectContent) + + // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break + results, err := textDetector.DetectAll(detectContent) if err != nil { + if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { + log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + return setting.Repository.AnsiCharset, nil + } return "", err } + + topConfidence := results[0].Confidence + topResult := results[0] + priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))] + for _, result := range results { + // As results are sorted in confidence order - if we have a different confidence + // we know it's less than the current confidence and can break out of the loop early + if result.Confidence != topConfidence { + break + } + + // Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guesss + resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))] + if resultHas && (!has || resultPriority < priority) { + topResult = result + priority = resultPriority + has = true + } + } + // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument - if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { + if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) return setting.Repository.AnsiCharset, err } - log.Debug("Detected encoding: %s", result.Charset) - return result.Charset, err + log.Debug("Detected encoding: %s", topResult.Charset) + return topResult.Charset, err } |