aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorzeripath <art27@cantab.net>2020-06-02 23:20:19 +0100
committerGitHub <noreply@github.com>2020-06-02 19:20:19 -0300
commita1ad188326f9af633d2be0920a140275a4972bfe (patch)
treef8d3df4a5e43b9e4db91947e7948520f27a89a50
parentfe2cacf5ea2e371c4e74f003ee594767c16028fa (diff)
downloadgitea-a1ad188326f9af633d2be0920a140275a4972bfe.tar.gz
gitea-a1ad188326f9af633d2be0920a140275a4972bfe.zip
Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io>
-rw-r--r--custom/conf/app.ini.sample7
-rw-r--r--docs/content/doc/advanced/config-cheat-sheet.en-us.md3
-rw-r--r--modules/charset/charset.go35
-rw-r--r--modules/charset/charset_test.go4
-rw-r--r--modules/setting/repository.go74
5 files changed, 117 insertions, 6 deletions
diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample
index 5e150172d5..4f5529edfa 100644
--- a/custom/conf/app.ini.sample
+++ b/custom/conf/app.ini.sample
@@ -14,7 +14,12 @@ RUN_MODE = dev
[repository]
ROOT =
SCRIPT_TYPE = bash
-; Default ANSI charset
+; DETECTED_CHARSETS_ORDER tie-break order for detected charsets.
+; If the charsets have equal confidence, tie-breaking will be done by order in this list
+; with charsets earlier in the list chosen in preference to those later.
+; Adding "defaults" will place the unused charsets at that position.
+DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr
+; Default ANSI charset to override non-UTF-8 charsets to
ANSI_CHARSET =
; Force every new repository to be private
FORCE_PRIVATE = false
diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
index f0908c22a3..c29151f648 100644
--- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md
+++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md
@@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
an absolute path.
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`,
but some users report that only `sh` is available.
-- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset.
+- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point.
+- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to.
- `FORCE_PRIVATE`: **false**: Force every new repository to be private.
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository.
\[last, private, public\]
diff --git a/modules/charset/charset.go b/modules/charset/charset.go
index 3d3d1664f9..a7e427db99 100644
--- a/modules/charset/charset.go
+++ b/modules/charset/charset.go
@@ -7,6 +7,7 @@ package charset
import (
"bytes"
"fmt"
+ "strings"
"unicode/utf8"
"code.gitea.io/gitea/modules/log"
@@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
} else {
detectContent = content
}
- result, err := textDetector.DetectBest(detectContent)
+
+ // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
+ results, err := textDetector.DetectAll(detectContent)
if err != nil {
+ if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
+ log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
+ return setting.Repository.AnsiCharset, nil
+ }
return "", err
}
+
+ topConfidence := results[0].Confidence
+ topResult := results[0]
+ priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
+ for _, result := range results {
+ // As results are sorted in confidence order - if we have a different confidence
+ // we know it's less than the current confidence and can break out of the loop early
+ if result.Confidence != topConfidence {
+ break
+ }
+
+ // Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess
+ resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
+ if resultHas && (!has || resultPriority < priority) {
+ topResult = result
+ priority = resultPriority
+ has = true
+ }
+ }
+
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
- if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
+ if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
return setting.Repository.AnsiCharset, err
}
- log.Debug("Detected encoding: %s", result.Charset)
- return result.Charset, err
+ log.Debug("Detected encoding: %s", topResult.Charset)
+ return topResult.Charset, err
}
diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go
index a81a6e03ee..394a42c71f 100644
--- a/modules/charset/charset_test.go
+++ b/modules/charset/charset_test.go
@@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) {
// we accept either.
assert.Contains(t, encoding, "ISO-8859")
+ old := setting.Repository.AnsiCharset
setting.Repository.AnsiCharset = "placeholder"
+ defer func() {
+ setting.Repository.AnsiCharset = old
+ }()
testSuccess(b, "placeholder")
// invalid bytes
diff --git a/modules/setting/repository.go b/modules/setting/repository.go
index 8af3eaaf46..1796a8d6b5 100644
--- a/modules/setting/repository.go
+++ b/modules/setting/repository.go
@@ -24,6 +24,8 @@ const (
// Repository settings
var (
Repository = struct {
+ DetectedCharsetsOrder []string
+ DetectedCharsetScore map[string]int `ini:"-"`
AnsiCharset string
ForcePrivate bool
DefaultPrivate string
@@ -88,6 +90,42 @@ var (
Wiki []string
} `ini:"repository.signing"`
}{
+ DetectedCharsetsOrder: []string{
+ "UTF-8",
+ "UTF-16BE",
+ "UTF-16LE",
+ "UTF-32BE",
+ "UTF-32LE",
+ "ISO-8859-1",
+ "windows-1252",
+ "ISO-8859-2",
+ "windows-1250",
+ "ISO-8859-5",
+ "ISO-8859-6",
+ "ISO-8859-7",
+ "windows-1253",
+ "ISO-8859-8-I",
+ "windows-1255",
+ "ISO-8859-8",
+ "windows-1251",
+ "windows-1256",
+ "KOI8-R",
+ "ISO-8859-9",
+ "windows-1254",
+ "Shift_JIS",
+ "GB18030",
+ "EUC-JP",
+ "EUC-KR",
+ "Big5",
+ "ISO-2022-JP",
+ "ISO-2022-KR",
+ "ISO-2022-CN",
+ "IBM424_rtl",
+ "IBM424_ltr",
+ "IBM420_rtl",
+ "IBM420_ltr",
+ },
+ DetectedCharsetScore: map[string]int{},
AnsiCharset: "",
ForcePrivate: false,
DefaultPrivate: RepoCreatingLastUserVisibility,
@@ -208,6 +246,10 @@ func newRepository() {
} else {
RepoRootPath = filepath.Clean(RepoRootPath)
}
+ defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder))
+ for _, charset := range Repository.DetectedCharsetsOrder {
+ defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
+ }
ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash")
if err = Cfg.Section("repository").MapTo(&Repository); err != nil {
@@ -222,6 +264,38 @@ func newRepository() {
log.Fatal("Failed to map Repository.PullRequest settings: %v", err)
}
+ preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder))
+ for _, charset := range Repository.DetectedCharsetsOrder {
+ canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
+ preferred = append(preferred, canonicalCharset)
+ // remove it from the defaults
+ for i, charset := range defaultDetectedCharsetsOrder {
+ if charset == canonicalCharset {
+ defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...)
+ break
+ }
+ }
+ }
+
+ i := 0
+ for _, charset := range preferred {
+ // Add the defaults
+ if charset == "defaults" {
+ for _, charset := range defaultDetectedCharsetsOrder {
+ canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
+ if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has {
+ Repository.DetectedCharsetScore[canonicalCharset] = i
+ i++
+ }
+ }
+ continue
+ }
+ if _, has := Repository.DetectedCharsetScore[charset]; !has {
+ Repository.DetectedCharsetScore[charset] = i
+ i++
+ }
+ }
+
if !filepath.IsAbs(Repository.Upload.TempPath) {
Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath)
}