You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2022.go 2.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. package chardet
  2. import (
  3. "bytes"
  4. )
  // recognizer2022 pairs a charset name with the set of escape sequences used
  // to recognize that charset in a byte stream.
  type recognizer2022 struct {
  	// charset is the name reported in Match results.
  	charset string
  	// escapes lists the byte sequences looked for immediately after an ESC
  	// (0x1B) byte; the ESC byte itself is not part of any entry.
  	escapes [][]byte
  }
  9. func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
  10. return recognizerOutput{
  11. Charset: r.charset,
  12. Confidence: r.matchConfidence(input.input),
  13. }
  14. }
  15. func (r *recognizer2022) matchConfidence(input []byte) int {
  16. var hits, misses, shifts int
  17. input:
  18. for i := 0; i < len(input); i++ {
  19. c := input[i]
  20. if c == 0x1B {
  21. for _, esc := range r.escapes {
  22. if bytes.HasPrefix(input[i+1:], esc) {
  23. hits++
  24. i += len(esc)
  25. continue input
  26. }
  27. }
  28. misses++
  29. } else if c == 0x0E || c == 0x0F {
  30. shifts++
  31. }
  32. }
  33. if hits == 0 {
  34. return 0
  35. }
  36. quality := (100*hits - 100*misses) / (hits + misses)
  37. if hits+shifts < 5 {
  38. quality -= (5 - (hits + shifts)) * 10
  39. }
  40. if quality < 0 {
  41. quality = 0
  42. }
  43. return quality
  44. }
  // escapeSequences_2022JP lists the escape sequences recognized for
  // ISO-2022-JP. Each entry is the part of the sequence that follows the ESC
  // (0x1B) byte, written here as the ASCII characters those bytes spell.
  var escapeSequences_2022JP = [][]byte{
  	{'$', '(', 'C'}, // ESC $ ( C: KS X 1001:1992
  	{'$', '(', 'D'}, // ESC $ ( D: JIS X 212-1990
  	{'$', '@'},      // ESC $ @: JIS C 6226-1978
  	{'$', 'A'},      // ESC $ A: GB 2312-80
  	{'$', 'B'},      // ESC $ B: JIS X 208-1983
  	{'&', '@'},      // ESC & @: JIS X 208 1990, 1997
  	{'(', 'B'},      // ESC ( B: ASCII
  	{'(', 'H'},      // ESC ( H: JIS-Roman
  	{'(', 'I'},      // ESC ( I: Half-width katakana
  	{'(', 'J'},      // ESC ( J: JIS-Roman
  	{'.', 'A'},      // ESC . A: ISO 8859-1
  	{'.', 'F'},      // ESC . F: ISO 8859-7
  }
  // escapeSequences_2022KR lists the escape sequences recognized for
  // ISO-2022-KR. The single entry is the part of the sequence that follows the
  // ESC (0x1B) byte.
  var escapeSequences_2022KR = [][]byte{
  	{'$', ')', 'C'}, // ESC $ ) C
  }
  // escapeSequences_2022CN lists the escape sequences recognized for
  // ISO-2022-CN. Each entry is the part of the sequence that follows the ESC
  // (0x1B) byte, written here as the ASCII characters those bytes spell.
  var escapeSequences_2022CN = [][]byte{
  	{'$', ')', 'A'}, // ESC $ ) A: GB 2312-80
  	{'$', ')', 'G'}, // ESC $ ) G: CNS 11643-1992 Plane 1
  	{'$', '*', 'H'}, // ESC $ * H: CNS 11643-1992 Plane 2
  	{'$', ')', 'E'}, // ESC $ ) E: ISO-IR-165
  	{'$', '+', 'I'}, // ESC $ + I: CNS 11643-1992 Plane 3
  	{'$', '+', 'J'}, // ESC $ + J: CNS 11643-1992 Plane 4
  	{'$', '+', 'K'}, // ESC $ + K: CNS 11643-1992 Plane 5
  	{'$', '+', 'L'}, // ESC $ + L: CNS 11643-1992 Plane 6
  	{'$', '+', 'M'}, // ESC $ + M: CNS 11643-1992 Plane 7
  	{'N'},           // ESC N: SS2
  	{'O'},           // ESC O: SS3
  }
  75. func newRecognizer_2022JP() *recognizer2022 {
  76. return &recognizer2022{
  77. "ISO-2022-JP",
  78. escapeSequences_2022JP,
  79. }
  80. }
  81. func newRecognizer_2022KR() *recognizer2022 {
  82. return &recognizer2022{
  83. "ISO-2022-KR",
  84. escapeSequences_2022KR,
  85. }
  86. }
  87. func newRecognizer_2022CN() *recognizer2022 {
  88. return &recognizer2022{
  89. "ISO-2022-CN",
  90. escapeSequences_2022CN,
  91. }
  92. }