You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

count.go 2.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. // Copyright 2015 Huan Du. All rights reserved.
  2. // Licensed under the MIT license that can be found in the LICENSE file.
  3. package xstrings
  4. import (
  5. "unicode"
  6. "unicode/utf8"
  7. )
  8. // Len returns str's utf8 rune length.
  9. func Len(str string) int {
  10. return utf8.RuneCountInString(str)
  11. }
  12. // WordCount returns number of words in a string.
  13. //
  14. // Word is defined as a locale dependent string containing alphabetic characters,
  15. // which may also contain but not start with `'` and `-` characters.
  16. func WordCount(str string) int {
  17. var r rune
  18. var size, n int
  19. inWord := false
  20. for len(str) > 0 {
  21. r, size = utf8.DecodeRuneInString(str)
  22. switch {
  23. case isAlphabet(r):
  24. if !inWord {
  25. inWord = true
  26. n++
  27. }
  28. case inWord && (r == '\'' || r == '-'):
  29. // Still in word.
  30. default:
  31. inWord = false
  32. }
  33. str = str[size:]
  34. }
  35. return n
  36. }
  37. const minCJKCharacter = '\u3400'
  38. // Checks r is a letter but not CJK character.
  39. func isAlphabet(r rune) bool {
  40. if !unicode.IsLetter(r) {
  41. return false
  42. }
  43. switch {
  44. // Quick check for non-CJK character.
  45. case r < minCJKCharacter:
  46. return true
  47. // Common CJK characters.
  48. case r >= '\u4E00' && r <= '\u9FCC':
  49. return false
  50. // Rare CJK characters.
  51. case r >= '\u3400' && r <= '\u4D85':
  52. return false
  53. // Rare and historic CJK characters.
  54. case r >= '\U00020000' && r <= '\U0002B81D':
  55. return false
  56. }
  57. return true
  58. }
  59. // Width returns string width in monotype font.
  60. // Multi-byte characters are usually twice the width of single byte characters.
  61. //
  62. // Algorithm comes from `mb_strwidth` in PHP.
  63. // http://php.net/manual/en/function.mb-strwidth.php
  64. func Width(str string) int {
  65. var r rune
  66. var size, n int
  67. for len(str) > 0 {
  68. r, size = utf8.DecodeRuneInString(str)
  69. n += RuneWidth(r)
  70. str = str[size:]
  71. }
  72. return n
  73. }
  74. // RuneWidth returns character width in monotype font.
  75. // Multi-byte characters are usually twice the width of single byte characters.
  76. //
  77. // Algorithm comes from `mb_strwidth` in PHP.
  78. // http://php.net/manual/en/function.mb-strwidth.php
  79. func RuneWidth(r rune) int {
  80. switch {
  81. case r == utf8.RuneError || r < '\x20':
  82. return 0
  83. case '\x20' <= r && r < '\u2000':
  84. return 1
  85. case '\u2000' <= r && r < '\uFF61':
  86. return 2
  87. case '\uFF61' <= r && r < '\uFFA0':
  88. return 1
  89. case '\uFFA0' <= r:
  90. return 2
  91. }
  92. return 0
  93. }