Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

segment_words.rl 8.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. // Copyright (c) 2015 Couchbase, Inc.
  2. // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
  3. // except in compliance with the License. You may obtain a copy of the License at
  4. // http://www.apache.org/licenses/LICENSE-2.0
  5. // Unless required by applicable law or agreed to in writing, software distributed under the
  6. // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  7. // either express or implied. See the License for the specific language governing permissions
  8. // and limitations under the License.
  9. // +build BUILDTAGS
  10. package segment
  11. import (
  12. "fmt"
  13. "unicode/utf8"
  14. )
  15. var RagelFlags = "RAGELFLAGS"
  16. var ParseError = fmt.Errorf("unicode word segmentation parse error")
  // Word types reported by segmentWords, one entry per returned segment.
  const (
  	// None marks segments emitted by the finishNoneToken action: regional
  	// indicator runs, line breaks, extend/format runs and any other rune.
  	None = iota
  	// Number marks segments matched by the WordNumeric pattern.
  	Number
  	// Letter marks segments matched by the Word and WordHangul patterns.
  	Letter
  	// Kana is declared but is not appended anywhere in segmentWords as it
  	// appears in this file.
  	Kana
  	// Ideo marks segments matched by WordHan, WordHiragana and WordKatakana.
  	Ideo
  )
  // Ragel directive block: names the state machine "s" and asks Ragel to write
  // the machine's generated data (including the s_first_final constant that
  // segmentWords checks) at this point in the output.
  25. %%{
  26. machine s;
  27. write data;
  28. }%%
  // segmentWords runs the Ragel scanner defined in its body over data and
  // appends every segment it accepts to val, together with that segment's word
  // type (None, Number, Letter or Ideo) at the same index in types.
  //
  // Parameters:
  //   data       - the bytes to segment.
  //   maxTokens  - when > 0, the function returns as soon as len(val) >= maxTokens;
  //                when < 0, result slices allocated here get capacity 1000.
  //   atEOF      - false means more input may follow data; the finish* actions
  //                then decline to emit tokens that might still be extended.
  //   val, types - slices to append results to; each is allocated here when nil.
  //
  // Returns val, types, the number of leading bytes of data covered by the
  // emitted tokens, and an error. The error is ParseError when the machine ends
  // in a non-final state; every early return from an action reports nil.
  29. func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
  // cs is the machine state, p the current byte index and pe the end of input.
  30. cs, p, pe := 0, 0, len(data)
  // Capacity used only when val/types must be allocated below. Note that this
  // local shadows the builtin cap for the rest of the function.
  31. cap := maxTokens
  32. if cap < 0 {
  33. cap = 1000
  34. }
  35. if val == nil {
  36. val = make([][]byte, 0, cap)
  37. }
  38. if types == nil {
  39. types = make([]int, 0, cap)
  40. }
  41. // added for scanner
  // ts, te and act are the bookkeeping variables a Ragel scanner (|* ... *|)
  // expects to find in scope; eof marks the end-of-input position.
  42. ts := 0
  43. te := 0
  44. act := 0
  45. eof := pe
  // Blank assignments keep the Go compiler from rejecting the variables as
  // unused if the generated code does not reference all of them.
  46. _ = ts // compiler not happy
  47. _ = te
  48. _ = act
  49. // our state
  // startPos/endPos delimit the token being built (endPos is the index of its
  // last byte); totalConsumed is the byte count reported back to the caller.
  50. startPos := 0
  51. endPos := 0
  52. totalConsumed := 0
  53. %%{
  54. include SCRIPTS "ragel/uscript.rl";
  55. include WB "ragel/uwb.rl";
  56. action startToken {
  // Remember the byte index at which the current candidate token begins.
  57. startPos = p
  58. }
  59. action endToken {
  // Remember the current byte index; the finish* actions treat it as the
  // inclusive last byte of the token (they slice data[startPos:endPos+1]).
  60. endPos = p
  61. }
  62. action finishNumericToken {
  // While more input may still arrive, do not emit a numeric token: hand back
  // what has been collected so far.
  // NOTE(review): unlike the Hangul/Katakana/Han/Hiragana actions, this returns
  // whenever atEOF is false, not only when the token touches the end of data;
  // confirm that is intended.
  63. if !atEOF {
  64. return val, types, totalConsumed, nil
  65. }
  66. val = append(val, data[startPos:endPos+1])
  67. types = append(types, Number)
  68. totalConsumed = endPos+1
  // Stop early once the caller's token limit has been reached.
  69. if maxTokens > 0 && len(val) >= maxTokens {
  70. return val, types, totalConsumed, nil
  71. }
  72. }
  73. action finishHangulToken {
  // Do not emit if the token runs to the end of data while more input may
  // follow, or if the bytes right after it start with an invalid UTF-8
  // sequence (RuneError with size 1). An empty remainder decodes with size 0,
  // so the second test does not fire at the true end of input.
  74. if endPos+1 == pe && !atEOF {
  75. return val, types, totalConsumed, nil
  76. } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
  77. return val, types, totalConsumed, nil
  78. }
  // Hangul runs are reported with the Letter type.
  79. val = append(val, data[startPos:endPos+1])
  80. types = append(types, Letter)
  81. totalConsumed = endPos+1
  82. if maxTokens > 0 && len(val) >= maxTokens {
  83. return val, types, totalConsumed, nil
  84. }
  85. }
  86. action finishKatakanaToken {
  // Same end-of-buffer / invalid-UTF-8 guard as the other script-run actions.
  87. if endPos+1 == pe && !atEOF {
  88. return val, types, totalConsumed, nil
  89. } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
  90. return val, types, totalConsumed, nil
  91. }
  // NOTE(review): Katakana runs are typed Ideo even though a Kana constant
  // exists; confirm this is deliberate.
  92. val = append(val, data[startPos:endPos+1])
  93. types = append(types, Ideo)
  94. totalConsumed = endPos+1
  95. if maxTokens > 0 && len(val) >= maxTokens {
  96. return val, types, totalConsumed, nil
  97. }
  98. }
  99. action finishWordToken {
  // While more input may still arrive, do not emit a word token.
  // NOTE(review): as in finishNumericToken, this returns for any match when
  // atEOF is false, regardless of where the match ended; confirm intended.
  100. if !atEOF {
  101. return val, types, totalConsumed, nil
  102. }
  103. val = append(val, data[startPos:endPos+1])
  104. types = append(types, Letter)
  105. totalConsumed = endPos+1
  106. if maxTokens > 0 && len(val) >= maxTokens {
  107. return val, types, totalConsumed, nil
  108. }
  109. }
  110. action finishHanToken {
  // Bail out if the token reaches the end of data with more input pending, or
  // if the following bytes begin with an invalid UTF-8 sequence.
  111. if endPos+1 == pe && !atEOF {
  112. return val, types, totalConsumed, nil
  113. } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
  114. return val, types, totalConsumed, nil
  115. }
  // Han tokens are reported with the Ideo type.
  116. val = append(val, data[startPos:endPos+1])
  117. types = append(types, Ideo)
  118. totalConsumed = endPos+1
  119. if maxTokens > 0 && len(val) >= maxTokens {
  120. return val, types, totalConsumed, nil
  121. }
  122. }
  123. action finishHiraganaToken {
  // Bail out if the token reaches the end of data with more input pending, or
  // if the following bytes begin with an invalid UTF-8 sequence.
  124. if endPos+1 == pe && !atEOF {
  125. return val, types, totalConsumed, nil
  126. } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
  127. return val, types, totalConsumed, nil
  128. }
  // Hiragana tokens are reported with the Ideo type.
  129. val = append(val, data[startPos:endPos+1])
  130. types = append(types, Ideo)
  131. totalConsumed = endPos+1
  132. if maxTokens > 0 && len(val) >= maxTokens {
  133. return val, types, totalConsumed, nil
  134. }
  135. }
  136. action finishNoneToken {
  // Walk forward rune by rune from startPos until the cursor passes endPos, so
  // that the token ends on the last byte of a whole rune rather than inside one.
  137. lastPos := startPos
  138. for lastPos <= endPos {
  139. _, size := utf8.DecodeRune(data[lastPos:])
  140. lastPos += size
  141. }
  // Move both endPos and the scanner position p to that rune boundary.
  142. endPos = lastPos -1
  143. p = endPos
  // Same end-of-buffer / invalid-UTF-8 guard as the script-run actions.
  144. if endPos+1 == pe && !atEOF {
  145. return val, types, totalConsumed, nil
  146. } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
  147. return val, types, totalConsumed, nil
  148. }
  149. // otherwise, consume this as well
  150. val = append(val, data[startPos:endPos+1])
  151. types = append(types, None)
  152. totalConsumed = endPos+1
  153. if maxTokens > 0 && len(val) >= maxTokens {
  154. return val, types, totalConsumed, nil
  155. }
  156. }
  157. HangulEx = Hangul ( Extend | Format )*;
  158. HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
  159. NumericEx = Numeric ( Extend | Format )*;
  160. KatakanaEx = Katakana ( Extend | Format )*;
  161. MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
  162. MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
  163. ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
  164. HanEx = Han ( Extend | Format )*;
  165. HiraganaEx = Hiragana ( Extend | Format )*;
  166. SingleQuoteEx = Single_Quote ( Extend | Format )*;
  167. DoubleQuoteEx = Double_Quote ( Extend | Format )*;
  168. HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
  169. RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
  170. NLCRLF = Newline | CR | LF;
  171. OtherEx = ^(NLCRLF) ( Extend | Format )* ;
  172. # UAX#29 WB8. Numeric × Numeric
  173. # WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
  174. # WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
  175. # WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
  176. # WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
  177. #
  178. WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
  179. # subset of the below for typing purposes only!
  180. WordHangul = ( HangulEx )+ >startToken @endToken;
  181. WordKatakana = ( KatakanaEx )+ >startToken @endToken;
  182. # UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
  183. # WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
  184. # WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
  185. # WB7a. Hebrew_Letter × Single_Quote
  186. # WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
  187. # WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
  188. # WB9. (ALetter | Hebrew_Letter) × Numeric
  189. # WB10. Numeric × (ALetter | Hebrew_Letter)
  190. # WB13. Katakana × Katakana
  191. # WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
  192. # WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
  193. #
  194. # Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
  195. #
  196. Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
  197. | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
  198. | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
  199. | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
  200. |ExtendNumLetEx
  201. )+
  202. )
  203. (
  204. ( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
  205. | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
  206. | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
  207. | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
  208. )+
  209. )
  210. )* ExtendNumLetEx*) >startToken @endToken;
  211. # UAX#29 WB14. Any ÷ Any
  212. WordHan = HanEx >startToken @endToken;
  213. WordHiragana = HiraganaEx >startToken @endToken;
  214. WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
  215. WordCRLF = (CR LF) >startToken @endToken;
  216. WordCR = CR >startToken @endToken;
  217. WordLF = LF >startToken @endToken;
  218. WordNL = Newline >startToken @endToken;
  219. WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
  220. Other = OtherEx >startToken @endToken;
  221. main := |*
  222. WordNumeric => finishNumericToken;
  223. WordHangul => finishHangulToken;
  224. WordKatakana => finishKatakanaToken;
  225. Word => finishWordToken;
  226. WordHan => finishHanToken;
  227. WordHiragana => finishHiraganaToken;
  228. WordRegional =>finishNoneToken;
  229. WordCRLF => finishNoneToken;
  230. WordCR => finishNoneToken;
  231. WordLF => finishNoneToken;
  232. WordNL => finishNoneToken;
  233. WordExt => finishNoneToken;
  234. Other => finishNoneToken;
  235. *|;
  236. write init;
  237. write exec;
  238. }%%
  // A final state below s_first_final means the machine did not end in an
  // accepting state: report a parse error along with whatever was collected.
  239. if cs < s_first_final {
  240. return val, types, totalConsumed, ParseError
  241. }
  242. return val, types, totalConsumed, nil
  243. }