You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

env.go 6.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. package snowballstem
  2. import (
  3. "log"
  4. "strings"
  5. "unicode/utf8"
  6. )
  // Env is the Snowball execution environment: the string being processed
  // together with the byte offsets that the runtime methods read and update.
  type Env struct {
  	// current is the text being processed.
  	current string

  	// Cursor is the byte offset into current at which matching takes place.
  	Cursor int
  	// Limit is the offset at which forward operations stop.
  	Limit int
  	// LimitBackward is the offset at which backward operations stop.
  	LimitBackward int

  	// Bra and Ket are the start and end offsets of the slice that SliceTo
  	// returns and SliceFrom replaces.
  	Bra, Ket int
  }
  16. // NewEnv creates a new Snowball execution environment on the provided string
  17. func NewEnv(val string) *Env {
  18. return &Env{
  19. current: val,
  20. Cursor: 0,
  21. Limit: len(val),
  22. LimitBackward: 0,
  23. Bra: 0,
  24. Ket: len(val),
  25. }
  26. }
  27. func (env *Env) Current() string {
  28. return env.current
  29. }
  30. func (env *Env) SetCurrent(s string) {
  31. env.current = s
  32. env.Cursor = 0
  33. env.Limit = len(s)
  34. env.LimitBackward = 0
  35. env.Bra = 0
  36. env.Ket = len(s)
  37. }
  38. func (env *Env) ReplaceS(bra, ket int, s string) int32 {
  39. adjustment := int32(len(s)) - (int32(ket) - int32(bra))
  40. result, _ := splitAt(env.current, bra)
  41. rsplit := ket
  42. if ket < bra {
  43. rsplit = bra
  44. }
  45. _, rhs := splitAt(env.current, rsplit)
  46. result += s
  47. result += rhs
  48. newLim := int32(env.Limit) + adjustment
  49. env.Limit = int(newLim)
  50. if env.Cursor >= ket {
  51. newCur := int32(env.Cursor) + adjustment
  52. env.Cursor = int(newCur)
  53. } else if env.Cursor > bra {
  54. env.Cursor = bra
  55. }
  56. env.current = result
  57. return adjustment
  58. }
  59. func (env *Env) EqS(s string) bool {
  60. if env.Cursor >= env.Limit {
  61. return false
  62. }
  63. if strings.HasPrefix(env.current[env.Cursor:], s) {
  64. env.Cursor += len(s)
  65. for !onCharBoundary(env.current, env.Cursor) {
  66. env.Cursor++
  67. }
  68. return true
  69. }
  70. return false
  71. }
  72. func (env *Env) EqSB(s string) bool {
  73. if int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) {
  74. return false
  75. } else if !onCharBoundary(env.current, env.Cursor-len(s)) ||
  76. !strings.HasPrefix(env.current[env.Cursor-len(s):], s) {
  77. return false
  78. } else {
  79. env.Cursor -= len(s)
  80. return true
  81. }
  82. }
  83. func (env *Env) SliceFrom(s string) bool {
  84. bra, ket := env.Bra, env.Ket
  85. env.ReplaceS(bra, ket, s)
  86. return true
  87. }
  88. func (env *Env) NextChar() {
  89. env.Cursor++
  90. for !onCharBoundary(env.current, env.Cursor) {
  91. env.Cursor++
  92. }
  93. }
  94. func (env *Env) PrevChar() {
  95. env.Cursor--
  96. for !onCharBoundary(env.current, env.Cursor) {
  97. env.Cursor--
  98. }
  99. }
  100. func (env *Env) ByteIndexForHop(delta int32) int32 {
  101. if delta > 0 {
  102. res := env.Cursor
  103. for delta > 0 {
  104. res++
  105. delta--
  106. for res <= len(env.current) && !onCharBoundary(env.current, res) {
  107. res++
  108. }
  109. }
  110. return int32(res)
  111. } else if delta < 0 {
  112. res := env.Cursor
  113. for delta < 0 {
  114. res--
  115. delta++
  116. for res >= 0 && !onCharBoundary(env.current, res) {
  117. res--
  118. }
  119. }
  120. return int32(res)
  121. } else {
  122. return int32(env.Cursor)
  123. }
  124. }
  125. func (env *Env) InGrouping(chars []byte, min, max int32) bool {
  126. if env.Cursor >= env.Limit {
  127. return false
  128. }
  129. r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
  130. if r != utf8.RuneError {
  131. if r > max || r < min {
  132. return false
  133. }
  134. r -= min
  135. if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
  136. return false
  137. }
  138. env.NextChar()
  139. return true
  140. }
  141. return false
  142. }
  143. func (env *Env) InGroupingB(chars []byte, min, max int32) bool {
  144. if env.Cursor <= env.LimitBackward {
  145. return false
  146. }
  147. env.PrevChar()
  148. r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
  149. if r != utf8.RuneError {
  150. env.NextChar()
  151. if r > max || r < min {
  152. return false
  153. }
  154. r -= min
  155. if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
  156. return false
  157. }
  158. env.PrevChar()
  159. return true
  160. }
  161. return false
  162. }
  163. func (env *Env) OutGrouping(chars []byte, min, max int32) bool {
  164. if env.Cursor >= env.Limit {
  165. return false
  166. }
  167. r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
  168. if r != utf8.RuneError {
  169. if r > max || r < min {
  170. env.NextChar()
  171. return true
  172. }
  173. r -= min
  174. if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
  175. env.NextChar()
  176. return true
  177. }
  178. }
  179. return false
  180. }
  181. func (env *Env) OutGroupingB(chars []byte, min, max int32) bool {
  182. if env.Cursor <= env.LimitBackward {
  183. return false
  184. }
  185. env.PrevChar()
  186. r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
  187. if r != utf8.RuneError {
  188. env.NextChar()
  189. if r > max || r < min {
  190. env.PrevChar()
  191. return true
  192. }
  193. r -= min
  194. if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
  195. env.PrevChar()
  196. return true
  197. }
  198. }
  199. return false
  200. }
  201. func (env *Env) SliceDel() bool {
  202. return env.SliceFrom("")
  203. }
  204. func (env *Env) Insert(bra, ket int, s string) {
  205. adjustment := env.ReplaceS(bra, ket, s)
  206. if bra <= env.Bra {
  207. env.Bra = int(int32(env.Bra) + adjustment)
  208. }
  209. if bra <= env.Ket {
  210. env.Ket = int(int32(env.Ket) + adjustment)
  211. }
  212. }
  213. func (env *Env) SliceTo() string {
  214. return env.current[env.Bra:env.Ket]
  215. }
  216. func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 {
  217. var i int32
  218. j := int32(len(amongs))
  219. c := env.Cursor
  220. l := env.Limit
  221. var commonI, commonJ int
  222. firstKeyInspected := false
  223. for {
  224. k := i + ((j - i) >> 1)
  225. var diff int32
  226. common := min(commonI, commonJ)
  227. w := amongs[k]
  228. for lvar := common; lvar < len(w.Str); lvar++ {
  229. if c+common == l {
  230. diff--
  231. break
  232. }
  233. diff = int32(env.current[c+common]) - int32(w.Str[lvar])
  234. if diff != 0 {
  235. break
  236. }
  237. common++
  238. }
  239. if diff < 0 {
  240. j = k
  241. commonJ = common
  242. } else {
  243. i = k
  244. commonI = common
  245. }
  246. if j-i <= 1 {
  247. if i > 0 {
  248. break
  249. }
  250. if j == i {
  251. break
  252. }
  253. if firstKeyInspected {
  254. break
  255. }
  256. firstKeyInspected = true
  257. }
  258. }
  259. for {
  260. w := amongs[i]
  261. if commonI >= len(w.Str) {
  262. env.Cursor = c + len(w.Str)
  263. if w.F != nil {
  264. res := w.F(env, ctx)
  265. env.Cursor = c + len(w.Str)
  266. if res {
  267. return w.B
  268. }
  269. } else {
  270. return w.B
  271. }
  272. }
  273. i = w.A
  274. if i < 0 {
  275. return 0
  276. }
  277. }
  278. }
  279. func (env *Env) FindAmongB(amongs []*Among, ctx interface{}) int32 {
  280. var i int32
  281. j := int32(len(amongs))
  282. c := env.Cursor
  283. lb := env.LimitBackward
  284. var commonI, commonJ int
  285. firstKeyInspected := false
  286. for {
  287. k := i + ((j - i) >> 1)
  288. diff := int32(0)
  289. common := min(commonI, commonJ)
  290. w := amongs[k]
  291. for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- {
  292. if c-common == lb {
  293. diff--
  294. break
  295. }
  296. diff = int32(env.current[c-common-1]) - int32(w.Str[lvar])
  297. if diff != 0 {
  298. break
  299. }
  300. // Count up commons. But not one character but the byte width of that char
  301. common++
  302. }
  303. if diff < 0 {
  304. j = k
  305. commonJ = common
  306. } else {
  307. i = k
  308. commonI = common
  309. }
  310. if j-i <= 1 {
  311. if i > 0 {
  312. break
  313. }
  314. if j == i {
  315. break
  316. }
  317. if firstKeyInspected {
  318. break
  319. }
  320. firstKeyInspected = true
  321. }
  322. }
  323. for {
  324. w := amongs[i]
  325. if commonI >= len(w.Str) {
  326. env.Cursor = c - len(w.Str)
  327. if w.F != nil {
  328. res := w.F(env, ctx)
  329. env.Cursor = c - len(w.Str)
  330. if res {
  331. return w.B
  332. }
  333. } else {
  334. return w.B
  335. }
  336. }
  337. i = w.A
  338. if i < 0 {
  339. return 0
  340. }
  341. }
  342. }
  343. func (env *Env) Debug(count, lineNumber int) {
  344. log.Printf("snowball debug, count: %d, line: %d", count, lineNumber)
  345. }
  346. func (env *Env) Clone() *Env {
  347. clone := *env
  348. return &clone
  349. }
  350. func (env *Env) AssignTo() string {
  351. return env.Current()
  352. }