aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/blevesearch/segment/segment_words.rl
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/blevesearch/segment/segment_words.rl')
-rw-r--r--vendor/github.com/blevesearch/segment/segment_words.rl285
1 files changed, 285 insertions, 0 deletions
diff --git a/vendor/github.com/blevesearch/segment/segment_words.rl b/vendor/github.com/blevesearch/segment/segment_words.rl
new file mode 100644
index 0000000000..e69af8b214
--- /dev/null
+++ b/vendor/github.com/blevesearch/segment/segment_words.rl
@@ -0,0 +1,285 @@
+// Copyright (c) 2015 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+// +build BUILDTAGS
+
+package segment
+
+import (
+ "fmt"
+ "unicode/utf8"
+)
+// RagelFlags holds the literal "RAGELFLAGS"; presumably a placeholder the build rewrites with the ragel flags used (like BUILDTAGS above) - TODO confirm against the build scripts.
+var RagelFlags = "RAGELFLAGS"
+// ParseError is returned by segmentWords when the scanner finishes in a non-final state.
+var ParseError = fmt.Errorf("unicode word segmentation parse error")
+
+// Word types recorded in the types slice for each token emitted by segmentWords.
+const (
+ None = iota // assigned by finishNoneToken (regional indicators, line breaks, extend/format runs, anything else)
+ Number // assigned by finishNumericToken
+ Letter // assigned by finishWordToken and finishHangulToken
+ Kana // not assigned by any action in this file
+ Ideo // assigned by finishHanToken, finishHiraganaToken and finishKatakanaToken
+)
+
+%%{
+ machine s;
+ write data;
+}%%
+// segmentWords runs the Ragel word scanner over data, appending each token to val and its word type to types; it returns both slices, the number of bytes consumed, and ParseError if the machine ends in a non-final state.
+func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
+ cs, p, pe := 0, 0, len(data) // Ragel machine state, current byte offset, and end-of-buffer offset
+ cap := maxTokens // initial capacity for freshly allocated result slices (note: shadows the builtin cap)
+ if cap < 0 { // negative maxTokens: size the slices for 1000 tokens
+ cap = 1000
+ }
+ if val == nil { // allocate only when the caller did not pass a slice to append to
+ val = make([][]byte, 0, cap)
+ }
+ if types == nil {
+ types = make([]int, 0, cap)
+ }
+
+ // ts, te and act are the variables that Ragel's scanner construct (|* ... *|) expects to find in scope
+ ts := 0
+ te := 0
+ act := 0
+ eof := pe // eof equal to pe presents this buffer to the machine as the final block of input
+ _ = ts // blank assignments keep the Go compiler from rejecting these as unused variables
+ _ = te
+ _ = act
+
+ // our state: offsets of the current token's first and last byte (endPos is inclusive), and bytes consumed so far
+ startPos := 0
+ endPos := 0
+ totalConsumed := 0 // returned as the third result: offset just past the last emitted token
+ %%{
+
+ include SCRIPTS "ragel/uscript.rl";
+ include WB "ragel/uwb.rl";
+
+ action startToken {
+ startPos = p // entering action: remember where the candidate token begins
+ }
+
+ action endToken {
+ endPos = p // finishing action: remember the last byte matched so far
+ }
+
+ action finishNumericToken {
+ if !atEOF { // without atEOF nothing is emitted; presumably so the caller can retry with more data - TODO confirm against callers
+ return val, types, totalConsumed, nil
+ }
+
+ val = append(val, data[startPos:endPos+1]) // token is a sub-slice of data, not a copy
+ types = append(types, Number)
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens { // the limit applies only when maxTokens is positive
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ action finishHangulToken {
+ if endPos+1 == pe && !atEOF { // token touches the end of the buffer and this is not the final input: emit nothing
+ return val, types, totalConsumed, nil
+ } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 { // bytes after the token are not a valid UTF-8 encoding: emit nothing
+ return val, types, totalConsumed, nil
+ }
+
+ val = append(val, data[startPos:endPos+1])
+ types = append(types, Letter) // Hangul tokens are reported as Letter
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens {
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ action finishKatakanaToken {
+ if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
+ return val, types, totalConsumed, nil
+ } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 { // same invalid-UTF-8 guard
+ return val, types, totalConsumed, nil
+ }
+
+ val = append(val, data[startPos:endPos+1])
+ types = append(types, Ideo) // NOTE(review): tagged Ideo although a Kana constant exists - confirm this is intended
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens {
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ action finishWordToken {
+ if !atEOF { // same rule as finishNumericToken: nothing is emitted unless atEOF is set
+ return val, types, totalConsumed, nil
+ }
+ val = append(val, data[startPos:endPos+1])
+ types = append(types, Letter)
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens {
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ action finishHanToken {
+ if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
+ return val, types, totalConsumed, nil
+ } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 { // same invalid-UTF-8 guard
+ return val, types, totalConsumed, nil
+ }
+
+ val = append(val, data[startPos:endPos+1])
+ types = append(types, Ideo)
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens {
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ action finishHiraganaToken {
+ if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
+ return val, types, totalConsumed, nil
+ } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 { // same invalid-UTF-8 guard
+ return val, types, totalConsumed, nil
+ }
+
+ val = append(val, data[startPos:endPos+1])
+ types = append(types, Ideo)
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens {
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ action finishNoneToken {
+ lastPos := startPos
+ for lastPos <= endPos { // step rune by rune from startPos until the position passes endPos
+ _, size := utf8.DecodeRune(data[lastPos:])
+ lastPos += size
+ }
+ endPos = lastPos -1 // endPos now sits on the last byte of the final decoded rune
+ p = endPos // move the scanner position to the adjusted token end
+
+ if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
+ return val, types, totalConsumed, nil
+ } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 { // same invalid-UTF-8 guard
+ return val, types, totalConsumed, nil
+ }
+ // otherwise emit the span as a token of type None
+ val = append(val, data[startPos:endPos+1])
+ types = append(types, None)
+ totalConsumed = endPos+1
+ if maxTokens > 0 && len(val) >= maxTokens {
+ return val, types, totalConsumed, nil
+ }
+ }
+
+ HangulEx = Hangul ( Extend | Format )*;
+ HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
+ NumericEx = Numeric ( Extend | Format )*;
+ KatakanaEx = Katakana ( Extend | Format )*;
+ MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
+ MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
+ ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
+ HanEx = Han ( Extend | Format )*;
+ HiraganaEx = Hiragana ( Extend | Format )*;
+ SingleQuoteEx = Single_Quote ( Extend | Format )*;
+ DoubleQuoteEx = Double_Quote ( Extend | Format )*;
+ HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
+ RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
+ NLCRLF = Newline | CR | LF;
+ OtherEx = ^(NLCRLF) ( Extend | Format )* ;
+
+ # UAX#29 WB8. Numeric × Numeric
+ # WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+ # WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+ # WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+ # WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+ #
+ WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
+
+ # subset of the below for typing purposes only!
+ WordHangul = ( HangulEx )+ >startToken @endToken;
+ WordKatakana = ( KatakanaEx )+ >startToken @endToken;
+
+ # UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
+ # WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
+ # WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
+ # WB7a. Hebrew_Letter × Single_Quote
+ # WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
+ # WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
+ # WB9. (ALetter | Hebrew_Letter) × Numeric
+ # WB10. Numeric × (ALetter | Hebrew_Letter)
+ # WB13. Katakana × Katakana
+ # WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+ # WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
+ #
+ # Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
+ #
+ Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
+ | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
+ | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
+ | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
+ |ExtendNumLetEx
+ )+
+ )
+ (
+ ( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
+ | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
+ | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
+ | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
+ )+
+ )
+ )* ExtendNumLetEx*) >startToken @endToken;
+
+ # UAX#29 WB14. Any ÷ Any
+ WordHan = HanEx >startToken @endToken;
+ WordHiragana = HiraganaEx >startToken @endToken;
+
+ WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
+
+ WordCRLF = (CR LF) >startToken @endToken;
+
+ WordCR = CR >startToken @endToken;
+
+ WordLF = LF >startToken @endToken;
+
+ WordNL = Newline >startToken @endToken;
+
+ WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
+
+ Other = OtherEx >startToken @endToken;
+
+ main := |*
+ WordNumeric => finishNumericToken;
+ WordHangul => finishHangulToken;
+ WordKatakana => finishKatakanaToken;
+ Word => finishWordToken;
+ WordHan => finishHanToken;
+ WordHiragana => finishHiraganaToken;
+ WordRegional =>finishNoneToken;
+ WordCRLF => finishNoneToken;
+ WordCR => finishNoneToken;
+ WordLF => finishNoneToken;
+ WordNL => finishNoneToken;
+ WordExt => finishNoneToken;
+ Other => finishNoneToken;
+ *|;
+
+ write init;
+ write exec;
+ }%%
+
+ if cs < s_first_final { // the machine stopped in a non-final state: report a parse error alongside what was collected
+ return val, types, totalConsumed, ParseError
+ }
+
+ return val, types, totalConsumed, nil
+}