You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

maketesttables.go 5.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. // Copyright (c) 2015 Couchbase, Inc.
  2. // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
  3. // except in compliance with the License. You may obtain a copy of the License at
  4. // http://www.apache.org/licenses/LICENSE-2.0
  5. // Unless required by applicable law or agreed to in writing, software distributed under the
  6. // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  7. // either express or implied. See the License for the specific language governing permissions
  8. // and limitations under the License.
  9. // +build ignore
  10. package main
  11. import (
  12. "bufio"
  13. "bytes"
  14. "flag"
  15. "fmt"
  16. "io"
  17. "log"
  18. "net/http"
  19. "os"
  20. "os/exec"
  21. "strconv"
  22. "strings"
  23. "unicode"
  24. )
  // Command-line flags and the shared output writer.
  var (
  	// url is the base location of the Unicode data files; openReader
  	// appends the file name to it when fetching over HTTP.
  	url = flag.String(
  		"url",
  		"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
  		"URL of Unicode database directory",
  	)

  	// verbose is registered as a flag; NOTE(review): nothing in the visible
  	// code reads it.
  	verbose = flag.Bool(
  		"verbose",
  		false,
  		"write data to stdout as it is parsed",
  	)

  	// localFiles makes openReader read from the current directory instead
  	// of fetching from url.
  	localFiles = flag.Bool(
  		"local",
  		false,
  		"data files have been copied to the current directory; for debugging only",
  	)

  	// outputFile names the destination for the generated tables; when it is
  	// empty the tables go to standard output.
  	outputFile = flag.String(
  		"output",
  		"",
  		"output file for generated tables; default stdout",
  	)

  	// output is the buffered writer all generated text is written to. It is
  	// assigned by setupOutput.
  	output *bufio.Writer
  )
  38. func main() {
  39. flag.Parse()
  40. setupOutput()
  41. graphemeTests := make([]test, 0)
  42. graphemeComments := make([]string, 0)
  43. graphemeTests, graphemeComments = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests, graphemeComments)
  44. wordTests := make([]test, 0)
  45. wordComments := make([]string, 0)
  46. wordTests, wordComments = loadUnicodeData("WordBreakTest.txt", wordTests, wordComments)
  47. sentenceTests := make([]test, 0)
  48. sentenceComments := make([]string, 0)
  49. sentenceTests, sentenceComments = loadUnicodeData("SentenceBreakTest.txt", sentenceTests, sentenceComments)
  50. fmt.Fprintf(output, fileHeader, *url)
  51. generateTestTables("Grapheme", graphemeTests, graphemeComments)
  52. generateTestTables("Word", wordTests, wordComments)
  53. generateTestTables("Sentence", sentenceTests, sentenceComments)
  54. flushOutput()
  55. }
  56. // WordBreakProperty.txt has the form:
  57. // 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
  58. // FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
  59. func openReader(file string) (input io.ReadCloser) {
  60. if *localFiles {
  61. f, err := os.Open(file)
  62. if err != nil {
  63. log.Fatal(err)
  64. }
  65. input = f
  66. } else {
  67. path := *url + file
  68. resp, err := http.Get(path)
  69. if err != nil {
  70. log.Fatal(err)
  71. }
  72. if resp.StatusCode != 200 {
  73. log.Fatal("bad GET status for "+file, resp.Status)
  74. }
  75. input = resp.Body
  76. }
  77. return
  78. }
  79. func loadUnicodeData(filename string, tests []test, comments []string) ([]test, []string) {
  80. f := openReader(filename)
  81. defer f.Close()
  82. bufioReader := bufio.NewReader(f)
  83. line, err := bufioReader.ReadString('\n')
  84. for err == nil {
  85. tests, comments = parseLine(line, tests, comments)
  86. line, err = bufioReader.ReadString('\n')
  87. }
  88. // if the err was EOF still need to process last value
  89. if err == io.EOF {
  90. tests, comments = parseLine(line, tests, comments)
  91. }
  92. return tests, comments
  93. }
  // Markers recognised by parseLine in the break test data files.
  const (
  	comment = "#" // starts a comment that runs to the end of the line
  	brk     = "÷" // separates two expected segments
  	nbrk    = "×" // joins code points that belong to the same segment
  )

  // test is one parsed test case: the expected segments in order, each held
  // as the UTF-8 encoding of its code points.
  type test [][]byte

  // parseLine parses a single line of a break test file and appends the
  // result to tests and comments, which are kept parallel (one comment per
  // test).
  //
  // A data line looks like
  //
  //	÷ 0020 × 0308 ÷ 0020 ÷	# free-form description
  //
  // where each hexadecimal field is a code point. Blank lines and lines whose
  // first non-space character is '#' are ignored. The text after '#' becomes
  // the test's comment; a line without '#' gets an empty comment. If a field
  // is not valid hexadecimal the problem is logged and the line is skipped,
  // leaving tests and comments unchanged.
  func parseLine(line string, tests []test, comments []string) ([]test, []string) {
  	// Trim first so that indented comment lines are recognised as comments.
  	line = strings.TrimSpace(line)
  	if line == "" || strings.HasPrefix(line, comment) {
  		return tests, comments
  	}

  	// Split off the trailing comment, if there is one.
  	note := ""
  	if i := strings.Index(line, comment); i >= 0 {
  		note = strings.TrimSpace(line[i+len(comment):])
  		line = line[:i]
  	}

  	t := make(test, 0)
  	for _, piece := range strings.Split(line, brk) {
  		piece = strings.TrimSpace(piece)
  		if piece == "" {
  			// Lines start and end with a break marker, which yields empty
  			// leading and trailing pieces.
  			continue
  		}
  		var word strings.Builder
  		for _, codePoint := range strings.Split(piece, nbrk) {
  			codePoint = strings.TrimSpace(codePoint)
  			r, err := strconv.ParseInt(codePoint, 16, 32)
  			if err != nil {
  				log.Printf("err: %v for '%s'", err, codePoint)
  				return tests, comments
  			}
  			// WriteRune encodes values that are not valid code points
  			// (for example surrogates) as U+FFFD.
  			word.WriteRune(rune(r))
  		}
  		t = append(t, []byte(word.String()))
  	}

  	tests = append(tests, t)
  	comments = append(comments, note)
  	return tests, comments
  }
  134. func generateTestTables(prefix string, tests []test, comments []string) {
  135. fmt.Fprintf(output, testHeader, prefix)
  136. for i, t := range tests {
  137. fmt.Fprintf(output, "\t\t{\n")
  138. fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{}))
  139. fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t))
  140. fmt.Fprintf(output, "\t\t\tcomment: `%s`,\n", comments[i])
  141. fmt.Fprintf(output, "\t\t},\n")
  142. }
  143. fmt.Fprintf(output, "}\n")
  144. }
  145. func generateTest(t test) string {
  146. rv := "[][]byte{"
  147. for _, te := range t {
  148. rv += fmt.Sprintf("%#v,", te)
  149. }
  150. rv += "}"
  151. return rv
  152. }
  // fileHeader is the preamble of the generated file. Its single %s verb
  // receives the data URL. The first line follows Go's convention for marking
  // generated code ("// Code generated ... DO NOT EDIT."), and the blank line
  // after it keeps the marker from being read as the package doc comment.
  const fileHeader = "// Code generated by maketesttables --url=%s; DO NOT EDIT.\n" +
  	"\n" +
  	"package segment\n" +
  	"\n"

  // testHeader opens one generated table. Its %s verb receives the table
  // prefix, giving a variable named unicode<prefix>Tests whose elements carry
  // the input bytes, the expected segments and the source comment.
  const testHeader = "var unicode%sTests = []struct {\n" +
  	"input []byte\n" +
  	"output [][]byte\n" +
  	"comment string\n" +
  	"}{\n"
  164. func setupOutput() {
  165. output = bufio.NewWriter(startGofmt())
  166. }
  167. // startGofmt connects output to a gofmt process if -output is set.
  168. func startGofmt() io.Writer {
  169. if *outputFile == "" {
  170. return os.Stdout
  171. }
  172. stdout, err := os.Create(*outputFile)
  173. if err != nil {
  174. log.Fatal(err)
  175. }
  176. // Pipe output to gofmt.
  177. gofmt := exec.Command("gofmt")
  178. fd, err := gofmt.StdinPipe()
  179. if err != nil {
  180. log.Fatal(err)
  181. }
  182. gofmt.Stdout = stdout
  183. gofmt.Stderr = os.Stderr
  184. err = gofmt.Start()
  185. if err != nil {
  186. log.Fatal(err)
  187. }
  188. return fd
  189. }
  190. func flushOutput() {
  191. err := output.Flush()
  192. if err != nil {
  193. log.Fatal(err)
  194. }
  195. }