You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

index.go 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package mapping
  15. import (
  16. "encoding/json"
  17. "fmt"
  18. "github.com/blevesearch/bleve/analysis"
  19. "github.com/blevesearch/bleve/analysis/analyzer/standard"
  20. "github.com/blevesearch/bleve/analysis/datetime/optional"
  21. "github.com/blevesearch/bleve/document"
  22. "github.com/blevesearch/bleve/registry"
  23. )
  24. var MappingJSONStrict = false
  25. const defaultTypeField = "_type"
  26. const defaultType = "_default"
  27. const defaultField = "_all"
  28. const defaultAnalyzer = standard.Name
  29. const defaultDateTimeParser = optional.Name
// An IndexMappingImpl controls how objects are placed
// into an index.
// First the type of the object is determined.
// Once the type is known, the appropriate
// DocumentMapping is selected by the type.
// If no mapping was determined for that type,
// a DefaultMapping will be used.
type IndexMappingImpl struct {
	// TypeMapping holds the document mappings keyed by document type.
	TypeMapping map[string]*DocumentMapping `json:"types,omitempty"`
	// DefaultMapping is used for any type that has no entry in TypeMapping.
	DefaultMapping *DocumentMapping `json:"default_mapping"`
	// TypeField is the property path looked up in a document to find its
	// type when the document does not report a type itself.
	TypeField string `json:"type_field"`
	// DefaultType is the type assigned when no type could be determined.
	DefaultType string `json:"default_type"`
	// DefaultAnalyzer is the analyzer name returned by AnalyzerNameForPath
	// when no mapping names one for the path.
	DefaultAnalyzer string `json:"default_analyzer"`
	// DefaultDateTimeParser is the parser name used by DateTimeParserNamed
	// when it is called with an empty name.
	DefaultDateTimeParser string `json:"default_datetime_parser"`
	// DefaultField is the field name returned by DefaultSearchField.
	DefaultField string `json:"default_field"`
	// StoreDynamic and IndexDynamic are initialised from the package-level
	// variables of the same names.
	// NOTE(review): presumably they govern storing/indexing of fields that
	// have no explicit mapping; confirm against the document walker.
	StoreDynamic bool `json:"store_dynamic"`
	IndexDynamic bool `json:"index_dynamic"`
	// CustomAnalysis records the configuration of every custom analysis
	// component added through the AddCustom* methods.
	CustomAnalysis *customAnalysis `json:"analysis,omitempty"`
	// cache is the registry the analysis components are defined in and
	// looked up from; it is unexported and therefore not serialized.
	cache *registry.Cache
}
  50. // AddCustomCharFilter defines a custom char filter for use in this mapping
  51. func (im *IndexMappingImpl) AddCustomCharFilter(name string, config map[string]interface{}) error {
  52. _, err := im.cache.DefineCharFilter(name, config)
  53. if err != nil {
  54. return err
  55. }
  56. im.CustomAnalysis.CharFilters[name] = config
  57. return nil
  58. }
  59. // AddCustomTokenizer defines a custom tokenizer for use in this mapping
  60. func (im *IndexMappingImpl) AddCustomTokenizer(name string, config map[string]interface{}) error {
  61. _, err := im.cache.DefineTokenizer(name, config)
  62. if err != nil {
  63. return err
  64. }
  65. im.CustomAnalysis.Tokenizers[name] = config
  66. return nil
  67. }
  68. // AddCustomTokenMap defines a custom token map for use in this mapping
  69. func (im *IndexMappingImpl) AddCustomTokenMap(name string, config map[string]interface{}) error {
  70. _, err := im.cache.DefineTokenMap(name, config)
  71. if err != nil {
  72. return err
  73. }
  74. im.CustomAnalysis.TokenMaps[name] = config
  75. return nil
  76. }
  77. // AddCustomTokenFilter defines a custom token filter for use in this mapping
  78. func (im *IndexMappingImpl) AddCustomTokenFilter(name string, config map[string]interface{}) error {
  79. _, err := im.cache.DefineTokenFilter(name, config)
  80. if err != nil {
  81. return err
  82. }
  83. im.CustomAnalysis.TokenFilters[name] = config
  84. return nil
  85. }
  86. // AddCustomAnalyzer defines a custom analyzer for use in this mapping. The
  87. // config map must have a "type" string entry to resolve the analyzer
  88. // constructor. The constructor is invoked with the remaining entries and
  89. // returned analyzer is registered in the IndexMapping.
  90. //
  91. // bleve comes with predefined analyzers, like
  92. // github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer. They are
  93. // available only if their package is imported by client code. To achieve this,
  94. // use their metadata to fill configuration entries:
  95. //
  96. // import (
  97. // "github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer"
  98. // "github.com/blevesearch/bleve/analysis/char_filters/html_char_filter"
  99. // "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
  100. // "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
  101. // )
  102. //
  103. // m := bleve.NewIndexMapping()
  104. // err := m.AddCustomAnalyzer("html", map[string]interface{}{
  105. // "type": custom_analyzer.Name,
  106. // "char_filters": []string{
  107. // html_char_filter.Name,
  108. // },
  109. // "tokenizer": unicode.Name,
  110. // "token_filters": []string{
  111. // lower_case_filter.Name,
  112. // ...
  113. // },
  114. // })
  115. func (im *IndexMappingImpl) AddCustomAnalyzer(name string, config map[string]interface{}) error {
  116. _, err := im.cache.DefineAnalyzer(name, config)
  117. if err != nil {
  118. return err
  119. }
  120. im.CustomAnalysis.Analyzers[name] = config
  121. return nil
  122. }
  123. // AddCustomDateTimeParser defines a custom date time parser for use in this mapping
  124. func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[string]interface{}) error {
  125. _, err := im.cache.DefineDateTimeParser(name, config)
  126. if err != nil {
  127. return err
  128. }
  129. im.CustomAnalysis.DateTimeParsers[name] = config
  130. return nil
  131. }
  132. // NewIndexMapping creates a new IndexMapping that will use all the default indexing rules
  133. func NewIndexMapping() *IndexMappingImpl {
  134. return &IndexMappingImpl{
  135. TypeMapping: make(map[string]*DocumentMapping),
  136. DefaultMapping: NewDocumentMapping(),
  137. TypeField: defaultTypeField,
  138. DefaultType: defaultType,
  139. DefaultAnalyzer: defaultAnalyzer,
  140. DefaultDateTimeParser: defaultDateTimeParser,
  141. DefaultField: defaultField,
  142. IndexDynamic: IndexDynamic,
  143. StoreDynamic: StoreDynamic,
  144. CustomAnalysis: newCustomAnalysis(),
  145. cache: registry.NewCache(),
  146. }
  147. }
  148. // Validate will walk the entire structure ensuring the following
  149. // explicitly named and default analyzers can be built
  150. func (im *IndexMappingImpl) Validate() error {
  151. _, err := im.cache.AnalyzerNamed(im.DefaultAnalyzer)
  152. if err != nil {
  153. return err
  154. }
  155. _, err = im.cache.DateTimeParserNamed(im.DefaultDateTimeParser)
  156. if err != nil {
  157. return err
  158. }
  159. err = im.DefaultMapping.Validate(im.cache)
  160. if err != nil {
  161. return err
  162. }
  163. for _, docMapping := range im.TypeMapping {
  164. err = docMapping.Validate(im.cache)
  165. if err != nil {
  166. return err
  167. }
  168. }
  169. return nil
  170. }
  171. // AddDocumentMapping sets a custom document mapping for the specified type
  172. func (im *IndexMappingImpl) AddDocumentMapping(doctype string, dm *DocumentMapping) {
  173. im.TypeMapping[doctype] = dm
  174. }
  175. func (im *IndexMappingImpl) mappingForType(docType string) *DocumentMapping {
  176. docMapping := im.TypeMapping[docType]
  177. if docMapping == nil {
  178. docMapping = im.DefaultMapping
  179. }
  180. return docMapping
  181. }
  182. // UnmarshalJSON offers custom unmarshaling with optional strict validation
  183. func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
  184. var tmp map[string]json.RawMessage
  185. err := json.Unmarshal(data, &tmp)
  186. if err != nil {
  187. return err
  188. }
  189. // set defaults for fields which might have been omitted
  190. im.cache = registry.NewCache()
  191. im.CustomAnalysis = newCustomAnalysis()
  192. im.TypeField = defaultTypeField
  193. im.DefaultType = defaultType
  194. im.DefaultAnalyzer = defaultAnalyzer
  195. im.DefaultDateTimeParser = defaultDateTimeParser
  196. im.DefaultField = defaultField
  197. im.DefaultMapping = NewDocumentMapping()
  198. im.TypeMapping = make(map[string]*DocumentMapping)
  199. im.StoreDynamic = StoreDynamic
  200. im.IndexDynamic = IndexDynamic
  201. var invalidKeys []string
  202. for k, v := range tmp {
  203. switch k {
  204. case "analysis":
  205. err := json.Unmarshal(v, &im.CustomAnalysis)
  206. if err != nil {
  207. return err
  208. }
  209. case "type_field":
  210. err := json.Unmarshal(v, &im.TypeField)
  211. if err != nil {
  212. return err
  213. }
  214. case "default_type":
  215. err := json.Unmarshal(v, &im.DefaultType)
  216. if err != nil {
  217. return err
  218. }
  219. case "default_analyzer":
  220. err := json.Unmarshal(v, &im.DefaultAnalyzer)
  221. if err != nil {
  222. return err
  223. }
  224. case "default_datetime_parser":
  225. err := json.Unmarshal(v, &im.DefaultDateTimeParser)
  226. if err != nil {
  227. return err
  228. }
  229. case "default_field":
  230. err := json.Unmarshal(v, &im.DefaultField)
  231. if err != nil {
  232. return err
  233. }
  234. case "default_mapping":
  235. err := json.Unmarshal(v, &im.DefaultMapping)
  236. if err != nil {
  237. return err
  238. }
  239. case "types":
  240. err := json.Unmarshal(v, &im.TypeMapping)
  241. if err != nil {
  242. return err
  243. }
  244. case "store_dynamic":
  245. err := json.Unmarshal(v, &im.StoreDynamic)
  246. if err != nil {
  247. return err
  248. }
  249. case "index_dynamic":
  250. err := json.Unmarshal(v, &im.IndexDynamic)
  251. if err != nil {
  252. return err
  253. }
  254. default:
  255. invalidKeys = append(invalidKeys, k)
  256. }
  257. }
  258. if MappingJSONStrict && len(invalidKeys) > 0 {
  259. return fmt.Errorf("index mapping contains invalid keys: %v", invalidKeys)
  260. }
  261. err = im.CustomAnalysis.registerAll(im)
  262. if err != nil {
  263. return err
  264. }
  265. return nil
  266. }
  267. func (im *IndexMappingImpl) determineType(data interface{}) string {
  268. // first see if the object implements bleveClassifier
  269. bleveClassifier, ok := data.(bleveClassifier)
  270. if ok {
  271. return bleveClassifier.BleveType()
  272. }
  273. // next see if the object implements Classifier
  274. classifier, ok := data.(Classifier)
  275. if ok {
  276. return classifier.Type()
  277. }
  278. // now see if we can find a type using the mapping
  279. typ, ok := mustString(lookupPropertyPath(data, im.TypeField))
  280. if ok {
  281. return typ
  282. }
  283. return im.DefaultType
  284. }
  285. func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
  286. docType := im.determineType(data)
  287. docMapping := im.mappingForType(docType)
  288. walkContext := im.newWalkContext(doc, docMapping)
  289. if docMapping.Enabled {
  290. docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
  291. // see if the _all field was disabled
  292. allMapping := docMapping.documentMappingForPath("_all")
  293. if allMapping == nil || (allMapping.Enabled != false) {
  294. field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, document.IndexField|document.IncludeTermVectors)
  295. doc.AddField(field)
  296. }
  297. }
  298. return nil
  299. }
// walkContext bundles the state handed to DocumentMapping.walkDocument while
// a single document is being mapped.
type walkContext struct {
	// doc is the document being built.
	doc *document.Document
	// im is the index mapping that started the walk.
	im *IndexMappingImpl
	// dm is the document mapping selected for the document's type.
	dm *DocumentMapping
	// excludedFromAll is passed to the composite "_all" field once the walk
	// has finished.
	// NOTE(review): presumably the walker appends the names of fields that
	// opt out of "_all"; confirm in walkDocument.
	excludedFromAll []string
}
  306. func (im *IndexMappingImpl) newWalkContext(doc *document.Document, dm *DocumentMapping) *walkContext {
  307. return &walkContext{
  308. doc: doc,
  309. im: im,
  310. dm: dm,
  311. excludedFromAll: []string{},
  312. }
  313. }
  314. // AnalyzerNameForPath attempts to find the best analyzer to use with only a
  315. // field name will walk all the document types, look for field mappings at the
  316. // provided path, if one exists and it has an explicit analyzer that is
  317. // returned.
  318. func (im *IndexMappingImpl) AnalyzerNameForPath(path string) string {
  319. // first we look for explicit mapping on the field
  320. for _, docMapping := range im.TypeMapping {
  321. analyzerName := docMapping.analyzerNameForPath(path)
  322. if analyzerName != "" {
  323. return analyzerName
  324. }
  325. }
  326. // now try the default mapping
  327. pathMapping := im.DefaultMapping.documentMappingForPath(path)
  328. if pathMapping != nil {
  329. if len(pathMapping.Fields) > 0 {
  330. if pathMapping.Fields[0].Analyzer != "" {
  331. return pathMapping.Fields[0].Analyzer
  332. }
  333. }
  334. }
  335. // next we will try default analyzers for the path
  336. pathDecoded := decodePath(path)
  337. for _, docMapping := range im.TypeMapping {
  338. rv := docMapping.defaultAnalyzerName(pathDecoded)
  339. if rv != "" {
  340. return rv
  341. }
  342. }
  343. return im.DefaultAnalyzer
  344. }
  345. func (im *IndexMappingImpl) AnalyzerNamed(name string) *analysis.Analyzer {
  346. analyzer, err := im.cache.AnalyzerNamed(name)
  347. if err != nil {
  348. logger.Printf("error using analyzer named: %s", name)
  349. return nil
  350. }
  351. return analyzer
  352. }
  353. func (im *IndexMappingImpl) DateTimeParserNamed(name string) analysis.DateTimeParser {
  354. if name == "" {
  355. name = im.DefaultDateTimeParser
  356. }
  357. dateTimeParser, err := im.cache.DateTimeParserNamed(name)
  358. if err != nil {
  359. logger.Printf("error using datetime parser named: %s", name)
  360. return nil
  361. }
  362. return dateTimeParser
  363. }
  364. func (im *IndexMappingImpl) datetimeParserNameForPath(path string) string {
  365. // first we look for explicit mapping on the field
  366. for _, docMapping := range im.TypeMapping {
  367. pathMapping := docMapping.documentMappingForPath(path)
  368. if pathMapping != nil {
  369. if len(pathMapping.Fields) > 0 {
  370. if pathMapping.Fields[0].Analyzer != "" {
  371. return pathMapping.Fields[0].Analyzer
  372. }
  373. }
  374. }
  375. }
  376. return im.DefaultDateTimeParser
  377. }
  378. func (im *IndexMappingImpl) AnalyzeText(analyzerName string, text []byte) (analysis.TokenStream, error) {
  379. analyzer, err := im.cache.AnalyzerNamed(analyzerName)
  380. if err != nil {
  381. return nil, err
  382. }
  383. return analyzer.Analyze(text), nil
  384. }
// FieldAnalyzer returns the name of the analyzer used on a field.
// It delegates to AnalyzerNameForPath, treating field as the path.
func (im *IndexMappingImpl) FieldAnalyzer(field string) string {
	return im.AnalyzerNameForPath(field)
}
// DefaultSearchField returns the mapping's DefaultField. It is a thin
// wrapper provided to satisfy a newer interface.
func (im *IndexMappingImpl) DefaultSearchField() string {
	return im.DefaultField
}