123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- // Copyright (c) 2014 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package simple
-
- import (
- "unicode/utf8"
-
- "github.com/blevesearch/bleve/registry"
- "github.com/blevesearch/bleve/search/highlight"
- )
-
const (
	// Name is the identifier under which this fragmenter is registered
	// with the bleve registry.
	Name = "simple"

	// defaultFragmentSize is the fragment size used by Constructor when
	// the configuration does not supply a "size" value.
	defaultFragmentSize = 200
)
-
// Fragmenter cuts a document into highlight fragments of bounded size
// around the term locations it is given.
type Fragmenter struct {
	// fragmentSize is the size budget for one fragment; Fragment counts
	// it in runes when building a window around a term.
	fragmentSize int
}
-
- func NewFragmenter(fragmentSize int) *Fragmenter {
- return &Fragmenter{
- fragmentSize: fragmentSize,
- }
- }
-
- func (s *Fragmenter) Fragment(orig []byte, ot highlight.TermLocations) []*highlight.Fragment {
- var rv []*highlight.Fragment
- maxbegin := 0
- OUTER:
- for currTermIndex, termLocation := range ot {
- // start with this
- // it should be the highest scoring fragment with this term first
- start := termLocation.Start
- end := start
- used := 0
- for end < len(orig) && used < s.fragmentSize {
- r, size := utf8.DecodeRune(orig[end:])
- if r == utf8.RuneError {
- continue OUTER // bail
- }
- end += size
- used++
- }
-
- // if we still have more characters available to us
- // push back towards beginning
- // without cross maxbegin
- for start > 0 && used < s.fragmentSize {
- if start > len(orig) {
- // bail if out of bounds, possibly due to token replacement
- // e.g with a regexp replacement
- continue OUTER
- }
- r, size := utf8.DecodeLastRune(orig[0:start])
- if r == utf8.RuneError {
- continue OUTER // bail
- }
- if start-size >= maxbegin {
- start -= size
- used++
- } else {
- break
- }
- }
-
- // however, we'd rather have the tokens centered more in the frag
- // lets try to do that as best we can, without affecting the score
- // find the end of the last term in this fragment
- minend := end
- for _, innerTermLocation := range ot[currTermIndex:] {
- if innerTermLocation.End > end {
- break
- }
- minend = innerTermLocation.End
- }
-
- // find the smaller of the two rooms to move
- roomToMove := utf8.RuneCount(orig[minend:end])
- roomToMoveStart := 0
- if start >= maxbegin {
- roomToMoveStart = utf8.RuneCount(orig[maxbegin:start])
- }
- if roomToMoveStart < roomToMove {
- roomToMove = roomToMoveStart
- }
-
- offset := roomToMove / 2
-
- for offset > 0 {
- r, size := utf8.DecodeLastRune(orig[0:start])
- if r == utf8.RuneError {
- continue OUTER // bail
- }
- start -= size
-
- r, size = utf8.DecodeLastRune(orig[0:end])
- if r == utf8.RuneError {
- continue OUTER // bail
- }
- end -= size
- offset--
- }
-
- rv = append(rv, &highlight.Fragment{Orig: orig, Start: start - offset, End: end - offset})
- // set maxbegin to the end of the current term location
- // so that next one won't back up to include it
- maxbegin = termLocation.End
-
- }
- if len(ot) == 0 {
- // if there were no terms to highlight
- // produce a single fragment from the beginning
- start := 0
- end := start + s.fragmentSize
- if end > len(orig) {
- end = len(orig)
- }
- rv = append(rv, &highlight.Fragment{Orig: orig, Start: start, End: end})
- }
-
- return rv
- }
-
- func Constructor(config map[string]interface{}, cache *registry.Cache) (highlight.Fragmenter, error) {
- size := defaultFragmentSize
- sizeVal, ok := config["size"].(float64)
- if ok {
- size = int(sizeVal)
- }
- return NewFragmenter(size), nil
- }
-
- func init() {
- registry.RegisterFragmenter(Name, Constructor)
- }
|