diff options
author | Wouter Admiraal <wouter.admiraal@sonarsource.com> | 2018-12-26 12:49:27 +0100 |
---|---|---|
committer | SonarTech <sonartech@sonarsource.com> | 2019-01-10 20:21:02 +0100 |
commit | 42fdf73a492026a854a4a36d515df02e239bb82a (patch) | |
tree | c60f6b89c21c3213fc2de887dc4f246f404f68ed | |
parent | 96aa28fc529efd7582ca959297d2d77d88f3085c (diff) | |
download | sonarqube-42fdf73a492026a854a4a36d515df02e239bb82a.tar.gz sonarqube-42fdf73a492026a854a4a36d515df02e239bb82a.zip |
SONAR-11472 Add support for exact search-query matching
Out of the box, Lunr doesn't support exact pattern matching for
search queries, meaning searching for "foo bar" will not boost
a sentence like "Foo bar baz" more than "Baz bar foo" (both
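contain both keywords). We now do some crude phrase matching by
storing each token's "context" (the token together with its previous
and next token) as metadata at index time, and checking the query
against that context when post-processing the results. This only
covers queries of up to 3 terms, so it's not 100% correct, but it
gets the job done.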
8 files changed, 207 insertions, 56 deletions
diff --git a/server/sonar-docs/src/layouts/components/Search.js b/server/sonar-docs/src/layouts/components/Search.js index 72d52b712c5..fe09b0f3d0b 100644 --- a/server/sonar-docs/src/layouts/components/Search.js +++ b/server/sonar-docs/src/layouts/components/Search.js @@ -19,6 +19,7 @@ */ import React, { Component } from 'react'; import lunr from 'lunr'; +import { sortBy } from 'lodash'; import ClearIcon from './icons/ClearIcon'; import { getUrlsList } from '../utils'; @@ -31,11 +32,12 @@ export default class Search extends Component { super(props); this.state = { value: '' }; this.index = lunr(function() { + this.use(tokenContextPlugin); this.ref('id'); this.field('title', { boost: 10 }); this.field('text'); - this.metadataWhitelist = ['position']; + this.metadataWhitelist = ['position', 'tokenContext']; props.pages .filter(page => @@ -52,20 +54,34 @@ export default class Search extends Component { } getFormattedResults = (query, results) => { - return results.map(match => { + const formattedResults = results.map(match => { const page = this.props.pages.find(page => page.id === match.ref); const highlights = {}; let longestTerm = ''; + let exactMatch = false; - // remember the longest term that matches the query *exactly* + // Loop over all matching terms/tokens. Object.keys(match.matchData.metadata).forEach(term => { - if (query.toLowerCase().includes(term.toLowerCase()) && longestTerm.length < term.length) { + // Remember the longest term that matches the query as close as possible. + if (query.includes(term.toLowerCase()) && longestTerm.length < term.length) { longestTerm = term; } Object.keys(match.matchData.metadata[term]).forEach(fieldName => { - const { position: positions } = match.matchData.metadata[term][fieldName]; + const { position: positions, tokenContext: tokenContexts } = match.matchData.metadata[ + term + ][fieldName]; + highlights[fieldName] = [...(highlights[fieldName] || []), ...positions]; + + // Check if we have an *exact match*. 
+ if (!exactMatch && tokenContexts) { + tokenContexts.forEach(tokenContext => { + if (!exactMatch && tokenContext.includes(query)) { + exactMatch = true; + } + }); + } }); }); @@ -76,10 +92,22 @@ export default class Search extends Component { title: page.frontmatter.title, url: page.frontmatter.url || page.fields.slug }, + exactMatch, highlights, + query, longestTerm }; }); + + // Re-order results by the length of the longest matched term and by exact + // match (if applicable). The longer the matched term is, the higher the + // chance the result is more relevant. + return sortBy( + // Sort by longest term. + sortBy(formattedResults, result => -result.longestTerm.length), + // Sort by exact match. + result => result.exactMatch && -1 + ); }; handleClear = event => { @@ -122,3 +150,31 @@ export default class Search extends Component { ); } } + +// Lunr doesn't support exact multiple-term matching. Meaning "foo bar" will not +// boost a sentence like "Foo bar baz" more than "Baz bar foo". In order to +// provide more accurate results, we store the token context, to see if we can +// perform an "exact match". Unfortunately, we cannot extend the search logic, +// only the tokenizer at *index time*. This is why we store the context as +// meta-data, and post-process the matches before rendering (see above). For +// performance reasons, we only add 2 extra tokens, one in front, one after. +// This means we support "exact macthing" for up to 3 terms. More search terms +// would fallback to the regular matching algorithm, which is OK: the more terms +// searched for, the better the standard algorithm will perform anyway. In the +// end, the best would be for Lunr to support multi-term matching, as extending +// the search algorithm for this would be way too complicated. 
+function tokenContextPlugin(builder) { + const pipelineFunction = (token, index, tokens) => { + const prevToken = tokens[index - 1] || ''; + const nextToken = tokens[index + 1] || ''; + token.metadata['tokenContext'] = [prevToken.toString(), token.toString(), nextToken.toString()] + .filter(s => s.length) + .join(' ') + .toLowerCase(); + return token; + }; + + lunr.Pipeline.registerFunction(pipelineFunction, 'tokenContext'); + builder.pipeline.before(lunr.stemmer, pipelineFunction); + builder.metadataWhitelist.push('tokenContext'); +} diff --git a/server/sonar-web/src/main/js/@types/lunr.d.ts b/server/sonar-web/src/main/js/@types/lunr.d.ts index c9c219b0a06..26e2068abd4 100644 --- a/server/sonar-web/src/main/js/@types/lunr.d.ts +++ b/server/sonar-web/src/main/js/@types/lunr.d.ts @@ -25,9 +25,20 @@ declare module 'lunr' { ref(field: string): void; + use(fn: Function): void; + metadataWhitelist?: string[]; } + export interface LunrBuilder { + pipeline: any; + metadataWhitelist: string[]; + } + + export interface LunrIndex { + search(query: string): LunrMatch[]; + } + export interface LunrInit { (this: Lunr): void; } @@ -38,8 +49,9 @@ declare module 'lunr' { matchData: { metadata: any }; } - export interface LunrIndex { - search(query: string): LunrMatch[]; + export interface LunrToken { + str: string; + metadata: any; } function lunr(initializer: LunrInit): LunrIndex; diff --git a/server/sonar-web/src/main/js/apps/documentation/components/App.tsx b/server/sonar-web/src/main/js/apps/documentation/components/App.tsx index 7ce4f0c79ad..923be34ed2e 100644 --- a/server/sonar-web/src/main/js/apps/documentation/components/App.tsx +++ b/server/sonar-web/src/main/js/apps/documentation/components/App.tsx @@ -29,6 +29,7 @@ import ScreenPositionHelper from '../../../components/common/ScreenPositionHelpe import DocMarkdownBlock from '../../../components/docs/DocMarkdownBlock'; import { translate } from '../../../helpers/l10n'; import { isSonarCloud } from 
'../../../helpers/system'; +import { addSideBarClass, removeSideBarClass } from '../../../helpers/pages'; import { DocsNavigationItem } from '../utils'; import '../styles.css'; @@ -41,17 +42,11 @@ export default class App extends React.PureComponent<Props> { pages = getPages(); componentDidMount() { - const footer = document.getElementById('footer'); - if (footer) { - footer.classList.add('page-footer-with-sidebar', 'documentation-footer'); - } + addSideBarClass(); } componentWillUnmount() { - const footer = document.getElementById('footer'); - if (footer) { - footer.classList.remove('page-footer-with-sidebar', 'documentation-footer'); - } + removeSideBarClass(); } render() { diff --git a/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx b/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx index 09f6c084025..a47e71e0041 100644 --- a/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx +++ b/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx @@ -23,6 +23,7 @@ import { Link } from 'react-router'; import { highlightMarks, cutWords, DocumentationEntry } from '../utils'; export interface SearchResult { + exactMatch?: boolean; highlights: { [field: string]: [number, number][] }; longestTerm: string; page: DocumentationEntry; diff --git a/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx b/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx index 7e68f234c01..7a0be949f76 100644 --- a/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx +++ b/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx @@ -18,7 +18,7 @@ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
*/ import * as React from 'react'; -import lunr, { LunrIndex } from 'lunr'; +import lunr, { LunrBuilder, LunrIndex, LunrToken } from 'lunr'; import { sortBy } from 'lodash'; import SearchResultEntry, { SearchResult } from './SearchResultEntry'; import { DocumentationEntry, getUrlsList, DocsNavigationItem } from '../utils'; @@ -36,11 +36,12 @@ export default class SearchResults extends React.PureComponent<Props> { constructor(props: Props) { super(props); this.index = lunr(function() { + this.use(tokenContextPlugin); this.ref('relativeName'); this.field('title', { boost: 10 }); this.field('text'); - this.metadataWhitelist = ['position']; + this.metadataWhitelist = ['position', 'tokenContext']; props.pages .filter(page => getUrlsList(props.navigation).includes(page.url)) @@ -49,36 +50,58 @@ export default class SearchResults extends React.PureComponent<Props> { } render() { - const { query } = this.props; + const query = this.props.query.toLowerCase(); const results = this.index - .search(`${query}~1 ${query}*`) + .search( + query + .split(/\s+/) + .map(s => `${s}~1 ${s}*`) + .join(' ') + ) .map(match => { const page = this.props.pages.find(page => page.relativeName === match.ref); const highlights: { [field: string]: [number, number][] } = {}; let longestTerm = ''; + let exactMatch = false; - // remember the longest term that matches the query *exactly* + // Loop over all matching terms/tokens. Object.keys(match.matchData.metadata).forEach(term => { - if ( - query.toLowerCase().includes(term.toLowerCase()) && - longestTerm.length < term.length - ) { + // Remember the longest term that matches the query as close as possible. 
+ if (query.includes(term.toLowerCase()) && longestTerm.length < term.length) { longestTerm = term; } Object.keys(match.matchData.metadata[term]).forEach(fieldName => { - const { position: positions } = match.matchData.metadata[term][fieldName]; + const { position: positions, tokenContext: tokenContexts } = match.matchData.metadata[ + term + ][fieldName]; + highlights[fieldName] = [...(highlights[fieldName] || []), ...positions]; + + // Check if we have an *exact match*. + if (!exactMatch && tokenContexts) { + tokenContexts.forEach((tokenContext: string) => { + if (!exactMatch && tokenContext.includes(query)) { + exactMatch = true; + } + }); + } }); }); - return { page, highlights, longestTerm }; + return { page, highlights, longestTerm, exactMatch }; }) .filter(result => result.page) as SearchResult[]; - // re-order results by the length of the longest matched term - // the longer term is the more chances the result is more relevant - const sortedResults = sortBy(results, result => -result.longestTerm.length); + // Re-order results by the length of the longest matched term and by exact + // match (if applicable). The longer the matched term is, the higher the + // chance the result is more relevant. + const sortedResults = sortBy( + // Sort by longest term. + sortBy(results, result => -result.longestTerm.length), + // Sort by exact match. + result => result.exactMatch && -1 + ); return ( <> @@ -93,3 +116,31 @@ export default class SearchResults extends React.PureComponent<Props> { ); } } + +// Lunr doesn't support exact multiple-term matching. Meaning "foo bar" will not +// boost a sentence like "Foo bar baz" more than "Baz bar foo". In order to +// provide more accurate results, we store the token context, to see if we can +// perform an "exact match". Unfortunately, we cannot extend the search logic, +// only the tokenizer at *index time*. This is why we store the context as +// meta-data, and post-process the matches before rendering (see above). 
For +// performance reasons, we only add 2 extra tokens, one in front, one after. +// This means we support "exact macthing" for up to 3 terms. More search terms +// would fallback to the regular matching algorithm, which is OK: the more terms +// searched for, the better the standard algorithm will perform anyway. In the +// end, the best would be for Lunr to support multi-term matching, as extending +// the search algorithm for this would be way too complicated. +function tokenContextPlugin(builder: LunrBuilder) { + const pipelineFunction = (token: LunrToken, index: number, tokens: LunrToken[]) => { + const prevToken = tokens[index - 1] || ''; + const nextToken = tokens[index + 1] || ''; + token.metadata['tokenContext'] = [prevToken.toString(), token.toString(), nextToken.toString()] + .filter(s => s.length) + .join(' ') + .toLowerCase(); + return token; + }; + + (lunr as any).Pipeline.registerFunction(pipelineFunction, 'tokenContext'); + builder.pipeline.before((lunr as any).stemmer, pipelineFunction); + builder.metadataWhitelist.push('tokenContext'); +} diff --git a/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx b/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx index ba501d825cc..5dfdfe20e7f 100644 --- a/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx +++ b/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx @@ -37,7 +37,7 @@ export default class Sidebar extends React.PureComponent<Props, State> { state: State = { query: '' }; handleSearch = (query: string) => { - this.setState({ query }); + this.setState({ query: query.trim() }); }; render() { diff --git a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx index 5706ed4131f..7a55e8c44d0 100644 --- a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx 
+++ b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx @@ -28,10 +28,28 @@ jest.mock('lunr', () => ({ { ref: 'lorem/origin', matchData: { - metadata: { from: { title: { position: [[19, 5]] }, text: { position: [[121, 4]] } } } + metadata: { + simply: { + title: { position: [[19, 5]] }, + text: { + position: [[15, 6], [28, 4]], + tokenContext: ['is simply dummy', 'simply dummy text'] + } + } + } } }, - { ref: 'foobar', matchData: { metadata: { from: { title: { position: [[23, 4]] } } } } } + { + ref: 'foobar', + matchData: { + metadata: { + simply: { + title: { position: [[23, 4]] }, + text: { position: [[111, 6], [118, 4]], tokenContext: ['keywords simply text'] } + } + } + } + } ]) })) })); @@ -54,7 +72,7 @@ const pages = [ createPage( 'Where does Foobar come from?', 'foobar', - 'Foobar is a universal variable understood to represent whatever is being discussed.' + 'Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.' 
) ]; @@ -63,11 +81,13 @@ it('should search', () => { <SearchResults navigation={['lorem/index', 'lorem/origin', 'foobar']} pages={pages} - query="from" + query="simply text" splat="foobar" /> ); expect(wrapper).toMatchSnapshot(); expect(lunr).toBeCalled(); - expect((wrapper.instance() as SearchResults).index.search).toBeCalledWith('from~1 from*'); + expect((wrapper.instance() as SearchResults).index.search).toBeCalledWith( + 'simply~1 simply* text~1 text*' + ); }); diff --git a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap index 4a79799119b..f82b3fdda85 100644 --- a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap +++ b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap @@ -3,57 +3,73 @@ exports[`should search 1`] = ` <Fragment> <SearchResultEntry - active={false} - key="lorem/origin" + active={true} + key="foobar" result={ Object { + "exactMatch": true, "highlights": Object { "text": Array [ Array [ - 121, + 111, + 6, + ], + Array [ + 118, 4, ], ], "title": Array [ Array [ - 19, - 5, + 23, + 4, ], ], }, - "longestTerm": "from", + "longestTerm": "simply", "page": Object { - "content": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.", + "content": "Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.", "navTitle": undefined, - "relativeName": "lorem/origin", - "text": "Contrary to popular belief, Lorem Ipsum is not simply random text. 
It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.", - "title": "Where does it come from?", - "url": "/lorem/origin", + "relativeName": "foobar", + "text": "Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.", + "title": "Where does Foobar come from?", + "url": "/foobar", }, } } /> <SearchResultEntry - active={true} - key="foobar" + active={false} + key="lorem/origin" result={ Object { + "exactMatch": false, "highlights": Object { - "title": Array [ + "text": Array [ Array [ - 23, + 15, + 6, + ], + Array [ + 28, 4, ], ], + "title": Array [ + Array [ + 19, + 5, + ], + ], }, - "longestTerm": "from", + "longestTerm": "simply", "page": Object { - "content": "Foobar is a universal variable understood to represent whatever is being discussed.", + "content": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.", "navTitle": undefined, - "relativeName": "foobar", - "text": "Foobar is a universal variable understood to represent whatever is being discussed.", - "title": "Where does Foobar come from?", - "url": "/foobar", + "relativeName": "lorem/origin", + "text": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.", + "title": "Where does it come from?", + "url": "/lorem/origin", }, } } |