Browse Source

SONAR-11472 Add support for exact search-query matching

Out of the box, Lunr doesn't support exact pattern matching for
search queries, meaning searching for "foo bar" will not boost
a sentence like "Foo bar baz" more than "Baz bar foo" (both
contain both keywords). We now do some crude pattern matching by
storing the token "context" upon indexing. It's not 100% correct,
but it gets the job done.
tags/7.6
Wouter Admiraal 5 years ago
parent
commit
42fdf73a49

+ 61
- 5
server/sonar-docs/src/layouts/components/Search.js View File

@@ -19,6 +19,7 @@
*/
import React, { Component } from 'react';
import lunr from 'lunr';
import { sortBy } from 'lodash';
import ClearIcon from './icons/ClearIcon';
import { getUrlsList } from '../utils';

@@ -31,11 +32,12 @@ export default class Search extends Component {
super(props);
this.state = { value: '' };
this.index = lunr(function() {
this.use(tokenContextPlugin);
this.ref('id');
this.field('title', { boost: 10 });
this.field('text');

this.metadataWhitelist = ['position'];
this.metadataWhitelist = ['position', 'tokenContext'];

props.pages
.filter(page =>
@@ -52,20 +54,34 @@ export default class Search extends Component {
}

getFormattedResults = (query, results) => {
return results.map(match => {
const formattedResults = results.map(match => {
const page = this.props.pages.find(page => page.id === match.ref);
const highlights = {};
let longestTerm = '';
let exactMatch = false;

// remember the longest term that matches the query *exactly*
// Loop over all matching terms/tokens.
Object.keys(match.matchData.metadata).forEach(term => {
if (query.toLowerCase().includes(term.toLowerCase()) && longestTerm.length < term.length) {
// Remember the longest term that matches the query as close as possible.
if (query.includes(term.toLowerCase()) && longestTerm.length < term.length) {
longestTerm = term;
}

Object.keys(match.matchData.metadata[term]).forEach(fieldName => {
const { position: positions } = match.matchData.metadata[term][fieldName];
const { position: positions, tokenContext: tokenContexts } = match.matchData.metadata[
term
][fieldName];

highlights[fieldName] = [...(highlights[fieldName] || []), ...positions];

// Check if we have an *exact match*.
if (!exactMatch && tokenContexts) {
tokenContexts.forEach(tokenContext => {
if (!exactMatch && tokenContext.includes(query)) {
exactMatch = true;
}
});
}
});
});

@@ -76,10 +92,22 @@ export default class Search extends Component {
title: page.frontmatter.title,
url: page.frontmatter.url || page.fields.slug
},
exactMatch,
highlights,
query,
longestTerm
};
});

// Re-order results by the length of the longest matched term and by exact
// match (if applicable). The longer the matched term is, the higher the
// chance the result is more relevant.
return sortBy(
// Sort by longest term.
sortBy(formattedResults, result => -result.longestTerm.length),
// Sort by exact match.
result => result.exactMatch && -1
);
};

handleClear = event => {
@@ -122,3 +150,31 @@ export default class Search extends Component {
);
}
}

// Lunr doesn't support exact multiple-term matching. Meaning "foo bar" will not
// boost a sentence like "Foo bar baz" more than "Baz bar foo". In order to
// provide more accurate results, we store the token context, to see if we can
// perform an "exact match". Unfortunately, we cannot extend the search logic,
// only the tokenizer at *index time*. This is why we store the context as
// meta-data, and post-process the matches before rendering (see above). For
// performance reasons, we only add 2 extra tokens, one in front, one after.
// This means we support "exact matching" for up to 3 terms. More search terms
// would fall back to the regular matching algorithm, which is OK: the more terms
// searched for, the better the standard algorithm will perform anyway. In the
// end, the best would be for Lunr to support multi-term matching, as extending
// the search algorithm for this would be way too complicated.
function tokenContextPlugin(builder) {
const pipelineFunction = (token, index, tokens) => {
const prevToken = tokens[index - 1] || '';
const nextToken = tokens[index + 1] || '';
token.metadata['tokenContext'] = [prevToken.toString(), token.toString(), nextToken.toString()]
.filter(s => s.length)
.join(' ')
.toLowerCase();
return token;
};

lunr.Pipeline.registerFunction(pipelineFunction, 'tokenContext');
builder.pipeline.before(lunr.stemmer, pipelineFunction);
builder.metadataWhitelist.push('tokenContext');
}

+ 14
- 2
server/sonar-web/src/main/js/@types/lunr.d.ts View File

@@ -25,9 +25,20 @@ declare module 'lunr' {

ref(field: string): void;

use(fn: Function): void;

metadataWhitelist?: string[];
}

// Minimal shape of the builder object handed to a Lunr plugin.
// NOTE(review): presumably mirrors `lunr.Builder` — confirm against the Lunr docs.
export interface LunrBuilder {
// Token-processing pipeline; intentionally left untyped in these declarations.
pipeline: any;
// Names of the token metadata keys to keep (e.g. 'position').
metadataWhitelist: string[];
}

// Search index as returned by the `lunr()` factory declared in this module.
export interface LunrIndex {
// Runs the given query string and returns the list of matches.
search(query: string): LunrMatch[];
}

// Initializer callback accepted by `lunr()`; it is typed to be invoked with
// `this` bound to a `Lunr` instance and returns nothing.
export interface LunrInit {
(this: Lunr): void;
}
@@ -38,8 +49,9 @@ declare module 'lunr' {
matchData: { metadata: any };
}

export interface LunrIndex {
search(query: string): LunrMatch[];
export interface LunrToken {
str: string;
metadata: any;
}

function lunr(initializer: LunrInit): LunrIndex;

+ 3
- 8
server/sonar-web/src/main/js/apps/documentation/components/App.tsx View File

@@ -29,6 +29,7 @@ import ScreenPositionHelper from '../../../components/common/ScreenPositionHelpe
import DocMarkdownBlock from '../../../components/docs/DocMarkdownBlock';
import { translate } from '../../../helpers/l10n';
import { isSonarCloud } from '../../../helpers/system';
import { addSideBarClass, removeSideBarClass } from '../../../helpers/pages';
import { DocsNavigationItem } from '../utils';
import '../styles.css';

@@ -41,17 +42,11 @@ export default class App extends React.PureComponent<Props> {
pages = getPages();

componentDidMount() {
const footer = document.getElementById('footer');
if (footer) {
footer.classList.add('page-footer-with-sidebar', 'documentation-footer');
}
addSideBarClass();
}

componentWillUnmount() {
const footer = document.getElementById('footer');
if (footer) {
footer.classList.remove('page-footer-with-sidebar', 'documentation-footer');
}
removeSideBarClass();
}

render() {

+ 1
- 0
server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx View File

@@ -23,6 +23,7 @@ import { Link } from 'react-router';
import { highlightMarks, cutWords, DocumentationEntry } from '../utils';

export interface SearchResult {
exactMatch?: boolean;
highlights: { [field: string]: [number, number][] };
longestTerm: string;
page: DocumentationEntry;

+ 65
- 14
server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx View File

@@ -18,7 +18,7 @@
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
import * as React from 'react';
import lunr, { LunrIndex } from 'lunr';
import lunr, { LunrBuilder, LunrIndex, LunrToken } from 'lunr';
import { sortBy } from 'lodash';
import SearchResultEntry, { SearchResult } from './SearchResultEntry';
import { DocumentationEntry, getUrlsList, DocsNavigationItem } from '../utils';
@@ -36,11 +36,12 @@ export default class SearchResults extends React.PureComponent<Props> {
constructor(props: Props) {
super(props);
this.index = lunr(function() {
this.use(tokenContextPlugin);
this.ref('relativeName');
this.field('title', { boost: 10 });
this.field('text');

this.metadataWhitelist = ['position'];
this.metadataWhitelist = ['position', 'tokenContext'];

props.pages
.filter(page => getUrlsList(props.navigation).includes(page.url))
@@ -49,36 +50,58 @@ export default class SearchResults extends React.PureComponent<Props> {
}

render() {
const { query } = this.props;
const query = this.props.query.toLowerCase();
const results = this.index
.search(`${query}~1 ${query}*`)
.search(
query
.split(/\s+/)
.map(s => `${s}~1 ${s}*`)
.join(' ')
)
.map(match => {
const page = this.props.pages.find(page => page.relativeName === match.ref);
const highlights: { [field: string]: [number, number][] } = {};
let longestTerm = '';
let exactMatch = false;

// remember the longest term that matches the query *exactly*
// Loop over all matching terms/tokens.
Object.keys(match.matchData.metadata).forEach(term => {
if (
query.toLowerCase().includes(term.toLowerCase()) &&
longestTerm.length < term.length
) {
// Remember the longest term that matches the query as close as possible.
if (query.includes(term.toLowerCase()) && longestTerm.length < term.length) {
longestTerm = term;
}

Object.keys(match.matchData.metadata[term]).forEach(fieldName => {
const { position: positions } = match.matchData.metadata[term][fieldName];
const { position: positions, tokenContext: tokenContexts } = match.matchData.metadata[
term
][fieldName];

highlights[fieldName] = [...(highlights[fieldName] || []), ...positions];

// Check if we have an *exact match*.
if (!exactMatch && tokenContexts) {
tokenContexts.forEach((tokenContext: string) => {
if (!exactMatch && tokenContext.includes(query)) {
exactMatch = true;
}
});
}
});
});

return { page, highlights, longestTerm };
return { page, highlights, longestTerm, exactMatch };
})
.filter(result => result.page) as SearchResult[];

// re-order results by the length of the longest matched term
// the longer term is the more chances the result is more relevant
const sortedResults = sortBy(results, result => -result.longestTerm.length);
// Re-order results by the length of the longest matched term and by exact
// match (if applicable). The longer the matched term is, the higher the
// chance the result is more relevant.
const sortedResults = sortBy(
// Sort by longest term.
sortBy(results, result => -result.longestTerm.length),
// Sort by exact match.
result => result.exactMatch && -1
);

return (
<>
@@ -93,3 +116,31 @@ export default class SearchResults extends React.PureComponent<Props> {
);
}
}

// Lunr doesn't support exact multiple-term matching. Meaning "foo bar" will not
// boost a sentence like "Foo bar baz" more than "Baz bar foo". In order to
// provide more accurate results, we store the token context, to see if we can
// perform an "exact match". Unfortunately, we cannot extend the search logic,
// only the tokenizer at *index time*. This is why we store the context as
// meta-data, and post-process the matches before rendering (see above). For
// performance reasons, we only add 2 extra tokens, one in front, one after.
// This means we support "exact matching" for up to 3 terms. More search terms
// would fall back to the regular matching algorithm, which is OK: the more terms
// searched for, the better the standard algorithm will perform anyway. In the
// end, the best would be for Lunr to support multi-term matching, as extending
// the search algorithm for this would be way too complicated.
// Lunr plugin: annotates every token with a lower-cased "<previous> <token> <next>"
// string under the `tokenContext` metadata key and whitelists that key on the
// builder it is given.
function tokenContextPlugin(builder: LunrBuilder) {
  const metadataKey = 'tokenContext';

  // Pipeline step. Receives the current token, its index and the full token
  // list, and returns the same token with its context attached.
  const pipelineFunction = (token: LunrToken, index: number, tokens: LunrToken[]) => {
    // At either end of the list there is no neighbour; fall back to '' so the
    // empty entry is dropped by the filter below.
    const prevToken = tokens[index - 1] || '';
    const nextToken = tokens[index + 1] || '';
    token.metadata[metadataKey] = [prevToken.toString(), token.toString(), nextToken.toString()]
      .filter(s => s.length)
      .join(' ')
      .toLowerCase();
    return token;
  };

  (lunr as any).Pipeline.registerFunction(pipelineFunction, metadataKey);
  // Inserted before the stemmer so the stored context is built from the
  // tokens as they are prior to stemming.
  builder.pipeline.before((lunr as any).stemmer, pipelineFunction);

  // Only whitelist the key once: callers may already have listed it, and a
  // duplicate entry would make the builder record the same metadata twice.
  if (!builder.metadataWhitelist.includes(metadataKey)) {
    builder.metadataWhitelist.push(metadataKey);
  }
}

+ 1
- 1
server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx View File

@@ -37,7 +37,7 @@ export default class Sidebar extends React.PureComponent<Props, State> {
state: State = { query: '' };

handleSearch = (query: string) => {
this.setState({ query });
this.setState({ query: query.trim() });
};

render() {

+ 25
- 5
server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx View File

@@ -28,10 +28,28 @@ jest.mock('lunr', () => ({
{
ref: 'lorem/origin',
matchData: {
metadata: { from: { title: { position: [[19, 5]] }, text: { position: [[121, 4]] } } }
metadata: {
simply: {
title: { position: [[19, 5]] },
text: {
position: [[15, 6], [28, 4]],
tokenContext: ['is simply dummy', 'simply dummy text']
}
}
}
}
},
{ ref: 'foobar', matchData: { metadata: { from: { title: { position: [[23, 4]] } } } } }
{
ref: 'foobar',
matchData: {
metadata: {
simply: {
title: { position: [[23, 4]] },
text: { position: [[111, 6], [118, 4]], tokenContext: ['keywords simply text'] }
}
}
}
}
])
}))
}));
@@ -54,7 +72,7 @@ const pages = [
createPage(
'Where does Foobar come from?',
'foobar',
'Foobar is a universal variable understood to represent whatever is being discussed.'
'Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.'
)
];

@@ -63,11 +81,13 @@ it('should search', () => {
<SearchResults
navigation={['lorem/index', 'lorem/origin', 'foobar']}
pages={pages}
query="from"
query="simply text"
splat="foobar"
/>
);
expect(wrapper).toMatchSnapshot();
expect(lunr).toBeCalled();
expect((wrapper.instance() as SearchResults).index.search).toBeCalledWith('from~1 from*');
expect((wrapper.instance() as SearchResults).index.search).toBeCalledWith(
'simply~1 simply* text~1 text*'
);
});

+ 37
- 21
server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap View File

@@ -3,57 +3,73 @@
exports[`should search 1`] = `
<Fragment>
<SearchResultEntry
active={false}
key="lorem/origin"
active={true}
key="foobar"
result={
Object {
"exactMatch": true,
"highlights": Object {
"text": Array [
Array [
121,
111,
6,
],
Array [
118,
4,
],
],
"title": Array [
Array [
19,
5,
23,
4,
],
],
},
"longestTerm": "from",
"longestTerm": "simply",
"page": Object {
"content": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
"content": "Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.",
"navTitle": undefined,
"relativeName": "lorem/origin",
"text": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
"title": "Where does it come from?",
"url": "/lorem/origin",
"relativeName": "foobar",
"text": "Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.",
"title": "Where does Foobar come from?",
"url": "/foobar",
},
}
}
/>
<SearchResultEntry
active={true}
key="foobar"
active={false}
key="lorem/origin"
result={
Object {
"exactMatch": false,
"highlights": Object {
"title": Array [
"text": Array [
Array [
23,
15,
6,
],
Array [
28,
4,
],
],
"title": Array [
Array [
19,
5,
],
],
},
"longestTerm": "from",
"longestTerm": "simply",
"page": Object {
"content": "Foobar is a universal variable understood to represent whatever is being discussed.",
"content": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
"navTitle": undefined,
"relativeName": "foobar",
"text": "Foobar is a universal variable understood to represent whatever is being discussed.",
"title": "Where does Foobar come from?",
"url": "/foobar",
"relativeName": "lorem/origin",
"text": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
"title": "Where does it come from?",
"url": "/lorem/origin",
},
}
}

Loading…
Cancel
Save