SONAR-11472 Add support for exact search-query matching

author Wouter Admiraal <wouter.admiraal@sonarsource.com>

Wed, 26 Dec 2018 11:49:27 +0000 (12:49 +0100)

committer SonarTech <sonartech@sonarsource.com>

Thu, 10 Jan 2019 19:21:02 +0000 (20:21 +0100)
author Wouter Admiraal <wouter.admiraal@sonarsource.com>
Wed, 26 Dec 2018 11:49:27 +0000 (12:49 +0100)
committer SonarTech <sonartech@sonarsource.com>
Thu, 10 Jan 2019 19:21:02 +0000 (20:21 +0100)
diff --git a/server/sonar-docs/src/layouts/components/Search.js b/server/sonar-docs/src/layouts/components/Search.js

index 72d52b712c504138a22de7209d53d5b2c6d7302b..fe09b0f3d0bcc7100346ee31322814fc4e56f6ae 100644 (file)
--- a/server/sonar-docs/src/layouts/components/Search.js
+++ b/server/sonar-docs/src/layouts/components/Search.js
@@ -19,6 +19,7 @@
   */
  import React, { Component } from 'react';
  import lunr from 'lunr';
+import { sortBy } from 'lodash';
  import ClearIcon from './icons/ClearIcon';
  import { getUrlsList } from '../utils';
  
@@ -31,11 +32,12 @@ export default class Search extends Component {
      super(props);
      this.state = { value: '' };
      this.index = lunr(function() {
+      this.use(tokenContextPlugin);
        this.ref('id');
        this.field('title', { boost: 10 });
        this.field('text');
  
-      this.metadataWhitelist = ['position'];
+      this.metadataWhitelist = ['position', 'tokenContext'];
  
        props.pages
          .filter(page =>
@@ -52,20 +54,34 @@ export default class Search extends Component {
    }
  
    getFormattedResults = (query, results) => {
-    return results.map(match => {
+    const formattedResults = results.map(match => {
        const page = this.props.pages.find(page => page.id === match.ref);
        const highlights = {};
        let longestTerm = '';
+      let exactMatch = false;
  
-      // remember the longest term that matches the query *exactly*
+      // Loop over all matching terms/tokens.
        Object.keys(match.matchData.metadata).forEach(term => {
-        if (query.toLowerCase().includes(term.toLowerCase()) && longestTerm.length < term.length) {
+        // Remember the longest term that matches the query as close as possible.
+        if (query.includes(term.toLowerCase()) && longestTerm.length < term.length) {
            longestTerm = term;
          }
  
          Object.keys(match.matchData.metadata[term]).forEach(fieldName => {
-          const { position: positions } = match.matchData.metadata[term][fieldName];
+          const { position: positions, tokenContext: tokenContexts } = match.matchData.metadata[
+            term
+          ][fieldName];
+
            highlights[fieldName] = [...(highlights[fieldName] || []), ...positions];
+
+          // Check if we have an *exact match*.
+          if (!exactMatch && tokenContexts) {
+            tokenContexts.forEach(tokenContext => {
+              if (!exactMatch && tokenContext.includes(query)) {
+                exactMatch = true;
+              }
+            });
+          }
          });
        });
  
@@ -76,10 +92,22 @@ export default class Search extends Component {
            title: page.frontmatter.title,
            url: page.frontmatter.url || page.fields.slug
          },
+        exactMatch,
          highlights,
+        query,
          longestTerm
        };
      });
+
+    // Re-order results by the length of the longest matched term and by exact
+    // match (if applicable). The longer the matched term is, the higher the
+    // chance the result is more relevant.
+    return sortBy(
+      // Sort by longest term.
+      sortBy(formattedResults, result => -result.longestTerm.length),
+      // Sort by exact match.
+      result => result.exactMatch && -1
+    );
    };
  
    handleClear = event => {
@@ -122,3 +150,31 @@ export default class Search extends Component {
      );
    }
  }
+
+// Lunr doesn't support exact multiple-term matching. Meaning "foo bar" will not
+// boost a sentence like "Foo bar baz" more than "Baz bar foo". In order to
+// provide more accurate results, we store the token context, to see if we can
+// perform an "exact match". Unfortunately, we cannot extend the search logic,
+// only the tokenizer at *index time*. This is why we store the context as
+// meta-data, and post-process the matches before rendering (see above). For
+// performance reasons, we only add 2 extra tokens, one in front, one after.
+// This means we support "exact matching" for up to 3 terms. More search terms
+// would fall back to the regular matching algorithm, which is OK: the more terms
+// searched for, the better the standard algorithm will perform anyway. In the
+// end, the best would be for Lunr to support multi-term matching, as extending
+// the search algorithm for this would be way too complicated.
+function tokenContextPlugin(builder) {
+  const pipelineFunction = (token, index, tokens) => {
+    const prevToken = tokens[index - 1] || '';
+    const nextToken = tokens[index + 1] || '';
+    token.metadata['tokenContext'] = [prevToken.toString(), token.toString(), nextToken.toString()]
+      .filter(s => s.length)
+      .join(' ')
+      .toLowerCase();
+    return token;
+  };
+
+  lunr.Pipeline.registerFunction(pipelineFunction, 'tokenContext');
+  builder.pipeline.before(lunr.stemmer, pipelineFunction);
+  builder.metadataWhitelist.push('tokenContext');
+}
diff --git a/server/sonar-web/src/main/js/@types/lunr.d.ts b/server/sonar-web/src/main/js/@types/lunr.d.ts

index c9c219b0a06c22e909026497bfb1f28592d383ad..26e2068abd47e086dce1b1a9997e75390c191fed 100644 (file)
--- a/server/sonar-web/src/main/js/@types/lunr.d.ts
+++ b/server/sonar-web/src/main/js/@types/lunr.d.ts
@@ -25,9 +25,20 @@ declare module 'lunr' {
  
      ref(field: string): void;
  
+    use(fn: Function): void;
+
      metadataWhitelist?: string[];
    }
  
+  export interface LunrBuilder {
+    pipeline: any;
+    metadataWhitelist: string[];
+  }
+
+  export interface LunrIndex {
+    search(query: string): LunrMatch[];
+  }
+
    export interface LunrInit {
      (this: Lunr): void;
    }
@@ -38,8 +49,9 @@ declare module 'lunr' {
      matchData: { metadata: any };
    }
  
-  export interface LunrIndex {
-    search(query: string): LunrMatch[];
+  export interface LunrToken {
+    str: string;
+    metadata: any;
    }
  
    function lunr(initializer: LunrInit): LunrIndex;
diff --git a/server/sonar-web/src/main/js/apps/documentation/components/App.tsx b/server/sonar-web/src/main/js/apps/documentation/components/App.tsx

index 7ce4f0c79ad172f3c654c5db72d8c7327edfd666..923be34ed2ecbfc6ba58742f09d08344b1168ff7 100644 (file)
--- a/server/sonar-web/src/main/js/apps/documentation/components/App.tsx
+++ b/server/sonar-web/src/main/js/apps/documentation/components/App.tsx
@@ -29,6 +29,7 @@ import ScreenPositionHelper from '../../../components/common/ScreenPositionHelpe
  import DocMarkdownBlock from '../../../components/docs/DocMarkdownBlock';
  import { translate } from '../../../helpers/l10n';
  import { isSonarCloud } from '../../../helpers/system';
+import { addSideBarClass, removeSideBarClass } from '../../../helpers/pages';
  import { DocsNavigationItem } from '../utils';
  import '../styles.css';
  
@@ -41,17 +42,11 @@ export default class App extends React.PureComponent<Props> {
    pages = getPages();
  
    componentDidMount() {
-    const footer = document.getElementById('footer');
-    if (footer) {
-      footer.classList.add('page-footer-with-sidebar', 'documentation-footer');
-    }
+    addSideBarClass();
    }
  
    componentWillUnmount() {
-    const footer = document.getElementById('footer');
-    if (footer) {
-      footer.classList.remove('page-footer-with-sidebar', 'documentation-footer');
-    }
+    removeSideBarClass();
    }
  
    render() {
diff --git a/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx b/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx

index 09f6c084025261ddfeaa2d6e5b50a9386dbdbdfa..a47e71e0041309f72e58f6ea89809e6a3bd279c8 100644 (file)
--- a/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx
+++ b/server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx
@@ -23,6 +23,7 @@ import { Link } from 'react-router';
  import { highlightMarks, cutWords, DocumentationEntry } from '../utils';
  
  export interface SearchResult {
+  exactMatch?: boolean;
    highlights: { [field: string]: [number, number][] };
    longestTerm: string;
    page: DocumentationEntry;
diff --git a/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx b/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx

index 7e68f234c01971bc8479a8979c6b44f2d8774b57..7a0be949f76c1fd949e68a696127c1863d3724e0 100644 (file)
--- a/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx
+++ b/server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx
@@ -18,7 +18,7 @@
   * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
   */
  import * as React from 'react';
-import lunr, { LunrIndex } from 'lunr';
+import lunr, { LunrBuilder, LunrIndex, LunrToken } from 'lunr';
  import { sortBy } from 'lodash';
  import SearchResultEntry, { SearchResult } from './SearchResultEntry';
  import { DocumentationEntry, getUrlsList, DocsNavigationItem } from '../utils';
@@ -36,11 +36,12 @@ export default class SearchResults extends React.PureComponent<Props> {
    constructor(props: Props) {
      super(props);
      this.index = lunr(function() {
+      this.use(tokenContextPlugin);
        this.ref('relativeName');
        this.field('title', { boost: 10 });
        this.field('text');
  
-      this.metadataWhitelist = ['position'];
+      this.metadataWhitelist = ['position', 'tokenContext'];
  
        props.pages
          .filter(page => getUrlsList(props.navigation).includes(page.url))
@@ -49,36 +50,58 @@ export default class SearchResults extends React.PureComponent<Props> {
    }
  
    render() {
-    const { query } = this.props;
+    const query = this.props.query.toLowerCase();
      const results = this.index
-      .search(`${query}~1 ${query}*`)
+      .search(
+        query
+          .split(/\s+/)
+          .map(s => `${s}~1 ${s}*`)
+          .join(' ')
+      )
        .map(match => {
          const page = this.props.pages.find(page => page.relativeName === match.ref);
          const highlights: { [field: string]: [number, number][] } = {};
          let longestTerm = '';
+        let exactMatch = false;
  
-        // remember the longest term that matches the query *exactly*
+        // Loop over all matching terms/tokens.
          Object.keys(match.matchData.metadata).forEach(term => {
-          if (
-            query.toLowerCase().includes(term.toLowerCase()) &&
-            longestTerm.length < term.length
-          ) {
+          // Remember the longest term that matches the query as close as possible.
+          if (query.includes(term.toLowerCase()) && longestTerm.length < term.length) {
              longestTerm = term;
            }
  
            Object.keys(match.matchData.metadata[term]).forEach(fieldName => {
-            const { position: positions } = match.matchData.metadata[term][fieldName];
+            const { position: positions, tokenContext: tokenContexts } = match.matchData.metadata[
+              term
+            ][fieldName];
+
              highlights[fieldName] = [...(highlights[fieldName] || []), ...positions];
+
+            // Check if we have an *exact match*.
+            if (!exactMatch && tokenContexts) {
+              tokenContexts.forEach((tokenContext: string) => {
+                if (!exactMatch && tokenContext.includes(query)) {
+                  exactMatch = true;
+                }
+              });
+            }
            });
          });
  
-        return { page, highlights, longestTerm };
+        return { page, highlights, longestTerm, exactMatch };
        })
        .filter(result => result.page) as SearchResult[];
  
-    // re-order results by the length of the longest matched term
-    // the longer term is the more chances the result is more relevant
-    const sortedResults = sortBy(results, result => -result.longestTerm.length);
+    // Re-order results by the length of the longest matched term and by exact
+    // match (if applicable). The longer the matched term is, the higher the
+    // chance the result is more relevant.
+    const sortedResults = sortBy(
+      // Sort by longest term.
+      sortBy(results, result => -result.longestTerm.length),
+      // Sort by exact match.
+      result => result.exactMatch && -1
+    );
  
      return (
        <>
@@ -93,3 +116,31 @@ export default class SearchResults extends React.PureComponent<Props> {
      );
    }
  }
+
+// Lunr doesn't support exact multiple-term matching. Meaning "foo bar" will not
+// boost a sentence like "Foo bar baz" more than "Baz bar foo". In order to
+// provide more accurate results, we store the token context, to see if we can
+// perform an "exact match". Unfortunately, we cannot extend the search logic,
+// only the tokenizer at *index time*. This is why we store the context as
+// meta-data, and post-process the matches before rendering (see above). For
+// performance reasons, we only add 2 extra tokens, one in front, one after.
+// This means we support "exact matching" for up to 3 terms. More search terms
+// would fall back to the regular matching algorithm, which is OK: the more terms
+// searched for, the better the standard algorithm will perform anyway. In the
+// end, the best would be for Lunr to support multi-term matching, as extending
+// the search algorithm for this would be way too complicated.
+function tokenContextPlugin(builder: LunrBuilder) {
+  const pipelineFunction = (token: LunrToken, index: number, tokens: LunrToken[]) => {
+    const prevToken = tokens[index - 1] || '';
+    const nextToken = tokens[index + 1] || '';
+    token.metadata['tokenContext'] = [prevToken.toString(), token.toString(), nextToken.toString()]
+      .filter(s => s.length)
+      .join(' ')
+      .toLowerCase();
+    return token;
+  };
+
+  (lunr as any).Pipeline.registerFunction(pipelineFunction, 'tokenContext');
+  builder.pipeline.before((lunr as any).stemmer, pipelineFunction);
+  builder.metadataWhitelist.push('tokenContext');
+}
diff --git a/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx b/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx

index ba501d825ccf5369446d0e6dc9a5d5aa7097d2db..5dfdfe20e7f9c2d7600ea2be575bf3eb11414a34 100644 (file)
--- a/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx
+++ b/server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx
@@ -37,7 +37,7 @@ export default class Sidebar extends React.PureComponent<Props, State> {
    state: State = { query: '' };
  
    handleSearch = (query: string) => {
-    this.setState({ query });
+    this.setState({ query: query.trim() });
    };
  
    render() {
diff --git a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx

index 5706ed4131f1978c2aa24e38d0509fce4e4175d3..7a55e8c44d0d50b9901578be455512625ec1763f 100644 (file)
--- a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx
+++ b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx
@@ -28,10 +28,28 @@ jest.mock('lunr', () => ({
        {
          ref: 'lorem/origin',
          matchData: {
-          metadata: { from: { title: { position: [[19, 5]] }, text: { position: [[121, 4]] } } }
+          metadata: {
+            simply: {
+              title: { position: [[19, 5]] },
+              text: {
+                position: [[15, 6], [28, 4]],
+                tokenContext: ['is simply dummy', 'simply dummy text']
+              }
+            }
+          }
          }
        },
-      { ref: 'foobar', matchData: { metadata: { from: { title: { position: [[23, 4]] } } } } }
+      {
+        ref: 'foobar',
+        matchData: {
+          metadata: {
+            simply: {
+              title: { position: [[23, 4]] },
+              text: { position: [[111, 6], [118, 4]], tokenContext: ['keywords simply text'] }
+            }
+          }
+        }
+      }
      ])
    }))
  }));
@@ -54,7 +72,7 @@ const pages = [
    createPage(
      'Where does Foobar come from?',
      'foobar',
-    'Foobar is a universal variable understood to represent whatever is being discussed.'
+    'Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.'
    )
  ];
  
@@ -63,11 +81,13 @@ it('should search', () => {
      <SearchResults
        navigation={['lorem/index', 'lorem/origin', 'foobar']}
        pages={pages}
-      query="from"
+      query="simply text"
        splat="foobar"
      />
    );
    expect(wrapper).toMatchSnapshot();
    expect(lunr).toBeCalled();
-  expect((wrapper.instance() as SearchResults).index.search).toBeCalledWith('from~1 from*');
+  expect((wrapper.instance() as SearchResults).index.search).toBeCalledWith(
+    'simply~1 simply* text~1 text*'
+  );
  });
diff --git a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap

index 4a79799119b75e6bff20d6b8cd8138bc07ef4e2f..f82b3fdda85deca509bf92c7c5e8e69dec6fa301 100644 (file)
--- a/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap
+++ b/server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap
@@ -3,57 +3,73 @@
  exports[`should search 1`] = `
  <Fragment>
    <SearchResultEntry
-    active={false}
-    key="lorem/origin"
+    active={true}
+    key="foobar"
      result={
        Object {
+        "exactMatch": true,
          "highlights": Object {
            "text": Array [
              Array [
-              121,
+              111,
+              6,
+            ],
+            Array [
+              118,
                4,
              ],
            ],
            "title": Array [
              Array [
-              19,
-              5,
+              23,
+              4,
              ],
            ],
          },
-        "longestTerm": "from",
+        "longestTerm": "simply",
          "page": Object {
-          "content": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
+          "content": "Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.",
            "navTitle": undefined,
-          "relativeName": "lorem/origin",
-          "text": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
-          "title": "Where does it come from?",
-          "url": "/lorem/origin",
+          "relativeName": "foobar",
+          "text": "Foobar is a universal variable understood to represent whatever is being discussed. Now we need some keywords: simply text.",
+          "title": "Where does Foobar come from?",
+          "url": "/foobar",
          },
        }
      }
    />
    <SearchResultEntry
-    active={true}
-    key="foobar"
+    active={false}
+    key="lorem/origin"
      result={
        Object {
+        "exactMatch": false,
          "highlights": Object {
-          "title": Array [
+          "text": Array [
              Array [
-              23,
+              15,
+              6,
+            ],
+            Array [
+              28,
                4,
              ],
            ],
+          "title": Array [
+            Array [
+              19,
+              5,
+            ],
+          ],
          },
-        "longestTerm": "from",
+        "longestTerm": "simply",
          "page": Object {
-          "content": "Foobar is a universal variable understood to represent whatever is being discussed.",
+          "content": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
            "navTitle": undefined,
-          "relativeName": "foobar",
-          "text": "Foobar is a universal variable understood to represent whatever is being discussed.",
-          "title": "Where does Foobar come from?",
-          "url": "/foobar",
+          "relativeName": "lorem/origin",
+          "text": "Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words.",
+          "title": "Where does it come from?",
+          "url": "/lorem/origin",
          },
        }
      }
author	Wouter Admiraal <wouter.admiraal@sonarsource.com>
	Wed, 26 Dec 2018 11:49:27 +0000 (12:49 +0100)
committer	SonarTech <sonartech@sonarsource.com>
	Thu, 10 Jan 2019 19:21:02 +0000 (20:21 +0100)
server/sonar-docs/src/layouts/components/Search.js		patch \| blob \| history
server/sonar-web/src/main/js/@types/lunr.d.ts		patch \| blob \| history
server/sonar-web/src/main/js/apps/documentation/components/App.tsx		patch \| blob \| history
server/sonar-web/src/main/js/apps/documentation/components/SearchResultEntry.tsx		patch \| blob \| history
server/sonar-web/src/main/js/apps/documentation/components/SearchResults.tsx		patch \| blob \| history
server/sonar-web/src/main/js/apps/documentation/components/Sidebar.tsx		patch \| blob \| history
server/sonar-web/src/main/js/apps/documentation/components/__tests__/SearchResults-test.tsx		patch \| blob \| history
server/sonar-web/src/main/js/apps/documentation/components/__tests__/__snapshots__/SearchResults-test.tsx.snap		patch \| blob \| history