Allows searching for partial names, and enables an intelligent "English" search in the description. Special syntax (like "+(a|b|c)") is no longer interpreted.
public static final String TOKENIZER = "tokenizer";
public static final String FILTER = "filter";
+ public static final String CHAR_FILTER = "char_filter";
public static final String ANALYZER = "analyzer";
public static final String SEARCH_ANALYZER = "search_analyzer";
public static final String MIN_GRAM = "min_gram";
public static final String MAX_GRAM = "max_gram";
public static final String LENGTH = "length";
+ public static final String HTML_STRIP = "html_strip";
private DefaultIndexSettings() {
// only static stuff
import static org.sonar.server.es.DefaultIndexSettings.ANALYSIS;
import static org.sonar.server.es.DefaultIndexSettings.ANALYZER;
import static org.sonar.server.es.DefaultIndexSettings.ASCIIFOLDING;
+import static org.sonar.server.es.DefaultIndexSettings.CHAR_FILTER;
import static org.sonar.server.es.DefaultIndexSettings.DELIMITER;
-import static org.sonar.server.es.DefaultIndexSettings.FIELD_FIELDDATA;
import static org.sonar.server.es.DefaultIndexSettings.FIELDDATA_ENABLED;
+import static org.sonar.server.es.DefaultIndexSettings.FIELD_FIELDDATA;
import static org.sonar.server.es.DefaultIndexSettings.FIELD_TYPE_TEXT;
import static org.sonar.server.es.DefaultIndexSettings.FILTER;
+import static org.sonar.server.es.DefaultIndexSettings.HTML_STRIP;
import static org.sonar.server.es.DefaultIndexSettings.INDEX;
import static org.sonar.server.es.DefaultIndexSettings.INDEX_SEARCHABLE;
import static org.sonar.server.es.DefaultIndexSettings.KEYWORD;
protected void setup() {
set(TOKENIZER, STANDARD);
setArray(FILTER, STANDARD, LOWERCASE, STOP, ASCIIFOLDING, PORTER_STEM);
- setArray("char_filter", "html_strip");
+ setArray(CHAR_FILTER, HTML_STRIP);
+ }
+
+ @Override
+ public SortedMap<String, String> fieldMapping() {
+ return ImmutableSortedMap.of(
+ TYPE, FIELD_TYPE_TEXT,
+ INDEX, INDEX_SEARCHABLE,
+ ANALYZER, getName());
}
},
PATH_ANALYZER(ANALYZER) {
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
-import org.apache.commons.lang.StringUtils;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
-import org.sonar.core.util.stream.MoreCollectors;
-import org.sonar.server.es.DefaultIndexSettings;
import org.sonar.server.es.textsearch.ComponentTextSearchFeature.UseCase;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
-import static org.sonar.server.es.DefaultIndexSettings.MINIMUM_NGRAM_LENGTH;
+import static org.sonar.server.es.textsearch.JavaTokenizer.split;
/**
* This class is used in order to do some advanced full text search in an index on component key and component name
this.favoriteKeys = builder.favoriteKeys;
}
- private static List<String> split(String queryText) {
- return Arrays.stream(
- queryText.split(DefaultIndexSettings.SEARCH_TERM_TOKENIZER_PATTERN))
- .filter(StringUtils::isNotEmpty)
- .filter(s -> s.length() >= MINIMUM_NGRAM_LENGTH)
- .collect(MoreCollectors.toList());
- }
-
public String getQueryText() {
return queryText;
}
--- /dev/null
+/*
+ * SonarQube
+ * Copyright (C) 2009-2017 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+package org.sonar.server.es.textsearch;
+
+import java.util.Arrays;
+import java.util.List;
+import org.apache.commons.lang.StringUtils;
+import org.sonar.core.util.stream.MoreCollectors;
+import org.sonar.server.es.DefaultIndexSettings;
+
+import static org.sonar.server.es.DefaultIndexSettings.MINIMUM_NGRAM_LENGTH;
+
+/**
+ * Splits a text query into its tokens, so that they can be used in n_gram match queries later.
+ * <p>
+ * {@link #split(String)} cuts the query text with {@link DefaultIndexSettings#SEARCH_TERM_TOKENIZER_PATTERN},
+ * then drops empty tokens and tokens shorter than {@link DefaultIndexSettings#MINIMUM_NGRAM_LENGTH}.
+ * The query text must not be null.
+ */
+public class JavaTokenizer {
+
+ private JavaTokenizer() {
+ // utility class: only static methods, never instantiated
+ }
+
+ public static List<String> split(String queryText) {
+ return Arrays.stream(
+ queryText.split(DefaultIndexSettings.SEARCH_TERM_TOKENIZER_PATTERN))
+ .filter(StringUtils::isNotEmpty)
+ .filter(s -> s.length() >= MINIMUM_NGRAM_LENGTH)
+ .collect(MoreCollectors.toList());
+ }
+}
import org.sonar.core.util.stream.MoreCollectors;
import org.sonar.db.organization.OrganizationDto;
import org.sonar.db.qualityprofile.QProfileDto;
+import org.sonar.server.es.DefaultIndexSettings;
import org.sonar.server.es.EsClient;
import org.sonar.server.es.EsUtils;
import org.sonar.server.es.SearchIdResult;
import org.sonar.server.es.SearchOptions;
import org.sonar.server.es.StickyFacetBuilder;
+import org.sonar.server.es.textsearch.JavaTokenizer;
import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.Boolean.FALSE;
import static java.util.Optional.ofNullable;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
+import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
-import static org.elasticsearch.index.query.QueryBuilders.simpleQueryStringQuery;
import static org.elasticsearch.index.query.QueryBuilders.termsQuery;
+import static org.sonar.server.es.DefaultIndexSettingsElement.ENGLISH_HTML_ANALYZER;
+import static org.sonar.server.es.DefaultIndexSettingsElement.SEARCH_GRAMS_ANALYZER;
import static org.sonar.server.es.DefaultIndexSettingsElement.SEARCH_WORDS_ANALYZER;
import static org.sonar.server.es.DefaultIndexSettingsElement.SORTABLE_ANALYZER;
import static org.sonar.server.es.EsUtils.SCROLL_TIME_IN_MINUTES;
BoolQueryBuilder qb = boolQuery();
String queryString = query.getQueryText();
- // Human readable type of querying
- qb.should(simpleQueryStringQuery(query.getQueryText())
- .field(SEARCH_WORDS_ANALYZER.subField(FIELD_RULE_NAME), 20f)
- .field(FIELD_RULE_HTML_DESCRIPTION, 3f)
- .defaultOperator(Operator.AND)).boost(20f);
+ if (queryString != null && !queryString.isEmpty()) {
+ BoolQueryBuilder textQuery = boolQuery();
+ JavaTokenizer.split(queryString)
+ .stream().map(token -> boolQuery().should(
+ matchQuery(
+ SEARCH_GRAMS_ANALYZER.subField(FIELD_RULE_NAME),
+ StringUtils.left(token, DefaultIndexSettings.MAXIMUM_NGRAM_LENGTH)
+ ).boost(20f)).should(
+ matchPhraseQuery(
+ ENGLISH_HTML_ANALYZER.subField(FIELD_RULE_HTML_DESCRIPTION),
+ StringUtils.left(token, DefaultIndexSettings.MAXIMUM_NGRAM_LENGTH)
+ ).boost(3f))
+ ).forEach(textQuery::must);
+ qb.should(textQuery.boost(20f));
+ }
// Match and partial Match queries
// Search by key uses the "sortable" sub-field as it requires to be case-insensitive (lower-case filtering)
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.ImmutableSortedMap;
import java.util.Set;
import org.sonar.api.config.Configuration;
-import org.sonar.server.es.DefaultIndexSettings;
import org.sonar.server.es.IndexDefinition;
import org.sonar.server.es.IndexType;
import org.sonar.server.es.NewIndex;
import static org.sonar.server.es.DefaultIndexSettingsElement.ENGLISH_HTML_ANALYZER;
-import static org.sonar.server.es.DefaultIndexSettingsElement.SEARCH_WORDS_ANALYZER;
+import static org.sonar.server.es.DefaultIndexSettingsElement.SEARCH_GRAMS_ANALYZER;
import static org.sonar.server.es.DefaultIndexSettingsElement.SORTABLE_ANALYZER;
import static org.sonar.server.es.NewIndex.SettingsConfiguration.MANUAL_REFRESH_INTERVAL;
import static org.sonar.server.es.NewIndex.SettingsConfiguration.newBuilder;
ruleMapping.keywordFieldBuilder(FIELD_RULE_REPOSITORY).build();
ruleMapping.keywordFieldBuilder(FIELD_RULE_INTERNAL_KEY).disableNorms().disableSearch().build();
- ruleMapping.keywordFieldBuilder(FIELD_RULE_NAME).addSubFields(SORTABLE_ANALYZER, SEARCH_WORDS_ANALYZER).build();
- ruleMapping.setProperty(FIELD_RULE_HTML_DESCRIPTION, ImmutableSortedMap.of(
- DefaultIndexSettings.TYPE, DefaultIndexSettings.FIELD_TYPE_TEXT,
- DefaultIndexSettings.INDEX, DefaultIndexSettings.INDEX_SEARCHABLE,
- DefaultIndexSettings.ANALYZER, ENGLISH_HTML_ANALYZER.getName(),
- DefaultIndexSettings.SEARCH_ANALYZER, ENGLISH_HTML_ANALYZER.getName()));
+ ruleMapping.keywordFieldBuilder(FIELD_RULE_NAME).addSubFields(SORTABLE_ANALYZER, SEARCH_GRAMS_ANALYZER).build();
+ ruleMapping.keywordFieldBuilder(FIELD_RULE_HTML_DESCRIPTION).addSubFields(ENGLISH_HTML_ANALYZER).build();
ruleMapping.keywordFieldBuilder(FIELD_RULE_SEVERITY).disableNorms().build();
ruleMapping.keywordFieldBuilder(FIELD_RULE_STATUS).disableNorms().build();
ruleMapping.keywordFieldBuilder(FIELD_RULE_LANGUAGE).disableNorms().build();
import org.sonar.server.es.NewIndex;
import static org.assertj.core.api.Assertions.assertThat;
+import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
+import static org.sonar.server.es.DefaultIndexSettingsElement.ENGLISH_HTML_ANALYZER;
import static org.sonar.server.rule.index.RuleIndexDefinition.FIELD_RULE_HTML_DESCRIPTION;
import static org.sonar.server.rule.index.RuleIndexDefinition.FIELD_RULE_KEY;
import static org.sonar.server.rule.index.RuleIndexDefinition.FIELD_RULE_REPOSITORY;
-import static org.sonar.server.rule.index.RuleIndexDefinition.INDEX_TYPE_RULE;;
+import static org.sonar.server.rule.index.RuleIndexDefinition.INDEX_TYPE_RULE;
public class RuleIndexDefinitionTest {
@Test
public void support_long_html_description() throws Exception {
- String longText = StringUtils.repeat("hello ", 10_000);
+ String longText = StringUtils.repeat("The quick brown fox jumps over the lazy dog ", 700);
+
+ List<AnalyzeResponse.AnalyzeToken> tokens = analyzeIndexedTokens(longText);
+ assertThat(tokens).extracting(AnalyzeResponse.AnalyzeToken::getTerm).containsOnly(
+ "quick", "brown", "fox", "jump", "over", "lazi", "dog"
+ );
+
// the following method fails if PUT fails
tester.putDocuments(INDEX_TYPE_RULE, new RuleDoc(ImmutableMap.of(
FIELD_RULE_HTML_DESCRIPTION, longText,
FIELD_RULE_REPOSITORY, "squid",
FIELD_RULE_KEY, "squid:S001")));
assertThat(tester.countDocuments(INDEX_TYPE_RULE)).isEqualTo(1);
-
- List<AnalyzeResponse.AnalyzeToken> tokens = analyzeIndexedTokens(longText);
- for (AnalyzeResponse.AnalyzeToken token : tokens) {
- assertThat(token.getTerm().length()).isEqualTo("hello".length());
- }
+ assertThat(tester.client().prepareSearch(INDEX_TYPE_RULE.getIndex()).setQuery(matchQuery(ENGLISH_HTML_ANALYZER.subField(FIELD_RULE_HTML_DESCRIPTION), "brown fox jumps lazy"))
+ .get().getHits().getTotalHits()).isEqualTo(1);
}
@Test
private List<AnalyzeResponse.AnalyzeToken> analyzeIndexedTokens(String text) {
return tester.client().nativeClient().admin().indices().prepareAnalyze(INDEX_TYPE_RULE.getIndex(),
text)
- .setField(FIELD_RULE_HTML_DESCRIPTION)
+ .setField(ENGLISH_HTML_ANALYZER.subField(FIELD_RULE_HTML_DESCRIPTION))
.execute().actionGet().getTokens();
}
}
assertThat(results).containsOnly(rule.getKey());
}
+ @Test
+ public void search_content_by_query() {
+ RuleDefinitionDto rule1 = createRule(rule -> rule.setRuleKey("123").setDescription("My great rule CWE-123 which makes your code 1000 times better!"));
+ RuleDefinitionDto rule2 = createRule(rule -> rule.setRuleKey("124").setDescription("Another great and shiny rule CWE-124"));
+ RuleDefinitionDto rule3 = createRule(rule -> rule.setRuleKey("1000").setDescription("Another great rule CWE-1000"));
+ index();
+
+ // partial match at word boundary
+ assertThat(underTest.search(new RuleQuery().setQueryText("CWE"), new SearchOptions()).getIds()).containsExactlyInAnyOrder(rule1.getKey(), rule2.getKey(), rule3.getKey());
+
+ // full match
+ assertThat(underTest.search(new RuleQuery().setQueryText("CWE-123"), new SearchOptions()).getIds()).containsExactly(rule1.getKey());
+
+ // match somewhere else in the text
+ assertThat(underTest.search(new RuleQuery().setQueryText("CWE-1000"), new SearchOptions()).getIds()).containsExactly(rule3.getKey());
+ assertThat(underTest.search(new RuleQuery().setQueryText("CWE 1000"), new SearchOptions()).getIds()).containsExactlyInAnyOrder(rule3.getKey(), rule1.getKey());
+
+ // several words
+ assertThat(underTest.search(new RuleQuery().setQueryText("great rule"), new SearchOptions()).getIds()).containsExactlyInAnyOrder(rule1.getKey(), rule2.getKey(), rule3.getKey());
+ assertThat(underTest.search(new RuleQuery().setQueryText("rule Another"), new SearchOptions()).getIds()).containsExactlyInAnyOrder(rule2.getKey(), rule3.getKey());
+
+ // no matches
+ assertThat(underTest.search(new RuleQuery().setQueryText("unexisting"), new SearchOptions()).getIds()).isEmpty();
+ assertThat(underTest.search(new RuleQuery().setQueryText("great rule unexisting"), new SearchOptions()).getIds()).isEmpty();
+
+ // stopwords
+ assertThat(underTest.search(new RuleQuery().setQueryText("and"), new SearchOptions()).getIds()).isEmpty();
+ assertThat(underTest.search(new RuleQuery().setQueryText("great and shiny"), new SearchOptions()).getIds()).isEmpty();
+ }
+
@Test
public void search_by_any_of_repositories() {
RuleDefinitionDto findbugs = createRule(
verifyNoResults(r -> r.setParam(PARAM_RULE_KEY, "missing"));
}
+ @Test
+ public void filter_by_rule_name() {
+ RuleDefinitionDto rule1 = db.rules().insert(r1 -> r1.setName("Best rule ever"));
+ RuleDefinitionDto rule2 = db.rules().insert(r1 -> r1.setName("Some other stuff"));
+ indexRules();
+
+ verify(r -> r.setParam("q", "Be"), rule1);
+ verify(r -> r.setParam("q", "Bes"), rule1);
+ verify(r -> r.setParam("q", "Best"), rule1);
+ verify(r -> r.setParam("q", "Best "), rule1);
+ verify(r -> r.setParam("q", "Best rule"), rule1);
+ verify(r -> r.setParam("q", "Best rule eve"), rule1);
+ verify(r -> r.setParam("q", "Best rule ever"), rule1);
+ verify(r -> r.setParam("q", "ru ev"), rule1);
+ verify(r -> r.setParam("q", "ru ever"), rule1);
+ verify(r -> r.setParam("q", "ev ve ver ru le"), rule1);
+ verify(r -> r.setParam("q", "other"), rule2);
+ }
+
+ @Test
+ public void filter_by_rule_name_requires_all_words_to_match() {
+ RuleDefinitionDto rule1 = db.rules().insert(r1 -> r1.setName("Best rule ever"));
+ RuleDefinitionDto rule2 = db.rules().insert(r1 -> r1.setName("Some other stuff"));
+ indexRules();
+
+ verify(r -> r.setParam("q", "Best other"), new RuleDefinitionDto[0]);
+ verify(r -> r.setParam("q", "Best rule"), rule1);
+ verify(r -> r.setParam("q", "rule ever"), rule1);
+ }
+
+ @Test
+ public void filter_by_rule_name_does_not_interpret_query() {
+ RuleDefinitionDto rule1 = db.rules().insert(r1 -> r1.setName("Best rule for-ever"));
+ RuleDefinitionDto rule2 = db.rules().insert(r1 -> r1.setName("Some other stuff"));
+ indexRules();
+
+ // do not interpret "-" as a "not"
+ verify(r -> r.setParam("q", "-ever"), rule1);
+ }
+
+ @Test
+ public void filter_by_rule_description() {
+ RuleDefinitionDto rule1 = db.rules().insert(r1 -> r1.setDescription("This is the <bold>best</bold> rule now&for<b>ever</b>"));
+ RuleDefinitionDto rule2 = db.rules().insert(r1 -> r1.setName("Some other stuff"));
+ indexRules();
+
+ verify(r -> r.setParam("q", "Best "), rule1);
+ verify(r -> r.setParam("q", "bold"), new RuleDefinitionDto[0]);
+ verify(r -> r.setParam("q", "now&forever"), rule1);
+ }
+
+ @Test
+ public void filter_by_rule_name_or_descriptions_requires_all_words_to_match_anywhere() {
+ RuleDefinitionDto rule1 = db.rules().insert(r1 -> r1.setName("Best rule ever").setDescription("This is a good rule"));
+ RuleDefinitionDto rule2 = db.rules().insert(r1 -> r1.setName("Some other stuff").setDescription("Another thing"));
+ indexRules();
+
+ verify(r -> r.setParam("q", "Best good"), rule1);
+ verify(r -> r.setParam("q", "Best Another"), new RuleDefinitionDto[0]);
+ }
+
@Test
public void return_all_rule_fields_by_default() {
RuleDefinitionDto rule = createJavaRule();
.executeProtobuf(Rules.SearchResponse.class);
assertThat(response.getP()).isEqualTo(1);
- assertThat(response.getTotal()).isEqualTo(expectedRules.length);
- assertThat(response.getRulesCount()).isEqualTo(expectedRules.length);
RuleKey[] expectedRuleKeys = stream(expectedRules).map(RuleDefinitionDto::getKey).collect(MoreCollectors.toList()).toArray(new RuleKey[0]);
assertThat(response.getRulesList())
.extracting(r -> RuleKey.parse(r.getKey()))
.containsExactlyInAnyOrder(expectedRuleKeys);
+ assertThat(response.getTotal()).isEqualTo(expectedRules.length);
+ assertThat(response.getRulesCount()).isEqualTo(expectedRules.length);
}
private void indexRules() {