You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

JavaTokenizer.java 1.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. /*
  2. * SonarQube
  3. * Copyright (C) 2009-2023 SonarSource SA
  4. * mailto:info AT sonarsource DOT com
  5. *
  6. * This program is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 3 of the License, or (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public License
  17. * along with this program; if not, write to the Free Software Foundation,
  18. * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. package org.sonar.server.es.textsearch;
  21. import java.util.Arrays;
  22. import java.util.List;
  23. import org.apache.commons.lang.StringUtils;
  24. import org.sonar.server.es.newindex.DefaultIndexSettings;
  25. import static org.sonar.server.es.newindex.DefaultIndexSettings.MINIMUM_NGRAM_LENGTH;
  26. /**
  27. * Splits text queries into their tokens, so that they can be used in n_gram match queries later.
  28. */
  29. public class JavaTokenizer {
  30. private JavaTokenizer() {
  31. // use static methods
  32. }
  33. public static List<String> split(String queryText) {
  34. return Arrays.stream(
  35. queryText.split(DefaultIndexSettings.SEARCH_TERM_TOKENIZER_PATTERN))
  36. .filter(StringUtils::isNotEmpty)
  37. .filter(s -> s.length() >= MINIMUM_NGRAM_LENGTH)
  38. .toList();
  39. }
  40. }