You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

JavaTokenProducer.java 3.6KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. /*
  2. * SonarQube
  3. * Copyright (C) 2009-2021 SonarSource SA
  4. * mailto:info AT sonarsource DOT com
  5. *
  6. * This program is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 3 of the License, or (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public License
  17. * along with this program; if not, write to the Free Software Foundation,
  18. * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. package org.sonar.duplications.java;
  21. import org.sonar.duplications.token.TokenChunker;
  22. /**
  23. * See <a href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html">The Java Language Specification, Third Edition: Lexical Structure</a>
  24. * and <a href="http://www.jcp.org/en/jsr/detail?id=334">JSR334 (Java 7 - binary integral literals and underscores in numeric literals)</a>.
  25. *
  26. * <p>
  27. * We decided to use dollar sign as a prefix for normalization, even if it can be a part of an identifier,
  28. * because according to Java Language Specification it supposed to be used only in mechanically generated source code.
  29. * Thus probability to find it within a normal code should be low.
  30. * </p>
  31. */
  32. public final class JavaTokenProducer {
  33. private static final String NORMALIZED_CHARACTER_LITERAL = "$CHARS";
  34. private static final String NORMALIZED_NUMERIC_LITERAL = "$NUMBER";
  35. private static final String EXP = "([Ee][+-]?+[0-9_]++)";
  36. private static final String BINARY_EXP = "([Pp][+-]?+[0-9_]++)";
  37. private static final String FLOAT_SUFFIX = "[fFdD]";
  38. private static final String INT_SUFFIX = "[lL]";
  39. private JavaTokenProducer() {
  40. }
  41. public static TokenChunker build() {
  42. return TokenChunker.builder()
  43. // White Space
  44. .ignore("\\s")
  45. // Comments
  46. .ignore("//[^\\n\\r]*+")
  47. .ignore("/\\*[\\s\\S]*?\\*/")
  48. // String Literals
  49. .token("\"([^\"\\\\]*+(\\\\[\\s\\S])?+)*+\"", NORMALIZED_CHARACTER_LITERAL)
  50. // Character Literals
  51. .token("'([^'\\n\\\\]*+(\\\\.)?+)*+'", NORMALIZED_CHARACTER_LITERAL)
  52. // Identifiers, Keywords, Boolean Literals, The Null Literal
  53. .token("\\p{javaJavaIdentifierStart}++\\p{javaJavaIdentifierPart}*+")
  54. // Floating-Point Literals
  55. // Decimal
  56. .token("[0-9_]++\\.([0-9_]++)?+" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  57. // Decimal
  58. .token("\\.[0-9_]++" + EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  59. // Decimal
  60. .token("[0-9_]++" + EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  61. // Hexadecimal
  62. .token("0[xX][0-9a-fA-F_]++\\.[0-9a-fA-F_]*+" + BINARY_EXP + "?+" + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  63. // Hexadecimal
  64. .token("0[xX][0-9a-fA-F_]++" + BINARY_EXP + FLOAT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  65. // Integer Literals
  66. // Hexadecimal
  67. .token("0[xX][0-9a-fA-F_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  68. // Binary (Java 7)
  69. .token("0[bB][01_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  70. // Decimal and Octal
  71. .token("[0-9_]++" + INT_SUFFIX + "?+", NORMALIZED_NUMERIC_LITERAL)
  72. // Any other character
  73. .token(".")
  74. .build();
  75. }
  76. }