You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

common.go 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. package enry
  2. import (
  3. "bufio"
  4. "bytes"
  5. "path/filepath"
  6. "strings"
  7. "github.com/go-enry/go-enry/v2/data"
  8. "github.com/go-enry/go-enry/v2/regex"
  9. )
  10. // OtherLanguage is used as a zero value when a function can not return a specific language.
  11. const OtherLanguage = ""
  12. // Strategy type fix the signature for the functions that can be used as a strategy.
  13. type Strategy func(filename string, content []byte, candidates []string) (languages []string)
  14. // DefaultStrategies is a sequence of strategies used by GetLanguage to detect languages.
  15. var DefaultStrategies = []Strategy{
  16. GetLanguagesByModeline,
  17. GetLanguagesByFilename,
  18. GetLanguagesByShebang,
  19. GetLanguagesByExtension,
  20. GetLanguagesByContent,
  21. GetLanguagesByClassifier,
  22. }
  23. // defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
  24. var defaultClassifier classifier = &naiveBayes{
  25. languagesLogProbabilities: data.LanguagesLogProbabilities,
  26. tokensLogProbabilities: data.TokensLogProbabilities,
  27. tokensTotal: data.TokensTotal,
  28. }
  29. // GetLanguage applies a sequence of strategies based on the given filename and content
  30. // to find out the most probably language to return.
  31. func GetLanguage(filename string, content []byte) (language string) {
  32. languages := GetLanguages(filename, content)
  33. return firstLanguage(languages)
  34. }
  35. func firstLanguage(languages []string) string {
  36. for _, l := range languages {
  37. if l != "" {
  38. return l
  39. }
  40. }
  41. return OtherLanguage
  42. }
  43. // GetLanguageByModeline returns detected language. If there are more than one possibles languages
  44. // it returns the first language by alphabetically order and safe to false.
  45. func GetLanguageByModeline(content []byte) (language string, safe bool) {
  46. return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
  47. }
  48. // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
  49. // it returns the first language by alphabetically order and safe to false.
  50. func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
  51. return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
  52. }
  53. // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
  54. // it returns the first language by alphabetically order and safe to false.
  55. func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
  56. return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
  57. }
  58. // GetLanguageByFilename returns detected language. If there are more than one possibles languages
  59. // it returns the first language by alphabetically order and safe to false.
  60. func GetLanguageByFilename(filename string) (language string, safe bool) {
  61. return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
  62. }
  63. // GetLanguageByShebang returns detected language. If there are more than one possibles languages
  64. // it returns the first language by alphabetically order and safe to false.
  65. func GetLanguageByShebang(content []byte) (language string, safe bool) {
  66. return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
  67. }
  68. // GetLanguageByExtension returns detected language. If there are more than one possibles languages
  69. // it returns the first language by alphabetically order and safe to false.
  70. func GetLanguageByExtension(filename string) (language string, safe bool) {
  71. return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
  72. }
  73. // GetLanguageByContent returns detected language. If there are more than one possibles languages
  74. // it returns the first language by alphabetically order and safe to false.
  75. func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
  76. return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
  77. }
  78. // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
  79. // defaultClassifier, if no candidates are provided it returns OtherLanguage.
  80. func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
  81. return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
  82. }
  83. func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
  84. languages := strategy(filename, content, candidates)
  85. return getFirstLanguageAndSafe(languages)
  86. }
  87. func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
  88. language = firstLanguage(languages)
  89. safe = len(languages) == 1
  90. return
  91. }
  92. // getLanguageBySpecificClassifier returns the most probably language for the given content using
  93. // classifier to detect language.
  94. func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
  95. languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
  96. return getFirstLanguageAndSafe(languages)
  97. }
  98. // GetLanguages applies a sequence of strategies based on the given filename and content
  99. // to find out the most probably languages to return.
  100. // At least one of arguments should be set. If content is missing, language detection will be based on the filename.
  101. // The function won't read the file, given an empty content.
  102. func GetLanguages(filename string, content []byte) []string {
  103. if IsBinary(content) {
  104. return nil
  105. }
  106. var languages []string
  107. candidates := []string{}
  108. for _, strategy := range DefaultStrategies {
  109. languages = strategy(filename, content, candidates)
  110. if len(languages) == 1 {
  111. return languages
  112. }
  113. if len(languages) > 0 {
  114. candidates = append(candidates, languages...)
  115. }
  116. }
  117. return languages
  118. }
  119. // GetLanguagesByModeline returns a slice of possible languages for the given content.
  120. // It complies with the signature to be a Strategy type.
  121. func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
  122. headFoot := getHeaderAndFooter(content)
  123. var languages []string
  124. for _, getLang := range modelinesFunc {
  125. languages = getLang("", headFoot, candidates)
  126. if len(languages) > 0 {
  127. break
  128. }
  129. }
  130. return languages
  131. }
  132. var modelinesFunc = []Strategy{
  133. GetLanguagesByEmacsModeline,
  134. GetLanguagesByVimModeline,
  135. }
  136. func getHeaderAndFooter(content []byte) []byte {
  137. const searchScope = 5
  138. if len(content) == 0 {
  139. return content
  140. }
  141. if bytes.Count(content, []byte("\n")) < 2*searchScope {
  142. return content
  143. }
  144. header := headScope(content, searchScope)
  145. footer := footScope(content, searchScope)
  146. headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
  147. headerAndFooter = append(headerAndFooter, content[:header]...)
  148. headerAndFooter = append(headerAndFooter, content[footer:]...)
  149. return headerAndFooter
  150. }
  151. func headScope(content []byte, scope int) (index int) {
  152. for i := 0; i < scope; i++ {
  153. eol := bytes.IndexAny(content, "\n")
  154. content = content[eol+1:]
  155. index += eol
  156. }
  157. return index + scope - 1
  158. }
  159. func footScope(content []byte, scope int) (index int) {
  160. for i := 0; i < scope; i++ {
  161. index = bytes.LastIndexAny(content, "\n")
  162. content = content[:index]
  163. }
  164. return index + 1
  165. }
  166. var (
  167. reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
  168. reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
  169. reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
  170. reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
  171. )
  172. // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
  173. // It complies with the signature to be a Strategy type.
  174. func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
  175. matched := reEmacsModeline.FindAllSubmatch(content, -1)
  176. if matched == nil {
  177. return nil
  178. }
  179. // only take the last matched line, discard previous lines
  180. lastLineMatched := matched[len(matched)-1][1]
  181. matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
  182. var alias string
  183. if matchedAlias != nil {
  184. alias = string(matchedAlias[1])
  185. } else {
  186. alias = string(lastLineMatched)
  187. }
  188. language, ok := GetLanguageByAlias(alias)
  189. if !ok {
  190. return nil
  191. }
  192. return []string{language}
  193. }
  194. // GetLanguagesByVimModeline returns a slice of possible languages for the given content.
  195. // It complies with the signature to be a Strategy type.
  196. func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
  197. matched := reVimModeline.FindAllSubmatch(content, -1)
  198. if matched == nil {
  199. return nil
  200. }
  201. // only take the last matched line, discard previous lines
  202. lastLineMatched := matched[len(matched)-1][1]
  203. matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
  204. if matchedAlias == nil {
  205. return nil
  206. }
  207. alias := string(matchedAlias[0][1])
  208. if len(matchedAlias) > 1 {
  209. // cases:
  210. // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
  211. // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
  212. for _, match := range matchedAlias {
  213. otherAlias := string(match[1])
  214. if otherAlias != alias {
  215. return nil
  216. }
  217. }
  218. }
  219. language, ok := GetLanguageByAlias(alias)
  220. if !ok {
  221. return nil
  222. }
  223. return []string{language}
  224. }
  225. // GetLanguagesByFilename returns a slice of possible languages for the given filename.
  226. // It complies with the signature to be a Strategy type.
  227. func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
  228. if filename == "" {
  229. return nil
  230. }
  231. return data.LanguagesByFilename[filepath.Base(filename)]
  232. }
  233. // GetLanguagesByShebang returns a slice of possible languages for the given content.
  234. // It complies with the signature to be a Strategy type.
  235. func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
  236. interpreter := getInterpreter(content)
  237. return data.LanguagesByInterpreter[interpreter]
  238. }
  239. var (
  240. shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
  241. pythonVersion = regex.MustCompile(`python\d\.\d+`)
  242. )
  243. func getInterpreter(data []byte) (interpreter string) {
  244. line := getFirstLine(data)
  245. if !hasShebang(line) {
  246. return ""
  247. }
  248. // skip shebang
  249. line = bytes.TrimSpace(line[2:])
  250. splitted := bytes.Fields(line)
  251. if len(splitted) == 0 {
  252. return ""
  253. }
  254. if bytes.Contains(splitted[0], []byte("env")) {
  255. if len(splitted) > 1 {
  256. interpreter = string(splitted[1])
  257. }
  258. } else {
  259. splittedPath := bytes.Split(splitted[0], []byte{'/'})
  260. interpreter = string(splittedPath[len(splittedPath)-1])
  261. }
  262. if interpreter == "sh" {
  263. interpreter = lookForMultilineExec(data)
  264. }
  265. if pythonVersion.MatchString(interpreter) {
  266. interpreter = interpreter[:strings.Index(interpreter, `.`)]
  267. }
  268. // If osascript is called with argument -l it could be different language so do not relay on it
  269. // To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
  270. if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) {
  271. interpreter = ""
  272. }
  273. return
  274. }
  275. func getFirstLine(content []byte) []byte {
  276. nlpos := bytes.IndexByte(content, '\n')
  277. if nlpos < 0 {
  278. return content
  279. }
  280. return content[:nlpos]
  281. }
  282. func hasShebang(line []byte) bool {
  283. const shebang = `#!`
  284. prefix := []byte(shebang)
  285. return bytes.HasPrefix(line, prefix)
  286. }
  287. func lookForMultilineExec(data []byte) string {
  288. const magicNumOfLines = 5
  289. interpreter := "sh"
  290. buf := bufio.NewScanner(bytes.NewReader(data))
  291. for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
  292. line := buf.Bytes()
  293. if shebangExecHack.Match(line) {
  294. interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
  295. break
  296. }
  297. }
  298. if err := buf.Err(); err != nil {
  299. return interpreter
  300. }
  301. return interpreter
  302. }
  303. // GetLanguagesByExtension returns a slice of possible languages for the given filename.
  304. // It complies with the signature to be a Strategy type.
  305. func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
  306. if !strings.Contains(filename, ".") {
  307. return nil
  308. }
  309. filename = strings.ToLower(filename)
  310. dots := getDotIndexes(filename)
  311. for _, dot := range dots {
  312. ext := filename[dot:]
  313. languages, ok := data.LanguagesByExtension[ext]
  314. if ok {
  315. return languages
  316. }
  317. }
  318. return nil
  319. }
  320. func getDotIndexes(filename string) []int {
  321. dots := make([]int, 0, 2)
  322. for i, letter := range filename {
  323. if letter == rune('.') {
  324. dots = append(dots, i)
  325. }
  326. }
  327. return dots
  328. }
  329. // GetLanguagesByContent returns a slice of languages for the given content.
  330. // It is a Strategy that uses content-based regexp heuristics and a filename extension.
  331. func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
  332. if filename == "" {
  333. return nil
  334. }
  335. ext := strings.ToLower(filepath.Ext(filename))
  336. heuristic, ok := data.ContentHeuristics[ext]
  337. if !ok {
  338. return nil
  339. }
  340. return heuristic.Match(content)
  341. }
  342. // GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
  343. // decreasing language's probability. If there are not candidates it returns nil.
  344. // It is a Strategy that uses a pre-trained defaultClassifier.
  345. func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
  346. if len(candidates) == 0 {
  347. return nil
  348. }
  349. return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
  350. }
  351. // getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
  352. func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
  353. mapCandidates := make(map[string]float64)
  354. for _, candidate := range candidates {
  355. mapCandidates[candidate]++
  356. }
  357. return classifier.classify(content, mapCandidates)
  358. }
  359. // GetLanguageExtensions returns all extensions associated with the given language.
  360. func GetLanguageExtensions(language string) []string {
  361. return data.ExtensionsByLanguage[language]
  362. }
  363. // Type represent language's type. Either data, programming, markup, prose, or unknown.
  364. type Type int
  365. // Type's values.
  366. const (
  367. Unknown Type = iota
  368. Data
  369. Programming
  370. Markup
  371. Prose
  372. )
  373. // GetLanguageType returns the type of the given language.
  374. func GetLanguageType(language string) (langType Type) {
  375. intType, ok := data.LanguagesType[language]
  376. langType = Type(intType)
  377. if !ok {
  378. langType = Unknown
  379. }
  380. return langType
  381. }
  382. // GetLanguageByAlias returns either the language related to the given alias and ok set to true
  383. // or Otherlanguage and ok set to false if the alias is not recognized.
  384. func GetLanguageByAlias(alias string) (lang string, ok bool) {
  385. lang, ok = data.LanguageByAlias(alias)
  386. if !ok {
  387. lang = OtherLanguage
  388. }
  389. return
  390. }
  391. // GetLanguageGroup returns language group or empty string if language does not have group.
  392. func GetLanguageGroup(language string) string {
  393. if group, ok := data.LanguagesGroup[language]; ok {
  394. return group
  395. }
  396. return ""
  397. }