123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480 |
- package enry
-
- import (
- "bufio"
- "bytes"
- "path/filepath"
- "strings"
-
- "github.com/go-enry/go-enry/v2/data"
- "github.com/go-enry/go-enry/v2/regex"
- )
-
// OtherLanguage is the empty language name. Detection helpers in this file
// return it when they cannot settle on a specific language.
const OtherLanguage = ""
-
// Strategy is the common signature of every language-detection function that
// can be plugged into the detection pipeline. It receives the file name, the
// file content and the candidate languages collected so far, and returns the
// languages it considers possible.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
-
- // DefaultStrategies is a sequence of strategies used by GetLanguage to detect languages.
- var DefaultStrategies = []Strategy{
- GetLanguagesByModeline,
- GetLanguagesByFilename,
- GetLanguagesByShebang,
- GetLanguagesByExtension,
- GetLanguagesByContent,
- GetLanguagesByClassifier,
- }
-
- // defaultClassifier is a Naive Bayes classifier trained on Linguist samples.
- var defaultClassifier classifier = &naiveBayes{
- languagesLogProbabilities: data.LanguagesLogProbabilities,
- tokensLogProbabilities: data.TokensLogProbabilities,
- tokensTotal: data.TokensTotal,
- }
-
- // GetLanguage applies a sequence of strategies based on the given filename and content
- // to find out the most probably language to return.
- func GetLanguage(filename string, content []byte) (language string) {
- languages := GetLanguages(filename, content)
- return firstLanguage(languages)
- }
-
- func firstLanguage(languages []string) string {
- for _, l := range languages {
- if l != "" {
- return l
- }
- }
- return OtherLanguage
- }
-
- // GetLanguageByModeline returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByModeline(content []byte) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
- }
-
- // GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
- }
-
- // GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
- }
-
- // GetLanguageByFilename returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByFilename(filename string) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
- }
-
- // GetLanguageByShebang returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByShebang(content []byte) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
- }
-
- // GetLanguageByExtension returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByExtension(filename string) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
- }
-
- // GetLanguageByContent returns detected language. If there are more than one possibles languages
- // it returns the first language by alphabetically order and safe to false.
- func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
- }
-
- // GetLanguageByClassifier returns the most probably language detected for the given content. It uses
- // defaultClassifier, if no candidates are provided it returns OtherLanguage.
- func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
- return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
- }
-
- func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
- languages := strategy(filename, content, candidates)
- return getFirstLanguageAndSafe(languages)
- }
-
- func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
- language = firstLanguage(languages)
- safe = len(languages) == 1
- return
- }
-
- // getLanguageBySpecificClassifier returns the most probably language for the given content using
- // classifier to detect language.
- func getLanguageBySpecificClassifier(content []byte, candidates []string, classifier classifier) (language string, safe bool) {
- languages := getLanguagesBySpecificClassifier(content, candidates, classifier)
- return getFirstLanguageAndSafe(languages)
- }
-
- // GetLanguages applies a sequence of strategies based on the given filename and content
- // to find out the most probably languages to return.
- // At least one of arguments should be set. If content is missing, language detection will be based on the filename.
- // The function won't read the file, given an empty content.
- func GetLanguages(filename string, content []byte) []string {
- if IsBinary(content) {
- return nil
- }
-
- var languages []string
- candidates := []string{}
- for _, strategy := range DefaultStrategies {
- languages = strategy(filename, content, candidates)
- if len(languages) == 1 {
- return languages
- }
-
- if len(languages) > 0 {
- candidates = append(candidates, languages...)
- }
- }
-
- return languages
- }
-
- // GetLanguagesByModeline returns a slice of possible languages for the given content.
- // It complies with the signature to be a Strategy type.
- func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
- headFoot := getHeaderAndFooter(content)
- var languages []string
- for _, getLang := range modelinesFunc {
- languages = getLang("", headFoot, candidates)
- if len(languages) > 0 {
- break
- }
- }
-
- return languages
- }
-
- var modelinesFunc = []Strategy{
- GetLanguagesByEmacsModeline,
- GetLanguagesByVimModeline,
- }
-
- func getHeaderAndFooter(content []byte) []byte {
- const searchScope = 5
-
- if len(content) == 0 {
- return content
- }
-
- if bytes.Count(content, []byte("\n")) < 2*searchScope {
- return content
- }
-
- header := headScope(content, searchScope)
- footer := footScope(content, searchScope)
- headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
- headerAndFooter = append(headerAndFooter, content[:header]...)
- headerAndFooter = append(headerAndFooter, content[footer:]...)
- return headerAndFooter
- }
-
// headScope returns the byte offset of the scope-th newline in content, so
// that content[:index] holds the first scope lines without the trailing
// newline of the last one.
//
// If content has fewer than scope newlines, len(content) is returned (the whole
// content is the header). A non-positive scope yields 0 (empty header).
func headScope(content []byte, scope int) (index int) {
	if scope <= 0 {
		return 0
	}

	// offset is the start of the line currently being scanned.
	offset := 0
	for i := 0; i < scope; i++ {
		eol := bytes.IndexByte(content[offset:], '\n')
		if eol < 0 {
			return len(content)
		}
		offset += eol + 1
	}

	// offset is one past the scope-th newline; step back onto it.
	return offset - 1
}
-
// footScope walks back over scope newlines from the end of content and returns
// the offset just after the last one visited, so that content[index:] is the
// footer.
//
// If content has fewer than scope newlines, 0 is returned (the whole content is
// the footer) instead of slicing with a negative index. A non-positive scope
// yields len(content) (empty footer).
func footScope(content []byte, scope int) (index int) {
	if scope <= 0 {
		return len(content)
	}

	for i := 0; i < scope; i++ {
		index = bytes.LastIndexByte(content, '\n')
		if index < 0 {
			return 0
		}
		// Drop the newline just found and everything after it.
		content = content[:index]
	}

	return index + 1
}
-
- var (
- reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
- reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
- reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
- reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
- )
-
- // GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
- // It complies with the signature to be a Strategy type.
- func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
- matched := reEmacsModeline.FindAllSubmatch(content, -1)
- if matched == nil {
- return nil
- }
-
- // only take the last matched line, discard previous lines
- lastLineMatched := matched[len(matched)-1][1]
- matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
- var alias string
- if matchedAlias != nil {
- alias = string(matchedAlias[1])
- } else {
- alias = string(lastLineMatched)
- }
-
- language, ok := GetLanguageByAlias(alias)
- if !ok {
- return nil
- }
-
- return []string{language}
- }
-
- // GetLanguagesByVimModeline returns a slice of possible languages for the given content.
- // It complies with the signature to be a Strategy type.
- func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
- matched := reVimModeline.FindAllSubmatch(content, -1)
- if matched == nil {
- return nil
- }
-
- // only take the last matched line, discard previous lines
- lastLineMatched := matched[len(matched)-1][1]
- matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
- if matchedAlias == nil {
- return nil
- }
-
- alias := string(matchedAlias[0][1])
- if len(matchedAlias) > 1 {
- // cases:
- // matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
- // matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
- for _, match := range matchedAlias {
- otherAlias := string(match[1])
- if otherAlias != alias {
- return nil
- }
- }
- }
-
- language, ok := GetLanguageByAlias(alias)
- if !ok {
- return nil
- }
-
- return []string{language}
- }
-
- // GetLanguagesByFilename returns a slice of possible languages for the given filename.
- // It complies with the signature to be a Strategy type.
- func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
- if filename == "" {
- return nil
- }
-
- return data.LanguagesByFilename[filepath.Base(filename)]
- }
-
- // GetLanguagesByShebang returns a slice of possible languages for the given content.
- // It complies with the signature to be a Strategy type.
- func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
- interpreter := getInterpreter(content)
- return data.LanguagesByInterpreter[interpreter]
- }
-
- var (
- shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
- pythonVersion = regex.MustCompile(`python\d\.\d+`)
- )
-
- func getInterpreter(data []byte) (interpreter string) {
- line := getFirstLine(data)
- if !hasShebang(line) {
- return ""
- }
-
- // skip shebang
- line = bytes.TrimSpace(line[2:])
- splitted := bytes.Fields(line)
- if len(splitted) == 0 {
- return ""
- }
-
- if bytes.Contains(splitted[0], []byte("env")) {
- if len(splitted) > 1 {
- interpreter = string(splitted[1])
- }
- } else {
- splittedPath := bytes.Split(splitted[0], []byte{'/'})
- interpreter = string(splittedPath[len(splittedPath)-1])
- }
-
- if interpreter == "sh" {
- interpreter = lookForMultilineExec(data)
- }
-
- if pythonVersion.MatchString(interpreter) {
- interpreter = interpreter[:strings.Index(interpreter, `.`)]
- }
-
- // If osascript is called with argument -l it could be different language so do not relay on it
- // To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
- if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) {
- interpreter = ""
- }
-
- return
- }
-
// getFirstLine returns content up to, but not including, its first newline.
// Content without a newline is returned unchanged.
func getFirstLine(content []byte) []byte {
	if end := bytes.IndexByte(content, '\n'); end >= 0 {
		return content[:end]
	}

	return content
}
-
// hasShebang reports whether line starts with the "#!" shebang marker.
func hasShebang(line []byte) bool {
	return bytes.HasPrefix(line, []byte(`#!`))
}
-
- func lookForMultilineExec(data []byte) string {
- const magicNumOfLines = 5
- interpreter := "sh"
-
- buf := bufio.NewScanner(bytes.NewReader(data))
- for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
- line := buf.Bytes()
- if shebangExecHack.Match(line) {
- interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
- break
- }
- }
-
- if err := buf.Err(); err != nil {
- return interpreter
- }
-
- return interpreter
- }
-
- // GetLanguagesByExtension returns a slice of possible languages for the given filename.
- // It complies with the signature to be a Strategy type.
- func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
- if !strings.Contains(filename, ".") {
- return nil
- }
-
- filename = strings.ToLower(filename)
- dots := getDotIndexes(filename)
- for _, dot := range dots {
- ext := filename[dot:]
- languages, ok := data.LanguagesByExtension[ext]
- if ok {
- return languages
- }
- }
-
- return nil
- }
-
// getDotIndexes returns the byte offsets of every '.' in filename, in
// ascending order. The result is empty (not nil) when there is no dot.
func getDotIndexes(filename string) []int {
	positions := make([]int, 0, 2)
	// '.' is ASCII, so a byte-wise scan finds exactly the same offsets as a
	// rune-wise one.
	for i := 0; i < len(filename); i++ {
		if filename[i] != '.' {
			continue
		}
		positions = append(positions, i)
	}

	return positions
}
-
- // GetLanguagesByContent returns a slice of languages for the given content.
- // It is a Strategy that uses content-based regexp heuristics and a filename extension.
- func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
- if filename == "" {
- return nil
- }
-
- ext := strings.ToLower(filepath.Ext(filename))
-
- heuristic, ok := data.ContentHeuristics[ext]
- if !ok {
- return nil
- }
-
- return heuristic.Match(content)
- }
-
- // GetLanguagesByClassifier returns a sorted slice of possible languages ordered by
- // decreasing language's probability. If there are not candidates it returns nil.
- // It is a Strategy that uses a pre-trained defaultClassifier.
- func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
- if len(candidates) == 0 {
- return nil
- }
-
- return getLanguagesBySpecificClassifier(content, candidates, defaultClassifier)
- }
-
- // getLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
- func getLanguagesBySpecificClassifier(content []byte, candidates []string, classifier classifier) (languages []string) {
- mapCandidates := make(map[string]float64)
- for _, candidate := range candidates {
- mapCandidates[candidate]++
- }
-
- return classifier.classify(content, mapCandidates)
- }
-
- // GetLanguageExtensions returns all extensions associated with the given language.
- func GetLanguageExtensions(language string) []string {
- return data.ExtensionsByLanguage[language]
- }
-
// Type represents the kind of a language: data, programming, markup, prose,
// or unknown.
type Type int

// Values a Type can take.
const (
	// Unknown is the zero value, used when the type of a language is not known.
	Unknown Type = iota
	// Data marks data languages.
	Data
	// Programming marks programming languages.
	Programming
	// Markup marks markup languages.
	Markup
	// Prose marks prose languages.
	Prose
)
-
- // GetLanguageType returns the type of the given language.
- func GetLanguageType(language string) (langType Type) {
- intType, ok := data.LanguagesType[language]
- langType = Type(intType)
- if !ok {
- langType = Unknown
- }
- return langType
- }
-
- // GetLanguageByAlias returns either the language related to the given alias and ok set to true
- // or Otherlanguage and ok set to false if the alias is not recognized.
- func GetLanguageByAlias(alias string) (lang string, ok bool) {
- lang, ok = data.LanguageByAlias(alias)
- if !ok {
- lang = OtherLanguage
- }
-
- return
- }
-
- // GetLanguageGroup returns language group or empty string if language does not have group.
- func GetLanguageGroup(language string) string {
- if group, ok := data.LanguagesGroup[language]; ok {
- return group
- }
-
- return ""
- }
|