* Update go-enry to v2.5.2tags/v1.13.0-rc1
github.com/facebookgo/subset v0.0.0-20150612182917-8dac2c3c4870 // indirect | github.com/facebookgo/subset v0.0.0-20150612182917-8dac2c3c4870 // indirect | ||||
github.com/gliderlabs/ssh v0.2.2 | github.com/gliderlabs/ssh v0.2.2 | ||||
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a // indirect | github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a // indirect | ||||
github.com/go-enry/go-enry/v2 v2.3.0 | |||||
github.com/go-enry/go-enry/v2 v2.5.2 | |||||
github.com/go-git/go-billy/v5 v5.0.0 | github.com/go-git/go-billy/v5 v5.0.0 | ||||
github.com/go-git/go-git/v5 v5.0.0 | github.com/go-git/go-git/v5 v5.0.0 | ||||
github.com/go-openapi/jsonreference v0.19.3 // indirect | github.com/go-openapi/jsonreference v0.19.3 // indirect |
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE= | github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE= | ||||
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8= | github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8= | ||||
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24= | github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24= | ||||
github.com/go-enry/go-enry/v2 v2.3.0 h1:o8KwgY6uSplysrIpj+Y42J/xGPp90ogVpxE2Z3s8Unk= | |||||
github.com/go-enry/go-enry/v2 v2.3.0/go.mod h1:+xFJwbqWi15bvqFHb2ELUWVRKFQtwB61+sDrkvvxxGI= | |||||
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs= | |||||
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= | |||||
github.com/go-enry/go-enry/v2 v2.5.2 h1:3f3PFAO6JitWkPi1GQ5/m6Xu4gNL1U5soJ8QaYqJ0YQ= | |||||
github.com/go-enry/go-enry/v2 v2.5.2/go.mod h1:GVzIiAytiS5uT/QiuakK7TF1u4xDab87Y8V5EJRpsIQ= | |||||
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo= | |||||
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= | |||||
github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4= | github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4= | ||||
github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E= | github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E= | ||||
github.com/go-git/go-billy/v5 v5.0.0 h1:7NQHvd9FVid8VL4qVUMm8XifBK+2xCoZ2lSk0agRrHM= | github.com/go-git/go-billy/v5 v5.0.0 h1:7NQHvd9FVid8VL4qVUMm8XifBK+2xCoZ2lSk0agRrHM= | ||||
github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ= | github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ= | ||||
github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= | github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= | ||||
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= | github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= | ||||
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk= | |||||
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM= | |||||
github.com/toqueteos/webbrowser v1.2.0 h1:tVP/gpK69Fx+qMJKsLE7TD8LuGWPnEV71wBN9rrstGQ= | github.com/toqueteos/webbrowser v1.2.0 h1:tVP/gpK69Fx+qMJKsLE7TD8LuGWPnEV71wBN9rrstGQ= | ||||
github.com/toqueteos/webbrowser v1.2.0/go.mod h1:XWoZq4cyp9WeUeak7w7LXRUQf1F1ATJMir8RTqb4ayM= | github.com/toqueteos/webbrowser v1.2.0/go.mod h1:XWoZq4cyp9WeUeak7w7LXRUQf1F1ATJMir8RTqb4ayM= | ||||
github.com/tstranex/u2f v1.0.0 h1:HhJkSzDDlVSVIVt7pDJwCHQj67k7A5EeBgPmeD+pVsQ= | github.com/tstranex/u2f v1.0.0 h1:HhJkSzDDlVSVIVt7pDJwCHQj67k7A5EeBgPmeD+pVsQ= | ||||
gopkg.in/testfixtures.v2 v2.5.0/go.mod h1:vyAq+MYCgNpR29qitQdLZhdbLFf4mR/2MFJRFoQZZ2M= | gopkg.in/testfixtures.v2 v2.5.0/go.mod h1:vyAq+MYCgNpR29qitQdLZhdbLFf4mR/2MFJRFoQZZ2M= | ||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= | ||||
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= | gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= | ||||
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE= | |||||
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew= | |||||
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= | gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= | ||||
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= | gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= | ||||
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= | gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= |
"github.com/go-enry/go-enry/v2" | "github.com/go-enry/go-enry/v2" | ||||
) | ) | ||||
// GetCodeLanguageWithCallback detects code language based on file name and content using callback | |||||
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string { | |||||
// GetCodeLanguage detects code language based on file name and content | |||||
func GetCodeLanguage(filename string, content []byte) string { | |||||
if language, ok := enry.GetLanguageByExtension(filename); ok { | if language, ok := enry.GetLanguageByExtension(filename); ok { | ||||
return language | return language | ||||
} | } | ||||
return language | return language | ||||
} | } | ||||
content, err := contentFunc() | |||||
if err != nil { | |||||
if len(content) == 0 { | |||||
return enry.OtherLanguage | return enry.OtherLanguage | ||||
} | } | ||||
return enry.GetLanguage(filepath.Base(filename), content) | return enry.GetLanguage(filepath.Base(filename), content) | ||||
} | } | ||||
// GetCodeLanguage detects code language based on file name and content | |||||
func GetCodeLanguage(filename string, content []byte) string { | |||||
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) { | |||||
return content, nil | |||||
}) | |||||
} |
return nil | return nil | ||||
} | } | ||||
// If content can not be read just do detection by filename | |||||
content, _ := readFile(f, fileSizeLimit) | |||||
if enry.IsGenerated(f.Name, content) { | |||||
return nil | |||||
} | |||||
// TODO: Use .gitattributes file for linguist overrides | // TODO: Use .gitattributes file for linguist overrides | ||||
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) { | |||||
return readFile(f, fileSizeLimit) | |||||
}) | |||||
language := analyze.GetCodeLanguage(f.Name, content) | |||||
if language == enry.OtherLanguage || language == "" { | if language == enry.OtherLanguage || language == "" { | ||||
return nil | return nil | ||||
} | } |
# go-enry [![GoDoc](https://godoc.org/github.com/go-enry/go-enry?status.svg)](https://pkg.go.dev/github.com/go-enry/go-enry/v2) [![Test](https://github.com/go-enry/go-enry/workflows/Test/badge.svg)](https://github.com/go-enry/go-enry/actions?query=workflow%3ATest+branch%3Amaster) [![codecov](https://codecov.io/gh/go-enry/go-enry/branch/master/graph/badge.svg)](https://codecov.io/gh/go-enry/go-enry) | # go-enry [![GoDoc](https://godoc.org/github.com/go-enry/go-enry?status.svg)](https://pkg.go.dev/github.com/go-enry/go-enry/v2) [![Test](https://github.com/go-enry/go-enry/workflows/Test/badge.svg)](https://github.com/go-enry/go-enry/actions?query=workflow%3ATest+branch%3Amaster) [![codecov](https://codecov.io/gh/go-enry/go-enry/branch/master/graph/badge.svg)](https://codecov.io/gh/go-enry/go-enry) | ||||
Programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*. | |||||
* [CLI](#cli) | |||||
* [Library](#library) | |||||
* [Use cases](#use-cases) | |||||
* [By filename](#by-filename) | |||||
* [By text](#by-text) | |||||
* [By file](#by-file) | |||||
* [Filtering](#filtering-vendoring-binaries-etc) | |||||
* [Coloring](#language-colors-and-groups) | |||||
* [Languages](#languages) | |||||
* [Go](#go) | |||||
* [Java bindings](#java-bindings) | |||||
* [Python bindings](#python-bindings) | |||||
* [Divergences from linguist](#divergences-from-linguist) | |||||
* [Benchmarks](#benchmarks) | |||||
* [Why Enry?](#why-enry) | |||||
* [Development](#development) | |||||
* [Sync with github/linguist upstream](#sync-with-githublinguist-upstream) | |||||
* [Misc](#misc) | |||||
* [License](#license) | |||||
Programming language detector and toolbox to ignore binary or vendored files. _enry_, started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved _2x performance_. | |||||
- [CLI](#cli) | |||||
- [Library](#library) | |||||
- [Use cases](#use-cases) | |||||
- [By filename](#by-filename) | |||||
- [By text](#by-text) | |||||
- [By file](#by-file) | |||||
- [Filtering](#filtering-vendoring-binaries-etc) | |||||
- [Coloring](#language-colors-and-groups) | |||||
- [Languages](#languages) | |||||
- [Go](#go) | |||||
- [Java bindings](#java-bindings) | |||||
- [Python bindings](#python-bindings) | |||||
- [Divergences from linguist](#divergences-from-linguist) | |||||
- [Benchmarks](#benchmarks) | |||||
- [Why Enry?](#why-enry) | |||||
- [Development](#development) | |||||
- [Sync with github/linguist upstream](#sync-with-githublinguist-upstream) | |||||
- [Misc](#misc) | |||||
- [License](#license) | |||||
# CLI | # CLI | ||||
# Library | # Library | ||||
*enry* is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments. | |||||
_enry_ is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments. | |||||
## Use cases | ## Use cases | ||||
*enry* guesses a programming language using a sequence of matching *strategies* that are | |||||
applied progressively to narrow down the possible options. Each *strategy* varies on the type | |||||
_enry_ guesses a programming language using a sequence of matching _strategies_ that are | |||||
applied progressively to narrow down the possible options. Each _strategy_ varies on the type | |||||
of input data that it needs to make a decision: file name, extension, the first line of the file, the full content of the file, etc. | of input data that it needs to make a decision: file name, extension, the first line of the file, the full content of the file, etc. | ||||
Depending on available input data, enry API can be roughly divided into the next categories or use cases. | Depending on available input data, enry API can be roughly divided into the next categories or use cases. | ||||
### By filename | ### By filename | ||||
Next functions require only a name of the file to make a guess: | Next functions require only a name of the file to make a guess: | ||||
- `GetLanguageByExtension` uses only file extension (wich may be ambiguous) | |||||
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc | |||||
- all [filtering helpers](#filtering) | |||||
Please note that such guesses are expected not to be very accurate. | |||||
- `GetLanguageByExtension` uses only file extension (wich may be ambiguous) | |||||
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc | |||||
- all [filtering helpers](#filtering) | |||||
Please note that such guesses are expected not to be very accurate. | |||||
### By text | ### By text | ||||
To make a guess only based on the content of the file or a text snippet, use | To make a guess only based on the content of the file or a text snippet, use | ||||
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](https://en.wikipedia.org/wiki/Shebang_(Unix)). | |||||
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text. | |||||
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist. | |||||
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy. | |||||
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](<https://en.wikipedia.org/wiki/Shebang_(Unix)>). | |||||
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text. | |||||
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist. | |||||
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy. | |||||
### By file | ### By file | ||||
The most accurate guess would be one when both, the file name and the content are available: | The most accurate guess would be one when both, the file name and the content are available: | ||||
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics. | |||||
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate. | |||||
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics. | |||||
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate. | |||||
### Filtering: vendoring, binaries, etc | ### Filtering: vendoring, binaries, etc | ||||
*enry* expose a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis: | |||||
- `IsBinary` | |||||
- `IsVendor` | |||||
- `IsConfiguration` | |||||
- `IsDocumentation` | |||||
- `IsDotFile` | |||||
- `IsImage` | |||||
_enry_ expose a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis: | |||||
- `IsBinary` | |||||
- `IsVendor` | |||||
- `IsConfiguration` | |||||
- `IsDocumentation` | |||||
- `IsDotFile` | |||||
- `IsImage` | |||||
- `IsTest` | |||||
- `IsGenerated` | |||||
### Language colors and groups | ### Language colors and groups | ||||
*enry* exposes function to get language color to use for example in presenting statistics in graphs: | |||||
- `GetColor` | |||||
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS` | |||||
_enry_ exposes function to get language color to use for example in presenting statistics in graphs: | |||||
- `GetColor` | |||||
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS` | |||||
## Languages | ## Languages | ||||
A library is going to be published on pypi as [enry](https://pypi.org/project/enry/) for | A library is going to be published on pypi as [enry](https://pypi.org/project/enry/) for | ||||
macOS and linux platforms. Windows support is planned under [src-d/enry#150](https://github.com/src-d/enry/issues/150). | macOS and linux platforms. Windows support is planned under [src-d/enry#150](https://github.com/src-d/enry/issues/150). | ||||
Divergences from Linguist | |||||
------------ | |||||
## Divergences from Linguist | |||||
The `enry` library is based on the data from `github/linguist` version **v7.9.0**. | The `enry` library is based on the data from `github/linguist` version **v7.9.0**. | ||||
Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from the Linguist: | Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from the Linguist: | ||||
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine. | |||||
- [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine. | |||||
* [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine. | |||||
- [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine. | |||||
* [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine. | |||||
- [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine. | |||||
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). | |||||
- As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). | |||||
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194). | |||||
- Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194). | |||||
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. | |||||
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213). | |||||
- Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. | |||||
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213). | |||||
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192). | |||||
- XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192). | |||||
* Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18). | |||||
- Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18). | |||||
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does | |||||
- `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does | |||||
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior. | In all the cases above that have an issue number - we plan to update enry to match Linguist behavior. | ||||
## Benchmarks | |||||
Benchmarks | |||||
------------ | |||||
Enry's language detection has been compared with Linguist's on [*linguist/samples*](https://github.com/github/linguist/tree/master/samples). | |||||
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples). | |||||
We got these results: | We got these results: | ||||
See [instructions](#misc) for running enry with oniguruma. | See [instructions](#misc) for running enry with oniguruma. | ||||
Why Enry? | |||||
------------ | |||||
## Why Enry? | |||||
In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/) is a linguist who at the very beginning of the movie enjoys guessing the origin of people based on their accent. | In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/) is a linguist who at the very beginning of the movie enjoys guessing the origin of people based on their accent. | ||||
Setting `ENRY_TEST_REPO` to the path to existing checkout of Linguist will avoid cloning it and sepeed tests up. | Setting `ENRY_TEST_REPO` to the path to existing checkout of Linguist will avoid cloning it and sepeed tests up. | ||||
Setting `ENRY_DEBUG=1` will provide insight in the Bayesian classifier building done by `make code-generate`. | Setting `ENRY_DEBUG=1` will provide insight in the Bayesian classifier building done by `make code-generate`. | ||||
### Sync with github/linguist upstream | ### Sync with github/linguist upstream | ||||
*enry* re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures. | |||||
_enry_ re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures. | |||||
In order to update to the latest release of linguist do: | In order to update to the latest release of linguist do: | ||||
```bash | ```bash | ||||
To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files: | To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files: | ||||
* [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml) | |||||
* [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml) | |||||
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) | |||||
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml) | |||||
- [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml) | |||||
- [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml) | |||||
- [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) | |||||
- [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml) | |||||
There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time. | There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time. | ||||
Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](#divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc). | Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](#divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc). | ||||
## Misc | ## Misc | ||||
<details> | <details> | ||||
### Benchmark | ### Benchmark | ||||
All benchmark scripts are in [*benchmarks*](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory. | |||||
All benchmark scripts are in [_benchmarks_](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory. | |||||
#### Dependencies | #### Dependencies | ||||
As benchmarks depend on Ruby and Github-Linguist gem make sure you have: | As benchmarks depend on Ruby and Github-Linguist gem make sure you have: | ||||
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed | |||||
- Docker | |||||
- [native dependencies](https://github.com/github/linguist/#dependencies) installed | |||||
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -` | |||||
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem` | |||||
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed | |||||
- Docker | |||||
- [native dependencies](https://github.com/github/linguist/#dependencies) installed | |||||
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -` | |||||
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem` | |||||
#### Quick benchmark | #### Quick benchmark | ||||
To run quicker benchmarks | To run quicker benchmarks | ||||
make benchmarks | make benchmarks | ||||
make benchmarks-samples | make benchmarks-samples | ||||
#### Full benchmark | #### Full benchmark | ||||
If you want to reproduce the same benchmarks as reported above: | If you want to reproduce the same benchmarks as reported above: | ||||
- Make sure all [dependencies](#benchmark-dependencies) are installed | |||||
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram) | |||||
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h) | |||||
- Make sure all [dependencies](#benchmark-dependencies) are installed | |||||
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram) | |||||
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h) | |||||
It will run the benchmarks for enry and Linguist, parse the output, create csv files and plot the histogram. | It will run the benchmarks for enry and Linguist, parse the output, create csv files and plot the histogram. | ||||
### Faster regexp engine (optional) | ### Faster regexp engine (optional) | ||||
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine. | [Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine. | ||||
It is very fast and performs better than the one built into Go runtime. *enry* supports swapping | |||||
It is very fast and performs better than the one built into Go runtime. _enry_ supports swapping | |||||
between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project. | between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project. | ||||
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library. | The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library. | ||||
On macOS with [Homebrew](https://brew.sh/), it is: | On macOS with [Homebrew](https://brew.sh/), it is: | ||||
</details> | </details> | ||||
License | |||||
------------ | |||||
## License | |||||
Apache License, Version 2.0. See [LICENSE](LICENSE) | Apache License, Version 2.0. See [LICENSE](LICENSE) |
return | return | ||||
} | } | ||||
func getFirstLine(data []byte) []byte { | |||||
buf := bufio.NewScanner(bytes.NewReader(data)) | |||||
buf.Scan() | |||||
line := buf.Bytes() | |||||
if err := buf.Err(); err != nil { | |||||
return nil | |||||
func getFirstLine(content []byte) []byte { | |||||
nlpos := bytes.IndexByte(content, '\n') | |||||
if nlpos < 0 { | |||||
return content | |||||
} | } | ||||
return line | |||||
return content[:nlpos] | |||||
} | } | ||||
func hasShebang(line []byte) bool { | func hasShebang(line []byte) bool { |
package data | package data | ||||
import "gopkg.in/toqueteos/substring.v1" | |||||
import "github.com/go-enry/go-enry/v2/regex" | |||||
var DocumentationMatchers = substring.Or( | |||||
substring.Regexp(`^[Dd]ocs?/`), | |||||
substring.Regexp(`(^|/)[Dd]ocumentation/`), | |||||
substring.Regexp(`(^|/)[Gg]roovydoc/`), | |||||
substring.Regexp(`(^|/)[Jj]avadoc/`), | |||||
substring.Regexp(`^[Mm]an/`), | |||||
substring.Regexp(`^[Ee]xamples/`), | |||||
substring.Regexp(`^[Dd]emos?/`), | |||||
substring.Regexp(`(^|/)inst/doc/`), | |||||
substring.Regexp(`(^|/)CHANGE(S|LOG)?(\.|$)`), | |||||
substring.Regexp(`(^|/)CONTRIBUTING(\.|$)`), | |||||
substring.Regexp(`(^|/)COPYING(\.|$)`), | |||||
substring.Regexp(`(^|/)INSTALL(\.|$)`), | |||||
substring.Regexp(`(^|/)LICEN[CS]E(\.|$)`), | |||||
substring.Regexp(`(^|/)[Ll]icen[cs]e(\.|$)`), | |||||
substring.Regexp(`(^|/)README(\.|$)`), | |||||
substring.Regexp(`(^|/)[Rr]eadme(\.|$)`), | |||||
substring.Regexp(`^[Ss]amples?/`), | |||||
) | |||||
var DocumentationMatchers = []regex.EnryRegexp{ | |||||
regex.MustCompile(`^[Dd]ocs?/`), | |||||
regex.MustCompile(`(^|/)[Dd]ocumentation/`), | |||||
regex.MustCompile(`(^|/)[Gg]roovydoc/`), | |||||
regex.MustCompile(`(^|/)[Jj]avadoc/`), | |||||
regex.MustCompile(`^[Mm]an/`), | |||||
regex.MustCompile(`^[Ee]xamples/`), | |||||
regex.MustCompile(`^[Dd]emos?/`), | |||||
regex.MustCompile(`(^|/)inst/doc/`), | |||||
regex.MustCompile(`(^|/)CHANGE(S|LOG)?(\.|$)`), | |||||
regex.MustCompile(`(^|/)CONTRIBUTING(\.|$)`), | |||||
regex.MustCompile(`(^|/)COPYING(\.|$)`), | |||||
regex.MustCompile(`(^|/)INSTALL(\.|$)`), | |||||
regex.MustCompile(`(^|/)LICEN[CS]E(\.|$)`), | |||||
regex.MustCompile(`(^|/)[Ll]icen[cs]e(\.|$)`), | |||||
regex.MustCompile(`(^|/)README(\.|$)`), | |||||
regex.MustCompile(`(^|/)[Rr]eadme(\.|$)`), | |||||
regex.MustCompile(`^[Ss]amples?/`), | |||||
} |
package data | |||||
import ( | |||||
"bytes" | |||||
"strings" | |||||
"github.com/go-enry/go-enry/v2/regex" | |||||
) | |||||
// GeneratedCodeExtensions contains all extensions that belong to generated | |||||
// files for sure. | |||||
var GeneratedCodeExtensions = map[string]struct{}{ | |||||
// XCode files | |||||
".nib": {}, | |||||
".xcworkspacedata": {}, | |||||
".xcuserstate": {}, | |||||
} | |||||
// GeneratedCodeNameMatcher is a function that tells whether the file with the | |||||
// given name is generated. | |||||
type GeneratedCodeNameMatcher func(string) bool | |||||
func nameMatches(pattern string) GeneratedCodeNameMatcher { | |||||
r := regex.MustCompile(pattern) | |||||
return func(name string) bool { | |||||
return r.MatchString(name) | |||||
} | |||||
} | |||||
func nameContains(pattern string) GeneratedCodeNameMatcher { | |||||
return func(name string) bool { | |||||
return strings.Contains(name, pattern) | |||||
} | |||||
} | |||||
func nameEndsWith(pattern string) GeneratedCodeNameMatcher { | |||||
return func(name string) bool { | |||||
return strings.HasSuffix(name, pattern) | |||||
} | |||||
} | |||||
// GeneratedCodeNameMatchers are all the matchers that check whether the code
// is generated based only on the file name. Each entry is built with the
// nameMatches/nameContains/nameEndsWith helpers above.
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
	// Cocoa pods
	nameMatches(`(^Pods|\/Pods)\/`),
	// Carthage build
	nameMatches(`(^|\/)Carthage\/Build\/`),
	// NET designer file
	nameMatches(`(?i)\.designer\.(cs|vb)$`),
	// Generated NET specflow feature file
	nameEndsWith(".feature.cs"),
	// Node modules
	nameContains("node_modules/"),
	// Go vendor
	nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),
	// Go lock
	nameEndsWith("Gopkg.lock"),
	nameEndsWith("glide.lock"),
	// Esy lock
	nameMatches(`(^|\/)(\w+\.)?esy.lock$`),
	// NPM shrinkwrap
	nameEndsWith("npm-shrinkwrap.json"),
	// NPM package lock
	nameEndsWith("package-lock.json"),
	// Yarn plugnplay
	nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),
	// Godeps
	nameContains("Godeps/"),
	// Composer lock
	nameEndsWith("composer.lock"),
	// Generated by zephir
	nameMatches(`.\.zep\.(?:c|h|php)$`),
	// Cargo lock
	nameEndsWith("Cargo.lock"),
	// Pipenv lock
	nameEndsWith("Pipfile.lock"),
	// GraphQL relay
	nameContains("__generated__/"),
}
// GeneratedCodeMatcher checks whether the file with the given data is
// generated code. It receives the file path, its extension (with the leading
// dot), and the raw content bytes.
type GeneratedCodeMatcher func(path, ext string, content []byte) bool

// GeneratedCodeMatchers is the list of all generated code matchers that
// rely on checking the content of the file to make the guess. They are
// evaluated in order; any single match classifies the file as generated.
var GeneratedCodeMatchers = []GeneratedCodeMatcher{
	isMinifiedFile,
	hasSourceMapReference,
	isSourceMap,
	isCompiledCoffeeScript,
	isGeneratedNetDocfile,
	isGeneratedJavaScriptPEGParser,
	isGeneratedPostScript,
	isGeneratedGo,
	isGeneratedProtobuf,
	isGeneratedJavaScriptProtocolBuffer,
	isGeneratedApacheThrift,
	isGeneratedJNIHeader,
	isVCRCassette,
	isCompiledCythonFile,
	isGeneratedModule,
	isGeneratedUnity3DMeta,
	isGeneratedRacc,
	isGeneratedJFlex,
	isGeneratedGrammarKit,
	isGeneratedRoxygen2,
	isGeneratedJison,
	isGeneratedGRPCCpp,
	isGeneratedDart,
	isGeneratedPerlPPPortHeader,
	isGeneratedGameMakerStudio,
	isGeneratedGimp,
	isGeneratedVisualStudio6,
	isGeneratedHaxe,
	isGeneratedHTML,
	isGeneratedJooq,
}
// canBeMinified reports whether files with the given extension are
// candidates for minification (only JavaScript and CSS are).
func canBeMinified(ext string) bool {
	switch ext {
	case ".js", ".css":
		return true
	default:
		return false
	}
}
// isMinifiedFile returns whether the file may be minified. | |||||
// We consider a minified file any css or js file whose average number of chars | |||||
// per line is more than 110. | |||||
func isMinifiedFile(path, ext string, content []byte) bool { | |||||
if !canBeMinified(ext) { | |||||
return false | |||||
} | |||||
var chars, lines uint64 | |||||
forEachLine(content, func(line []byte) { | |||||
chars += uint64(len(line)) | |||||
lines++ | |||||
}) | |||||
if lines == 0 { | |||||
return false | |||||
} | |||||
return chars/lines > 110 | |||||
} | |||||
var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`) | |||||
// hasSourceMapReference returns whether the file contains a reference to a | |||||
// source-map file. | |||||
func hasSourceMapReference(_ string, ext string, content []byte) bool { | |||||
if !canBeMinified(ext) { | |||||
return false | |||||
} | |||||
for _, line := range getLines(content, -2) { | |||||
if sourceMapRegex.Match(line) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
var sourceMapRegexps = []regex.EnryRegexp{ | |||||
regex.MustCompile(`^{"version":\d+,`), | |||||
regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`), | |||||
} | |||||
// isSourceMap returns whether the file itself is a source map. | |||||
func isSourceMap(path, _ string, content []byte) bool { | |||||
if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") { | |||||
return true | |||||
} | |||||
firstLine := getFirstLine(content) | |||||
if len(firstLine) == 0 { | |||||
return false | |||||
} | |||||
for _, r := range sourceMapRegexps { | |||||
if r.Match(firstLine) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func isCompiledCoffeeScript(path, ext string, content []byte) bool { | |||||
if ext != ".js" { | |||||
return false | |||||
} | |||||
firstLine := getFirstLine(content) | |||||
lastLines := getLines(content, -2) | |||||
if len(lastLines) < 2 { | |||||
return false | |||||
} | |||||
if string(firstLine) == "(function() {" && | |||||
string(lastLines[1]) == "}).call(this);" && | |||||
string(lastLines[0]) == "" { | |||||
score := 0 | |||||
forEachLine(content, func(line []byte) { | |||||
if bytes.Contains(line, []byte("var ")) { | |||||
// Underscored temp vars are likely to be Coffee | |||||
score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results") | |||||
// bind and extend functions are very Coffee specific | |||||
score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice") | |||||
} | |||||
}) | |||||
// Require a score of 3. This is fairly abritrary. Consider tweaking later. | |||||
// See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213 | |||||
return score >= 3 | |||||
} | |||||
return false | |||||
} | |||||
// isGeneratedNetDocfile returns whether the XML file is a .NET documentation
// file: <doc> on the second line, <assembly> on the third, and </doc> on the
// next-to-last line.
func isGeneratedNetDocfile(_, ext string, content []byte) bool {
	if ext != ".xml" {
		return false
	}

	lines := bytes.Split(content, []byte{'\n'})
	if len(lines) <= 3 {
		return false
	}

	if !bytes.Contains(lines[1], []byte("<doc>")) {
		return false
	}
	if !bytes.Contains(lines[2], []byte("<assembly>")) {
		return false
	}
	return bytes.Contains(lines[len(lines)-2], []byte("</doc>"))
}
var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`) | |||||
func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool { | |||||
if ext != ".js" { | |||||
return false | |||||
} | |||||
// PEG.js-generated parsers include a comment near the top of the file | |||||
// that marks them as such. | |||||
return pegJavaScriptGeneratedRegex.Match(bytes.Join(getLines(content, 5), []byte(""))) | |||||
} | |||||
// postScriptType1And42Regex matches the hex-encoded byte-stream prologue of
// Type 1 (`currentfile eexec`) and Type 42 (`/sfnts [`) fonts embedded in
// PostScript.
var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`)

// postScriptRegexes match "%%Creator:" values of known PostScript
// generators: anything with a version digit, plus a few tools that do not
// write one.
var postScriptRegexes = []regex.EnryRegexp{
	regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`),
	regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`),
}

// isGeneratedPostScript returns whether the PostScript file was produced by
// a font converter or a known generator rather than written by hand.
func isGeneratedPostScript(_, ext string, content []byte) bool {
	if ext != ".ps" && ext != ".eps" && ext != ".pfa" {
		return false
	}

	// Type 1 and Type 42 fonts converted to PostScript are stored as hex-encoded byte streams; these
	// streams are always preceded the `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42).
	if postScriptType1And42Regex.Match(content) {
		return true
	}

	// We analyze the "%%Creator:" comment, which contains the author/generator
	// of the file. If there is one, it should be in one of the first few lines.
	var creator []byte
	for _, line := range getLines(content, 10) {
		if bytes.HasPrefix(line, []byte("%%Creator: ")) {
			creator = line
			break
		}
	}

	if len(creator) == 0 {
		return false
	}

	// EAGLE doesn't include a version number when it generates PostScript.
	// However, it does prepend its name to the document's "%%Title" field.
	if bytes.Contains(creator, []byte("EAGLE")) {
		for _, line := range getLines(content, 5) {
			if bytes.HasPrefix(line, []byte("%%Title: EAGLE Drawing ")) {
				return true
			}
		}
	}

	// Most generators write their version number, while human authors' or companies'
	// names don't contain numbers. So look if the line contains digits. Also
	// look for some special cases without version numbers.
	for _, r := range postScriptRegexes {
		if r.Match(creator) {
			return true
		}
	}

	return false
}
func isGeneratedGo(_, ext string, content []byte) bool { | |||||
if ext != ".go" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 40) | |||||
if len(lines) <= 1 { | |||||
return false | |||||
} | |||||
for _, line := range lines { | |||||
if bytes.Contains(line, []byte("Code generated by")) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
var protoExtensions = map[string]struct{}{ | |||||
".py": {}, | |||||
".java": {}, | |||||
".h": {}, | |||||
".cc": {}, | |||||
".cpp": {}, | |||||
".m": {}, | |||||
".rb": {}, | |||||
".php": {}, | |||||
} | |||||
func isGeneratedProtobuf(_, ext string, content []byte) bool { | |||||
if _, ok := protoExtensions[ext]; !ok { | |||||
return false | |||||
} | |||||
lines := getLines(content, 3) | |||||
if len(lines) <= 1 { | |||||
return false | |||||
} | |||||
for _, line := range lines { | |||||
if bytes.Contains(line, []byte("Generated by the protocol buffer compiler. DO NOT EDIT!")) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool { | |||||
if ext != ".js" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 6) | |||||
if len(lines) < 6 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[5], []byte("GENERATED CODE -- DO NOT EDIT!")) | |||||
} | |||||
var apacheThriftExtensions = map[string]struct{}{ | |||||
".rb": {}, | |||||
".py": {}, | |||||
".go": {}, | |||||
".js": {}, | |||||
".m": {}, | |||||
".java": {}, | |||||
".h": {}, | |||||
".cc": {}, | |||||
".cpp": {}, | |||||
".php": {}, | |||||
} | |||||
func isGeneratedApacheThrift(_, ext string, content []byte) bool { | |||||
if _, ok := apacheThriftExtensions[ext]; !ok { | |||||
return false | |||||
} | |||||
for _, line := range getLines(content, 6) { | |||||
if bytes.Contains(line, []byte("Autogenerated by Thrift Compiler")) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func isGeneratedJNIHeader(_, ext string, content []byte) bool { | |||||
if ext != ".h" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 2) | |||||
if len(lines) < 2 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) && | |||||
bytes.Contains(lines[1], []byte("#include <jni.h>")) | |||||
} | |||||
func isVCRCassette(_, ext string, content []byte) bool { | |||||
if ext != ".yml" { | |||||
return false | |||||
} | |||||
lines := getLines(content, -2) | |||||
if len(lines) < 2 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[1], []byte("recorded_with: VCR")) | |||||
} | |||||
func isCompiledCythonFile(_, ext string, content []byte) bool { | |||||
if ext != ".c" && ext != ".cpp" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("Generated by Cython")) | |||||
} | |||||
func isGeneratedModule(_, ext string, content []byte) bool { | |||||
if ext != ".mod" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) || | |||||
bytes.Contains(lines[0], []byte("GFORTRAN module version '")) | |||||
} | |||||
func isGeneratedUnity3DMeta(_, ext string, content []byte) bool { | |||||
if ext != ".meta" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("fileFormatVersion: ")) | |||||
} | |||||
func isGeneratedRacc(_, ext string, content []byte) bool { | |||||
if ext != ".rb" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 3) | |||||
if len(lines) < 3 { | |||||
return false | |||||
} | |||||
return bytes.HasPrefix(lines[2], []byte("# This file is automatically generated by Racc")) | |||||
} | |||||
func isGeneratedJFlex(_, ext string, content []byte) bool { | |||||
if ext != ".java" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex ")) | |||||
} | |||||
func isGeneratedGrammarKit(_, ext string, content []byte) bool { | |||||
if ext != ".java" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("// This is a generated file. Not intended for manual editing.")) | |||||
} | |||||
func isGeneratedRoxygen2(_, ext string, content []byte) bool { | |||||
if ext != ".rd" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand")) | |||||
} | |||||
func isGeneratedJison(_, ext string, content []byte) bool { | |||||
if ext != ".js" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) || | |||||
bytes.Contains(lines[0], []byte("/* generated by jison-lex ")) | |||||
} | |||||
func isGeneratedGRPCCpp(_, ext string, content []byte) bool { | |||||
switch ext { | |||||
case ".cpp", ".hpp", ".h", ".cc": | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[0], []byte("// Generated by the gRPC")) | |||||
default: | |||||
return false | |||||
} | |||||
} | |||||
var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`) | |||||
func isGeneratedDart(_, ext string, content []byte) bool { | |||||
if ext != ".dart" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
return dartRegex.Match(bytes.ToLower(lines[0])) | |||||
} | |||||
func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool { | |||||
if !strings.HasSuffix(name, "ppport.h") { | |||||
return false | |||||
} | |||||
lines := getLines(content, 10) | |||||
if len(lines) < 10 { | |||||
return false | |||||
} | |||||
return bytes.Contains(lines[8], []byte("Automatically created by Devel::PPPort")) | |||||
} | |||||
var ( | |||||
gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`) | |||||
gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`) | |||||
) | |||||
func isGeneratedGameMakerStudio(_, ext string, content []byte) bool { | |||||
if ext != ".yy" && ext != ".yyp" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 3) | |||||
if len(lines) < 3 { | |||||
return false | |||||
} | |||||
return gameMakerStudioThirdLineRegex.Match(lines[2]) || | |||||
gameMakerStudioFirstLineRegex.Match(lines[0]) | |||||
} | |||||
var gimpRegexes = []regex.EnryRegexp{ | |||||
regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`), | |||||
regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`), | |||||
} | |||||
func isGeneratedGimp(_, ext string, content []byte) bool { | |||||
if ext != ".c" && ext != ".h" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 1) | |||||
if len(lines) < 1 { | |||||
return false | |||||
} | |||||
for _, r := range gimpRegexes { | |||||
if r.Match(lines[0]) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func isGeneratedVisualStudio6(_, ext string, content []byte) bool { | |||||
if ext != ".dsp" { | |||||
return false | |||||
} | |||||
for _, l := range getLines(content, 3) { | |||||
if bytes.Contains(l, []byte("# Microsoft Developer Studio Generated Build File")) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
var haxeExtensions = map[string]struct{}{ | |||||
".js": {}, | |||||
".py": {}, | |||||
".lua": {}, | |||||
".cpp": {}, | |||||
".h": {}, | |||||
".java": {}, | |||||
".cs": {}, | |||||
".php": {}, | |||||
} | |||||
func isGeneratedHaxe(_, ext string, content []byte) bool { | |||||
if _, ok := haxeExtensions[ext]; !ok { | |||||
return false | |||||
} | |||||
for _, l := range getLines(content, 3) { | |||||
if bytes.Contains(l, []byte("Generated by Haxe")) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
var ( | |||||
doxygenRegex = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`) | |||||
htmlMetaRegex = regex.MustCompile(`<meta(\s+[^>]+)>`) | |||||
htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`) | |||||
orgModeMetaRegex = regex.MustCompile(`org\s+mode`) | |||||
) | |||||
func isGeneratedHTML(_, ext string, content []byte) bool { | |||||
if ext != ".html" && ext != ".htm" && ext != ".xhtml" { | |||||
return false | |||||
} | |||||
lines := getLines(content, 30) | |||||
// Pkgdown | |||||
if len(lines) >= 2 { | |||||
for _, l := range lines[:2] { | |||||
if bytes.Contains(l, []byte("<!-- Generated by pkgdown: do not edit by hand -->")) { | |||||
return true | |||||
} | |||||
} | |||||
} | |||||
// Mandoc | |||||
if len(lines) > 2 && | |||||
bytes.HasPrefix(lines[2], []byte("<!-- This is an automatically generated file.")) { | |||||
return true | |||||
} | |||||
// Doxygen | |||||
for _, l := range lines { | |||||
if doxygenRegex.Match(l) { | |||||
return true | |||||
} | |||||
} | |||||
// HTML tag: <meta name="generator" content="" /> | |||||
part := bytes.ToLower(bytes.Join(lines, []byte{' '})) | |||||
part = bytes.ReplaceAll(part, []byte{'\n'}, []byte{}) | |||||
part = bytes.ReplaceAll(part, []byte{'\r'}, []byte{}) | |||||
matches := htmlMetaRegex.FindAll(part, -1) | |||||
if len(matches) == 0 { | |||||
return false | |||||
} | |||||
for _, m := range matches { | |||||
var name, value, content string | |||||
ms := htmlMetaContentRegex.FindAllStringSubmatch(string(m), -1) | |||||
for _, m := range ms { | |||||
switch m[1] { | |||||
case "name": | |||||
name = m[2] | |||||
case "value": | |||||
value = m[2] | |||||
case "content": | |||||
content = m[2] | |||||
} | |||||
} | |||||
var val = value | |||||
if val == "" { | |||||
val = content | |||||
} | |||||
name = strings.Trim(name, `"'`) | |||||
val = strings.Trim(val, `"'`) | |||||
if name != "generator" || val == "" { | |||||
continue | |||||
} | |||||
if strings.Contains(val, "jlatex2html") || | |||||
strings.Contains(val, "latex2html") || | |||||
strings.Contains(val, "groff") || | |||||
strings.Contains(val, "makeinfo") || | |||||
strings.Contains(val, "texi2html") || | |||||
strings.Contains(val, "ronn") || | |||||
orgModeMetaRegex.MatchString(val) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func isGeneratedJooq(_, ext string, content []byte) bool { | |||||
if ext != ".java" { | |||||
return false | |||||
} | |||||
for _, l := range getLines(content, 2) { | |||||
if bytes.Contains(l, []byte("This file is generated by jOOQ.")) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func getFirstLine(content []byte) []byte { | |||||
lines := getLines(content, 1) | |||||
if len(lines) > 0 { | |||||
return lines[0] | |||||
} | |||||
return nil | |||||
} | |||||
// getLines returns up to the first n lines. A negative index will return up to
// the last n lines in reverse order.
func getLines(content []byte, n int) [][]byte {
	var result [][]byte
	if n < 0 {
		// Backward scan: cut at each '\n', walking from the end of content.
		for pos := len(content); pos > 0 && len(result) < -n; {
			nlpos := bytes.LastIndexByte(content[:pos], '\n')
			// NOTE(review): this guard compares against len(content)-1 on
			// every iteration but can only fail on the first one; its effect
			// is to drop the final segment when it starts at or after the
			// second-to-last byte (e.g. the empty segment after a trailing
			// '\n', or a one-byte last line without one).
			// isCompiledCoffeeScript depends on the exact segments returned
			// here, so confirm callers before "fixing" this.
			if nlpos+1 < len(content)-1 {
				result = append(result, content[nlpos+1:pos])
			}
			pos = nlpos
		}
	} else {
		// Forward scan: a missing trailing '\n' still yields a final line.
		for pos := 0; pos < len(content) && len(result) < n; {
			nlpos := bytes.IndexByte(content[pos:], '\n')
			if nlpos < 0 && pos < len(content) {
				nlpos = len(content)
			} else if nlpos >= 0 {
				nlpos += pos
			}

			result = append(result, content[pos:nlpos])
			pos = nlpos + 1
		}
	}

	return result
}
// forEachLine invokes cb once per line of content. Lines are delimited by
// '\n' (not included in the callback argument); a final line without a
// trailing newline is still visited.
func forEachLine(content []byte, cb func([]byte)) {
	for pos := 0; pos < len(content); {
		end := bytes.IndexByte(content[pos:], '\n')
		if end < 0 {
			end = len(content)
		} else {
			end += pos
		}
		cb(content[pos:end])
		pos = end + 1
	}
}
// countAppearancesInLine sums the number of non-overlapping occurrences of
// every target substring within line.
func countAppearancesInLine(line []byte, targets ...string) int {
	total := 0
	for _, target := range targets {
		total += bytes.Count(line, []byte(target))
	}
	return total
}
package data | |||||
import "github.com/go-enry/go-enry/v2/regex" | |||||
// TestMatchers is hand made collection of regexp used by the function `enry.IsTest`
// to identify test files in different languages.
var TestMatchers = []regex.EnryRegexp{
	regex.MustCompile(`(^|/)tests/.*Test\.php$`),                   // PHP
	regex.MustCompile(`(^|/)test/.*Test(s?)\.java$`),               // Java
	regex.MustCompile(`(^|/)test(/|/.*/)Test.*\.java$`),            // Java (Test* prefix)
	regex.MustCompile(`(^|/)test/.*(Test(s?)|Spec(s?))\.scala$`),   // Scala
	regex.MustCompile(`(^|/)test_.*\.py$`),                         // Python
	regex.MustCompile(`(^|/).*_test\.go$`),                         // Go
	regex.MustCompile(`(^|/).*_(test|spec)\.rb$`),                  // Ruby
	regex.MustCompile(`(^|/).*Test(s?)\.cs$`),                      // C#
	regex.MustCompile(`(^|/).*\.(test|spec)\.(ts|tsx|js)$`),        // JS/TS
}
package data | package data | ||||
import "gopkg.in/toqueteos/substring.v1" | |||||
import "github.com/go-enry/go-enry/v2/regex" | |||||
// VendorMatchers matches paths of third-party/vendored code that should not
// be counted as project code: package-manager output, autotools artifacts,
// bundled JS/CSS libraries and frameworks, build-tool wrappers, editor
// metadata, and similar files.
var VendorMatchers = substring.Or(
	substring.Regexp(`(^|/)cache/`),
	substring.Regexp(`^[Dd]ependencies/`),
	substring.Regexp(`(^|/)dist/`),
	substring.Regexp(`^deps/`),
	substring.Regexp(`(^|/)configure$`),
	substring.Regexp(`(^|/)config.guess$`),
	substring.Regexp(`(^|/)config.sub$`),
	substring.Regexp(`(^|/)aclocal.m4`),
	substring.Regexp(`(^|/)libtool.m4`),
	substring.Regexp(`(^|/)ltoptions.m4`),
	substring.Regexp(`(^|/)ltsugar.m4`),
	substring.Regexp(`(^|/)ltversion.m4`),
	substring.Regexp(`(^|/)lt~obsolete.m4`),
	substring.Regexp(`dotnet-install\.(ps1|sh)$`),
	substring.Regexp(`cpplint.py`),
	substring.Regexp(`node_modules/`),
	substring.Regexp(`(^|/)\.yarn/releases/`),
	substring.Regexp(`(^|/)_esy$`),
	substring.Regexp(`bower_components/`),
	substring.Regexp(`^rebar$`),
	substring.Regexp(`erlang.mk`),
	substring.Regexp(`Godeps/_workspace/`),
	substring.Regexp(`(^|/)testdata/`),
	substring.Regexp(`.indent.pro`),
	substring.Regexp(`(\.|-)min\.(js|css)$`),
	substring.Regexp(`([^\s]*)import\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
	substring.Regexp(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
	substring.Regexp(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)foundation\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)normalize\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)skeleton\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)animate\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)materialize\.(css|less|scss|styl|js)$`),
	substring.Regexp(`(^|/)select2/.*\.(css|scss|js)$`),
	substring.Regexp(`(^|/)bulma\.(css|sass|scss)$`),
	substring.Regexp(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
	substring.Regexp(`vendors?/`),
	substring.Regexp(`extern(al)?/`),
	substring.Regexp(`(^|/)[Vv]+endor/`),
	substring.Regexp(`^debian/`),
	substring.Regexp(`run.n$`),
	substring.Regexp(`bootstrap-datepicker/`),
	substring.Regexp(`(^|/)jquery([^.]*)\.js$`),
	substring.Regexp(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
	substring.Regexp(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
	substring.Regexp(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
	substring.Regexp(`jquery.fn.gantt.js`),
	substring.Regexp(`jquery.fancybox.(js|css)`),
	substring.Regexp(`fuelux.js`),
	substring.Regexp(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
	substring.Regexp(`jquery.dataTables.js`),
	substring.Regexp(`bootbox.js`),
	substring.Regexp(`pdf.worker.js`),
	substring.Regexp(`(^|/)slick\.\w+.js$`),
	substring.Regexp(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
	substring.Regexp(`leaflet.draw-src.js`),
	substring.Regexp(`leaflet.draw.css`),
	substring.Regexp(`Control.FullScreen.css`),
	substring.Regexp(`Control.FullScreen.js`),
	substring.Regexp(`leaflet.spin.js`),
	substring.Regexp(`wicket-leaflet.js`),
	substring.Regexp(`.sublime-project`),
	substring.Regexp(`.sublime-workspace`),
	substring.Regexp(`.vscode`),
	substring.Regexp(`(^|/)prototype(.*)\.js$`),
	substring.Regexp(`(^|/)effects\.js$`),
	substring.Regexp(`(^|/)controls\.js$`),
	substring.Regexp(`(^|/)dragdrop\.js$`),
	substring.Regexp(`(.*?)\.d\.ts$`),
	substring.Regexp(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`),
	substring.Regexp(`(^|/)dojo\.js$`),
	substring.Regexp(`(^|/)MochiKit\.js$`),
	substring.Regexp(`(^|/)yahoo-([^.]*)\.js$`),
	substring.Regexp(`(^|/)yui([^.]*)\.js$`),
	substring.Regexp(`(^|/)ckeditor\.js$`),
	substring.Regexp(`(^|/)tiny_mce([^.]*)\.js$`),
	substring.Regexp(`(^|/)tiny_mce/(langs|plugins|themes|utils)`),
	substring.Regexp(`(^|/)ace-builds/`),
	substring.Regexp(`(^|/)fontello(.*?)\.css$`),
	substring.Regexp(`(^|/)MathJax/`),
	substring.Regexp(`(^|/)Chart\.js$`),
	substring.Regexp(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`),
	substring.Regexp(`(^|/)shBrush([^.]*)\.js$`),
	substring.Regexp(`(^|/)shCore\.js$`),
	substring.Regexp(`(^|/)shLegacy\.js$`),
	substring.Regexp(`(^|/)angular([^.]*)\.js$`),
	substring.Regexp(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`),
	substring.Regexp(`(^|/)react(-[^.]*)?\.js$`),
	substring.Regexp(`(^|/)flow-typed/.*\.js$`),
	substring.Regexp(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`),
	substring.Regexp(`(^|/)modernizr\.custom\.\d+\.js$`),
	substring.Regexp(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`),
	substring.Regexp(`(^|/)docs?/_?(build|themes?|templates?|static)/`),
	substring.Regexp(`(^|/)admin_media/`),
	substring.Regexp(`(^|/)env/`),
	substring.Regexp(`^fabfile\.py$`),
	substring.Regexp(`^waf$`),
	substring.Regexp(`^.osx$`),
	substring.Regexp(`\.xctemplate/`),
	substring.Regexp(`\.imageset/`),
	substring.Regexp(`(^|/)Carthage/`),
	substring.Regexp(`(^|/)Sparkle/`),
	substring.Regexp(`Crashlytics.framework/`),
	substring.Regexp(`Fabric.framework/`),
	substring.Regexp(`BuddyBuildSDK.framework/`),
	substring.Regexp(`Realm.framework`),
	substring.Regexp(`RealmSwift.framework`),
	substring.Regexp(`gitattributes$`),
	substring.Regexp(`gitignore$`),
	substring.Regexp(`gitmodules$`),
	substring.Regexp(`(^|/)gradlew$`),
	substring.Regexp(`(^|/)gradlew\.bat$`),
	substring.Regexp(`(^|/)gradle/wrapper/`),
	substring.Regexp(`(^|/)mvnw$`),
	substring.Regexp(`(^|/)mvnw\.cmd$`),
	substring.Regexp(`(^|/)\.mvn/wrapper/`),
	substring.Regexp(`-vsdoc\.js$`),
	substring.Regexp(`\.intellisense\.js$`),
	substring.Regexp(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`),
	substring.Regexp(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`),
	substring.Regexp(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`),
	substring.Regexp(`^[Pp]ackages\/.+\.\d+\/`),
	substring.Regexp(`(^|/)extjs/.*?\.js$`),
	substring.Regexp(`(^|/)extjs/.*?\.xml$`),
	substring.Regexp(`(^|/)extjs/.*?\.txt$`),
	substring.Regexp(`(^|/)extjs/.*?\.html$`),
	substring.Regexp(`(^|/)extjs/.*?\.properties$`),
	substring.Regexp(`(^|/)extjs/.sencha/`),
	substring.Regexp(`(^|/)extjs/docs/`),
	substring.Regexp(`(^|/)extjs/builds/`),
	substring.Regexp(`(^|/)extjs/cmd/`),
	substring.Regexp(`(^|/)extjs/examples/`),
	substring.Regexp(`(^|/)extjs/locale/`),
	substring.Regexp(`(^|/)extjs/packages/`),
	substring.Regexp(`(^|/)extjs/plugins/`),
	substring.Regexp(`(^|/)extjs/resources/`),
	substring.Regexp(`(^|/)extjs/src/`),
	substring.Regexp(`(^|/)extjs/welcome/`),
	substring.Regexp(`(^|/)html5shiv\.js$`),
	substring.Regexp(`^[Tt]ests?/fixtures/`),
	substring.Regexp(`^[Ss]pecs?/fixtures/`),
	substring.Regexp(`(^|/)cordova([^.]*)\.js$`),
	substring.Regexp(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`),
	substring.Regexp(`foundation(\..*)?\.js$`),
	substring.Regexp(`^Vagrantfile$`),
	substring.Regexp(`.[Dd][Ss]_[Ss]tore$`),
	substring.Regexp(`^vignettes/`),
	substring.Regexp(`^inst/extdata/`),
	substring.Regexp(`octicons.css`),
	substring.Regexp(`sprockets-octicons.scss`),
	substring.Regexp(`(^|/)activator$`),
	substring.Regexp(`(^|/)activator\.bat$`),
	substring.Regexp(`proguard.pro`),
	substring.Regexp(`proguard-rules.pro`),
	substring.Regexp(`^puphpet/`),
	substring.Regexp(`(^|/)\.google_apis/`),
	substring.Regexp(`^Jenkinsfile$`),
)
var VendorMatchers = []regex.EnryRegexp{ | |||||
regex.MustCompile(`(^|/)cache/`), | |||||
regex.MustCompile(`^[Dd]ependencies/`), | |||||
regex.MustCompile(`(^|/)dist/`), | |||||
regex.MustCompile(`^deps/`), | |||||
regex.MustCompile(`(^|/)configure$`), | |||||
regex.MustCompile(`(^|/)config.guess$`), | |||||
regex.MustCompile(`(^|/)config.sub$`), | |||||
regex.MustCompile(`(^|/)aclocal.m4`), | |||||
regex.MustCompile(`(^|/)libtool.m4`), | |||||
regex.MustCompile(`(^|/)ltoptions.m4`), | |||||
regex.MustCompile(`(^|/)ltsugar.m4`), | |||||
regex.MustCompile(`(^|/)ltversion.m4`), | |||||
regex.MustCompile(`(^|/)lt~obsolete.m4`), | |||||
regex.MustCompile(`dotnet-install\.(ps1|sh)$`), | |||||
regex.MustCompile(`cpplint.py`), | |||||
regex.MustCompile(`node_modules/`), | |||||
regex.MustCompile(`(^|/)\.yarn/releases/`), | |||||
regex.MustCompile(`(^|/)_esy$`), | |||||
regex.MustCompile(`bower_components/`), | |||||
regex.MustCompile(`^rebar$`), | |||||
regex.MustCompile(`erlang.mk`), | |||||
regex.MustCompile(`Godeps/_workspace/`), | |||||
regex.MustCompile(`(^|/)testdata/`), | |||||
regex.MustCompile(`.indent.pro`), | |||||
regex.MustCompile(`(\.|-)min\.(js|css)$`), | |||||
regex.MustCompile(`([^\s]*)import\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)font-?awesome\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)foundation\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)normalize\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)skeleton\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)animate\.(css|less|scss|styl)$`), | |||||
regex.MustCompile(`(^|/)materialize\.(css|less|scss|styl|js)$`), | |||||
regex.MustCompile(`(^|/)select2/.*\.(css|scss|js)$`), | |||||
regex.MustCompile(`(^|/)bulma\.(css|sass|scss)$`), | |||||
regex.MustCompile(`(3rd|[Tt]hird)[-_]?[Pp]arty/`), | |||||
regex.MustCompile(`vendors?/`), | |||||
regex.MustCompile(`extern(al)?/`), | |||||
regex.MustCompile(`(^|/)[Vv]+endor/`), | |||||
regex.MustCompile(`^debian/`), | |||||
regex.MustCompile(`run.n$`), | |||||
regex.MustCompile(`bootstrap-datepicker/`), | |||||
regex.MustCompile(`(^|/)jquery([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`), | |||||
regex.MustCompile(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`), | |||||
regex.MustCompile(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`), | |||||
regex.MustCompile(`jquery.fn.gantt.js`), | |||||
regex.MustCompile(`jquery.fancybox.(js|css)`), | |||||
regex.MustCompile(`fuelux.js`), | |||||
regex.MustCompile(`(^|/)jquery\.fileupload(-\w+)?\.js$`), | |||||
regex.MustCompile(`jquery.dataTables.js`), | |||||
regex.MustCompile(`bootbox.js`), | |||||
regex.MustCompile(`pdf.worker.js`), | |||||
regex.MustCompile(`(^|/)slick\.\w+.js$`), | |||||
regex.MustCompile(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`), | |||||
regex.MustCompile(`leaflet.draw-src.js`), | |||||
regex.MustCompile(`leaflet.draw.css`), | |||||
regex.MustCompile(`Control.FullScreen.css`), | |||||
regex.MustCompile(`Control.FullScreen.js`), | |||||
regex.MustCompile(`leaflet.spin.js`), | |||||
regex.MustCompile(`wicket-leaflet.js`), | |||||
regex.MustCompile(`.sublime-project`), | |||||
regex.MustCompile(`.sublime-workspace`), | |||||
regex.MustCompile(`.vscode`), | |||||
regex.MustCompile(`(^|/)prototype(.*)\.js$`), | |||||
regex.MustCompile(`(^|/)effects\.js$`), | |||||
regex.MustCompile(`(^|/)controls\.js$`), | |||||
regex.MustCompile(`(^|/)dragdrop\.js$`), | |||||
regex.MustCompile(`(.*?)\.d\.ts$`), | |||||
regex.MustCompile(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)dojo\.js$`), | |||||
regex.MustCompile(`(^|/)MochiKit\.js$`), | |||||
regex.MustCompile(`(^|/)yahoo-([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)yui([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)ckeditor\.js$`), | |||||
regex.MustCompile(`(^|/)tiny_mce([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)tiny_mce/(langs|plugins|themes|utils)`), | |||||
regex.MustCompile(`(^|/)ace-builds/`), | |||||
regex.MustCompile(`(^|/)fontello(.*?)\.css$`), | |||||
regex.MustCompile(`(^|/)MathJax/`), | |||||
regex.MustCompile(`(^|/)Chart\.js$`), | |||||
regex.MustCompile(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`), | |||||
regex.MustCompile(`(^|/)shBrush([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)shCore\.js$`), | |||||
regex.MustCompile(`(^|/)shLegacy\.js$`), | |||||
regex.MustCompile(`(^|/)angular([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)react(-[^.]*)?\.js$`), | |||||
regex.MustCompile(`(^|/)flow-typed/.*\.js$`), | |||||
regex.MustCompile(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`), | |||||
regex.MustCompile(`(^|/)modernizr\.custom\.\d+\.js$`), | |||||
regex.MustCompile(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`), | |||||
regex.MustCompile(`(^|/)docs?/_?(build|themes?|templates?|static)/`), | |||||
regex.MustCompile(`(^|/)admin_media/`), | |||||
regex.MustCompile(`(^|/)env/`), | |||||
regex.MustCompile(`^fabfile\.py$`), | |||||
regex.MustCompile(`^waf$`), | |||||
regex.MustCompile(`^.osx$`), | |||||
regex.MustCompile(`\.xctemplate/`), | |||||
regex.MustCompile(`\.imageset/`), | |||||
regex.MustCompile(`(^|/)Carthage/`), | |||||
regex.MustCompile(`(^|/)Sparkle/`), | |||||
regex.MustCompile(`Crashlytics.framework/`), | |||||
regex.MustCompile(`Fabric.framework/`), | |||||
regex.MustCompile(`BuddyBuildSDK.framework/`), | |||||
regex.MustCompile(`Realm.framework`), | |||||
regex.MustCompile(`RealmSwift.framework`), | |||||
regex.MustCompile(`gitattributes$`), | |||||
regex.MustCompile(`gitignore$`), | |||||
regex.MustCompile(`gitmodules$`), | |||||
regex.MustCompile(`(^|/)gradlew$`), | |||||
regex.MustCompile(`(^|/)gradlew\.bat$`), | |||||
regex.MustCompile(`(^|/)gradle/wrapper/`), | |||||
regex.MustCompile(`(^|/)mvnw$`), | |||||
regex.MustCompile(`(^|/)mvnw\.cmd$`), | |||||
regex.MustCompile(`(^|/)\.mvn/wrapper/`), | |||||
regex.MustCompile(`-vsdoc\.js$`), | |||||
regex.MustCompile(`\.intellisense\.js$`), | |||||
regex.MustCompile(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`), | |||||
regex.MustCompile(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`), | |||||
regex.MustCompile(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`), | |||||
regex.MustCompile(`^[Pp]ackages\/.+\.\d+\/`), | |||||
regex.MustCompile(`(^|/)extjs/.*?\.js$`), | |||||
regex.MustCompile(`(^|/)extjs/.*?\.xml$`), | |||||
regex.MustCompile(`(^|/)extjs/.*?\.txt$`), | |||||
regex.MustCompile(`(^|/)extjs/.*?\.html$`), | |||||
regex.MustCompile(`(^|/)extjs/.*?\.properties$`), | |||||
regex.MustCompile(`(^|/)extjs/.sencha/`), | |||||
regex.MustCompile(`(^|/)extjs/docs/`), | |||||
regex.MustCompile(`(^|/)extjs/builds/`), | |||||
regex.MustCompile(`(^|/)extjs/cmd/`), | |||||
regex.MustCompile(`(^|/)extjs/examples/`), | |||||
regex.MustCompile(`(^|/)extjs/locale/`), | |||||
regex.MustCompile(`(^|/)extjs/packages/`), | |||||
regex.MustCompile(`(^|/)extjs/plugins/`), | |||||
regex.MustCompile(`(^|/)extjs/resources/`), | |||||
regex.MustCompile(`(^|/)extjs/src/`), | |||||
regex.MustCompile(`(^|/)extjs/welcome/`), | |||||
regex.MustCompile(`(^|/)html5shiv\.js$`), | |||||
regex.MustCompile(`^[Tt]ests?/fixtures/`), | |||||
regex.MustCompile(`^[Ss]pecs?/fixtures/`), | |||||
regex.MustCompile(`(^|/)cordova([^.]*)\.js$`), | |||||
regex.MustCompile(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`), | |||||
regex.MustCompile(`foundation(\..*)?\.js$`), | |||||
regex.MustCompile(`^Vagrantfile$`), | |||||
regex.MustCompile(`.[Dd][Ss]_[Ss]tore$`), | |||||
regex.MustCompile(`^vignettes/`), | |||||
regex.MustCompile(`^inst/extdata/`), | |||||
regex.MustCompile(`octicons.css`), | |||||
regex.MustCompile(`sprockets-octicons.scss`), | |||||
regex.MustCompile(`(^|/)activator$`), | |||||
regex.MustCompile(`(^|/)activator\.bat$`), | |||||
regex.MustCompile(`proguard.pro`), | |||||
regex.MustCompile(`proguard-rules.pro`), | |||||
regex.MustCompile(`^puphpet/`), | |||||
regex.MustCompile(`(^|/)\.google_apis/`), | |||||
regex.MustCompile(`^Jenkinsfile$`), | |||||
} |
go 1.14 | go 1.14 | ||||
require ( | require ( | ||||
github.com/go-enry/go-oniguruma v1.2.0 | |||||
github.com/go-enry/go-oniguruma v1.2.1 | |||||
github.com/stretchr/testify v1.3.0 | github.com/stretchr/testify v1.3.0 | ||||
github.com/toqueteos/trie v1.0.0 // indirect | |||||
gopkg.in/toqueteos/substring.v1 v1.0.2 | |||||
gopkg.in/yaml.v2 v2.2.8 | gopkg.in/yaml.v2 v2.2.8 | ||||
) | ) |
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||||
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs= | github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs= | ||||
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= | github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= | ||||
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo= | |||||
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4= | |||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||||
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= | github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= | ||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= | ||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= | ||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= | ||||
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk= | |||||
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM= | |||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | ||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||||
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE= | |||||
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew= | |||||
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= | gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= | ||||
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= | gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= |
"strings" | "strings" | ||||
"github.com/go-enry/go-enry/v2/data" | "github.com/go-enry/go-enry/v2/data" | ||||
"github.com/go-enry/go-enry/v2/regex" | |||||
) | ) | ||||
const binSniffLen = 8000 | const binSniffLen = 8000 | ||||
var configurationLanguages = map[string]bool{ | |||||
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true, | |||||
var configurationLanguages = map[string]struct{}{ | |||||
"XML": {}, | |||||
"JSON": {}, | |||||
"TOML": {}, | |||||
"YAML": {}, | |||||
"INI": {}, | |||||
"SQL": {}, | |||||
} | } | ||||
// IsConfiguration tells if filename is in one of the configuration languages. | // IsConfiguration tells if filename is in one of the configuration languages. | ||||
// IsDocumentation returns whether or not path is a documentation path. | // IsDocumentation returns whether or not path is a documentation path. | ||||
func IsDocumentation(path string) bool { | func IsDocumentation(path string) bool { | ||||
return data.DocumentationMatchers.Match(path) | |||||
return matchRegexSlice(data.DocumentationMatchers, path) | |||||
} | } | ||||
// IsDotFile returns whether or not path has dot as a prefix. | // IsDotFile returns whether or not path has dot as a prefix. | ||||
// IsVendor returns whether or not path is a vendor path. | // IsVendor returns whether or not path is a vendor path. | ||||
func IsVendor(path string) bool { | func IsVendor(path string) bool { | ||||
return data.VendorMatchers.Match(path) | |||||
return matchRegexSlice(data.VendorMatchers, path) | |||||
} | |||||
// IsTest returns whether or not path is a test path. | |||||
func IsTest(path string) bool { | |||||
return matchRegexSlice(data.TestMatchers, path) | |||||
} | } | ||||
// IsBinary detects if data is a binary value based on: | // IsBinary detects if data is a binary value based on: | ||||
return "#cccccc" | return "#cccccc" | ||||
} | } | ||||
func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool { | |||||
for _, expr := range exprs { | |||||
if expr.MatchString(str) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
// IsGenerated returns whether the file with the given path and content is a | |||||
// generated file. | |||||
func IsGenerated(path string, content []byte) bool { | |||||
ext := strings.ToLower(filepath.Ext(path)) | |||||
if _, ok := data.GeneratedCodeExtensions[ext]; ok { | |||||
return true | |||||
} | |||||
for _, m := range data.GeneratedCodeNameMatchers { | |||||
if m(path) { | |||||
return true | |||||
} | |||||
} | |||||
path = strings.ToLower(path) | |||||
for _, m := range data.GeneratedCodeMatchers { | |||||
if m(path, ext, content) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} |
#include "chelper.h" | #include "chelper.h" | ||||
int NewOnigRegex( char *pattern, int pattern_length, int option, | int NewOnigRegex( char *pattern, int pattern_length, int option, | ||||
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) { | |||||
OnigRegex *regex, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) { | |||||
int ret = ONIG_NORMAL; | int ret = ONIG_NORMAL; | ||||
int error_msg_len = 0; | int error_msg_len = 0; | ||||
memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); | memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); | ||||
*region = onig_region_new(); | |||||
ret = onig_new(regex, pattern_start, pattern_end, (OnigOptionType)(option), *encoding, OnigDefaultSyntax, *error_info); | ret = onig_new(regex, pattern_start, pattern_end, (OnigOptionType)(option), *encoding, OnigDefaultSyntax, *error_info); | ||||
if (ret != ONIG_NORMAL) { | if (ret != ONIG_NORMAL) { | ||||
} | } | ||||
int SearchOnigRegex( void *str, int str_length, int offset, int option, | int SearchOnigRegex( void *str, int str_length, int offset, int option, | ||||
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) { | |||||
OnigRegex regex, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) { | |||||
int ret = ONIG_MISMATCH; | int ret = ONIG_MISMATCH; | ||||
int error_msg_len = 0; | int error_msg_len = 0; | ||||
OnigRegion *region; | |||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
struct timeval tim1, tim2; | struct timeval tim1, tim2; | ||||
long t; | long t; | ||||
gettimeofday(&tim1, NULL); | gettimeofday(&tim1, NULL); | ||||
#endif | #endif | ||||
region = onig_region_new(); | |||||
ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option); | ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option); | ||||
if (ret < 0 && error_buffer != NULL) { | if (ret < 0 && error_buffer != NULL) { | ||||
error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info); | error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info); | ||||
*numCaptures = count; | *numCaptures = count; | ||||
} | } | ||||
onig_region_free(region, 1); | |||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
gettimeofday(&tim2, NULL); | gettimeofday(&tim2, NULL); | ||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; | t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; | ||||
} | } | ||||
int MatchOnigRegex(void *str, int str_length, int offset, int option, | int MatchOnigRegex(void *str, int str_length, int offset, int option, | ||||
OnigRegex regex, OnigRegion *region) { | |||||
OnigRegex regex) { | |||||
int ret = ONIG_MISMATCH; | int ret = ONIG_MISMATCH; | ||||
int error_msg_len = 0; | int error_msg_len = 0; | ||||
OnigRegion *region; | |||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
struct timeval tim1, tim2; | struct timeval tim1, tim2; | ||||
long t; | long t; | ||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
gettimeofday(&tim1, NULL); | gettimeofday(&tim1, NULL); | ||||
#endif | #endif | ||||
region = onig_region_new(); | |||||
ret = onig_match(regex, str_start, str_end, search_start, region, option); | ret = onig_match(regex, str_start, str_end, search_start, region, option); | ||||
onig_region_free(region, 1); | |||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
gettimeofday(&tim2, NULL); | gettimeofday(&tim2, NULL); | ||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; | t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; | ||||
} | } | ||||
int LookupOnigCaptureByName(char *name, int name_length, | int LookupOnigCaptureByName(char *name, int name_length, | ||||
OnigRegex regex, OnigRegion *region) { | |||||
OnigRegex regex) { | |||||
int ret = ONIGERR_UNDEFINED_NAME_REFERENCE; | int ret = ONIGERR_UNDEFINED_NAME_REFERENCE; | ||||
OnigRegion *region; | |||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
struct timeval tim1, tim2; | struct timeval tim1, tim2; | ||||
long t; | long t; | ||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
gettimeofday(&tim1, NULL); | gettimeofday(&tim1, NULL); | ||||
#endif | #endif | ||||
region = onig_region_new(); | |||||
ret = onig_name_to_backref_number(regex, name_start, name_end, region); | ret = onig_name_to_backref_number(regex, name_start, name_end, region); | ||||
onig_region_free(region, 1); | |||||
#ifdef BENCHMARK_CHELP | #ifdef BENCHMARK_CHELP | ||||
gettimeofday(&tim2, NULL); | gettimeofday(&tim2, NULL); | ||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; | t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; | ||||
onig_foreach_name(reg, name_callback, (void* )&groupInfo); | onig_foreach_name(reg, name_callback, (void* )&groupInfo); | ||||
return groupInfo.bufferOffset; | return groupInfo.bufferOffset; | ||||
} | } | ||||
#include <oniguruma.h> | #include <oniguruma.h> | ||||
extern int NewOnigRegex( char *pattern, int pattern_length, int option, | extern int NewOnigRegex( char *pattern, int pattern_length, int option, | ||||
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer); | |||||
OnigRegex *regex, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer); | |||||
extern int SearchOnigRegex( void *str, int str_length, int offset, int option, | extern int SearchOnigRegex( void *str, int str_length, int offset, int option, | ||||
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures); | |||||
OnigRegex regex, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures); | |||||
extern int MatchOnigRegex( void *str, int str_length, int offset, int option, | extern int MatchOnigRegex( void *str, int str_length, int offset, int option, | ||||
OnigRegex regex, OnigRegion *region); | |||||
OnigRegex regex); | |||||
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex, OnigRegion *region); | |||||
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex); | |||||
extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers); | extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers); |
"errors" | "errors" | ||||
"fmt" | "fmt" | ||||
"io" | "io" | ||||
"log" | |||||
"runtime" | "runtime" | ||||
"strconv" | "strconv" | ||||
"sync" | "sync" | ||||
"unsafe" | "unsafe" | ||||
) | ) | ||||
type strRange []int | |||||
const numMatchStartSize = 4 | const numMatchStartSize = 4 | ||||
const numReadBufferStartSize = 256 | const numReadBufferStartSize = 256 | ||||
var mutex sync.Mutex | var mutex sync.Mutex | ||||
type MatchData struct { | |||||
count int | |||||
indexes [][]int32 | |||||
} | |||||
type NamedGroupInfo map[string]int | type NamedGroupInfo map[string]int | ||||
type Regexp struct { | type Regexp struct { | ||||
pattern string | |||||
regex C.OnigRegex | |||||
region *C.OnigRegion | |||||
encoding C.OnigEncoding | |||||
errorInfo *C.OnigErrorInfo | |||||
errorBuf *C.char | |||||
matchData *MatchData | |||||
pattern string | |||||
regex C.OnigRegex | |||||
encoding C.OnigEncoding | |||||
errorInfo *C.OnigErrorInfo | |||||
errorBuf *C.char | |||||
numCaptures int32 | |||||
namedGroupInfo NamedGroupInfo | namedGroupInfo NamedGroupInfo | ||||
} | } | ||||
// NewRegexp creates and initializes a new Regexp with the given pattern and option. | // NewRegexp creates and initializes a new Regexp with the given pattern and option. | ||||
func NewRegexp(pattern string, option int) (re *Regexp, err error) { | |||||
func NewRegexp(pattern string, option int) (*Regexp, error) { | |||||
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) | return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) | ||||
} | } | ||||
// NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII. | // NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII. | ||||
func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) { | |||||
func NewRegexpASCII(pattern string, option int) (*Regexp, error) { | |||||
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option) | return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option) | ||||
} | } | ||||
func initRegexp(re *Regexp, option int) (*Regexp, error) { | func initRegexp(re *Regexp, option int) (*Regexp, error) { | ||||
var err error | |||||
patternCharPtr := C.CString(re.pattern) | patternCharPtr := C.CString(re.pattern) | ||||
defer C.free(unsafe.Pointer(patternCharPtr)) | defer C.free(unsafe.Pointer(patternCharPtr)) | ||||
mutex.Lock() | mutex.Lock() | ||||
defer mutex.Unlock() | defer mutex.Unlock() | ||||
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) | |||||
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.encoding, &re.errorInfo, &re.errorBuf) | |||||
if errorCode != C.ONIG_NORMAL { | if errorCode != C.ONIG_NORMAL { | ||||
err = errors.New(C.GoString(re.errorBuf)) | |||||
} else { | |||||
err = nil | |||||
numCapturesInPattern := int(C.onig_number_of_captures(re.regex)) + 1 | |||||
re.matchData = &MatchData{} | |||||
re.matchData.indexes = make([][]int32, numMatchStartSize) | |||||
for i := 0; i < numMatchStartSize; i++ { | |||||
re.matchData.indexes[i] = make([]int32, numCapturesInPattern*2) | |||||
} | |||||
re.namedGroupInfo = re.getNamedGroupInfo() | |||||
runtime.SetFinalizer(re, (*Regexp).Free) | |||||
return re, errors.New(C.GoString(re.errorBuf)) | |||||
} | } | ||||
return re, err | |||||
re.numCaptures = int32(C.onig_number_of_captures(re.regex)) + 1 | |||||
re.namedGroupInfo = re.getNamedGroupInfo() | |||||
runtime.SetFinalizer(re, (*Regexp).Free) | |||||
return re, nil | |||||
} | } | ||||
func Compile(str string) (*Regexp, error) { | func Compile(str string) (*Regexp, error) { | ||||
if error != nil { | if error != nil { | ||||
panic("regexp: compiling " + str + ": " + error.Error()) | panic("regexp: compiling " + str + ": " + error.Error()) | ||||
} | } | ||||
return regexp | return regexp | ||||
} | } | ||||
if error != nil { | if error != nil { | ||||
panic("regexp: compiling " + str + ": " + error.Error()) | panic("regexp: compiling " + str + ": " + error.Error()) | ||||
} | } | ||||
return regexp | return regexp | ||||
} | } | ||||
if error != nil { | if error != nil { | ||||
panic("regexp: compiling " + str + ": " + error.Error()) | panic("regexp: compiling " + str + ": " + error.Error()) | ||||
} | } | ||||
return regexp | return regexp | ||||
} | } | ||||
C.onig_free(re.regex) | C.onig_free(re.regex) | ||||
re.regex = nil | re.regex = nil | ||||
} | } | ||||
if re.region != nil { | |||||
C.onig_region_free(re.region, 1) | |||||
re.region = nil | |||||
} | |||||
mutex.Unlock() | mutex.Unlock() | ||||
if re.errorInfo != nil { | if re.errorInfo != nil { | ||||
C.free(unsafe.Pointer(re.errorInfo)) | C.free(unsafe.Pointer(re.errorInfo)) | ||||
} | } | ||||
} | } | ||||
func (re *Regexp) getNamedGroupInfo() (namedGroupInfo NamedGroupInfo) { | |||||
func (re *Regexp) getNamedGroupInfo() NamedGroupInfo { | |||||
numNamedGroups := int(C.onig_number_of_names(re.regex)) | numNamedGroups := int(C.onig_number_of_names(re.regex)) | ||||
//when any named capture exisits, there is no numbered capture even if there are unnamed captures | |||||
if numNamedGroups > 0 { | |||||
namedGroupInfo = make(map[string]int) | |||||
//try to get the names | |||||
bufferSize := len(re.pattern) * 2 | |||||
nameBuffer := make([]byte, bufferSize) | |||||
groupNumbers := make([]int32, numNamedGroups) | |||||
bufferPtr := unsafe.Pointer(&nameBuffer[0]) | |||||
numbersPtr := unsafe.Pointer(&groupNumbers[0]) | |||||
length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr))) | |||||
if length > 0 { | |||||
namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";")) | |||||
if len(namesAsBytes) != numNamedGroups { | |||||
log.Fatalf("the number of named groups (%d) does not match the number names found (%d)\n", numNamedGroups, len(namesAsBytes)) | |||||
} | |||||
for i, nameAsBytes := range namesAsBytes { | |||||
name := string(nameAsBytes) | |||||
namedGroupInfo[name] = int(groupNumbers[i]) | |||||
} | |||||
} else { | |||||
log.Fatalf("could not get the capture group names from %q", re.String()) | |||||
} | |||||
// when any named capture exists, there is no numbered capture even if | |||||
// there are unnamed captures. | |||||
if numNamedGroups == 0 { | |||||
return nil | |||||
} | } | ||||
return | |||||
} | |||||
func (re *Regexp) groupNameToId(name string) (id int) { | |||||
if re.namedGroupInfo == nil { | |||||
id = ONIGERR_UNDEFINED_NAME_REFERENCE | |||||
} else { | |||||
id = re.namedGroupInfo[name] | |||||
namedGroupInfo := make(map[string]int) | |||||
//try to get the names | |||||
bufferSize := len(re.pattern) * 2 | |||||
nameBuffer := make([]byte, bufferSize) | |||||
groupNumbers := make([]int32, numNamedGroups) | |||||
bufferPtr := unsafe.Pointer(&nameBuffer[0]) | |||||
numbersPtr := unsafe.Pointer(&groupNumbers[0]) | |||||
length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr))) | |||||
if length == 0 { | |||||
panic(fmt.Errorf("could not get the capture group names from %q", re.String())) | |||||
} | } | ||||
return | |||||
} | |||||
func (re *Regexp) processMatch(numCaptures int) (match []int32) { | |||||
if numCaptures <= 0 { | |||||
panic("cannot have 0 captures when processing a match") | |||||
namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";")) | |||||
if len(namesAsBytes) != numNamedGroups { | |||||
panic(fmt.Errorf( | |||||
"the number of named groups (%d) does not match the number names found (%d)", | |||||
numNamedGroups, len(namesAsBytes), | |||||
)) | |||||
} | |||||
for i, nameAsBytes := range namesAsBytes { | |||||
name := string(nameAsBytes) | |||||
namedGroupInfo[name] = int(groupNumbers[i]) | |||||
} | } | ||||
matchData := re.matchData | |||||
return matchData.indexes[matchData.count][:numCaptures*2] | |||||
} | |||||
func (re *Regexp) ClearMatchData() { | |||||
matchData := re.matchData | |||||
matchData.count = 0 | |||||
return namedGroupInfo | |||||
} | } | ||||
func (re *Regexp) find(b []byte, n int, offset int) (match []int) { | |||||
func (re *Regexp) find(b []byte, n int, offset int) []int { | |||||
match := make([]int, re.numCaptures*2) | |||||
if n == 0 { | if n == 0 { | ||||
b = []byte{0} | b = []byte{0} | ||||
} | } | ||||
ptr := unsafe.Pointer(&b[0]) | |||||
matchData := re.matchData | |||||
capturesPtr := unsafe.Pointer(&(matchData.indexes[matchData.count][0])) | |||||
numCaptures := int32(0) | |||||
bytesPtr := unsafe.Pointer(&b[0]) | |||||
// captures contains two pairs of ints, start and end, so we need list | |||||
// twice the size of the capture groups. | |||||
captures := make([]C.int, re.numCaptures*2) | |||||
capturesPtr := unsafe.Pointer(&captures[0]) | |||||
var numCaptures int32 | |||||
numCapturesPtr := unsafe.Pointer(&numCaptures) | numCapturesPtr := unsafe.Pointer(&numCaptures) | ||||
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr))) | |||||
if pos >= 0 { | |||||
if numCaptures <= 0 { | |||||
panic("cannot have 0 captures when processing a match") | |||||
} | |||||
match2 := matchData.indexes[matchData.count][:numCaptures*2] | |||||
match = make([]int, len(match2)) | |||||
for i := range match2 { | |||||
match[i] = int(match2[i]) | |||||
} | |||||
numCapturesInPattern := int32(C.onig_number_of_captures(re.regex)) + 1 | |||||
if numCapturesInPattern != numCaptures { | |||||
log.Fatalf("expected %d captures but got %d\n", numCapturesInPattern, numCaptures) | |||||
} | |||||
pos := int(C.SearchOnigRegex( | |||||
bytesPtr, C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), | |||||
re.regex, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr), | |||||
)) | |||||
if pos < 0 { | |||||
return nil | |||||
} | |||||
if numCaptures <= 0 { | |||||
panic("cannot have 0 captures when processing a match") | |||||
} | |||||
if re.numCaptures != numCaptures { | |||||
panic(fmt.Errorf("expected %d captures but got %d", re.numCaptures, numCaptures)) | |||||
} | |||||
for i := range captures { | |||||
match[i] = int(captures[i]) | |||||
} | } | ||||
return | |||||
return match | |||||
} | } | ||||
func getCapture(b []byte, beg int, end int) []byte { | func getCapture(b []byte, beg int, end int) []byte { | ||||
if beg < 0 || end < 0 { | if beg < 0 || end < 0 { | ||||
return nil | return nil | ||||
} | } | ||||
return b[beg:end] | return b[beg:end] | ||||
} | } | ||||
func (re *Regexp) match(b []byte, n int, offset int) bool { | func (re *Regexp) match(b []byte, n int, offset int) bool { | ||||
re.ClearMatchData() | |||||
if n == 0 { | if n == 0 { | ||||
b = []byte{0} | b = []byte{0} | ||||
} | } | ||||
ptr := unsafe.Pointer(&b[0]) | |||||
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(nil), (*C.int)(nil))) | |||||
bytesPtr := unsafe.Pointer(&b[0]) | |||||
pos := int(C.SearchOnigRegex( | |||||
bytesPtr, C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), | |||||
re.regex, re.errorInfo, nil, nil, nil, | |||||
)) | |||||
return pos >= 0 | return pos >= 0 | ||||
} | } | ||||
func (re *Regexp) findAll(b []byte, n int) (matches [][]int) { | |||||
re.ClearMatchData() | |||||
func (re *Regexp) findAll(b []byte, n int) [][]int { | |||||
if n < 0 { | if n < 0 { | ||||
n = len(b) | n = len(b) | ||||
} | } | ||||
matchData := re.matchData | |||||
offset := 0 | |||||
capture := make([][]int, 0, numMatchStartSize) | |||||
var offset int | |||||
for offset <= n { | for offset <= n { | ||||
if matchData.count >= len(matchData.indexes) { | |||||
length := len(matchData.indexes[0]) | |||||
matchData.indexes = append(matchData.indexes, make([]int32, length)) | |||||
} | |||||
if match := re.find(b, n, offset); len(match) > 0 { | |||||
matchData.count += 1 | |||||
//move offset to the ending index of the current match and prepare to find the next non-overlapping match | |||||
offset = match[1] | |||||
//if match[0] == match[1], it means the current match does not advance the search. we need to exit the loop to avoid getting stuck here. | |||||
if match[0] == match[1] { | |||||
if offset < n && offset >= 0 { | |||||
//there are more bytes, so move offset by a word | |||||
_, width := utf8.DecodeRune(b[offset:]) | |||||
offset += width | |||||
} else { | |||||
//search is over, exit loop | |||||
break | |||||
} | |||||
} | |||||
} else { | |||||
match := re.find(b, n, offset) | |||||
if match == nil { | |||||
break | break | ||||
} | } | ||||
} | |||||
matches2 := matchData.indexes[:matchData.count] | |||||
matches = make([][]int, len(matches2)) | |||||
for i, v := range matches2 { | |||||
matches[i] = make([]int, len(v)) | |||||
for j, v2 := range v { | |||||
matches[i][j] = int(v2) | |||||
capture = append(capture, match) | |||||
// move offset to the ending index of the current match and prepare to | |||||
// find the next non-overlapping match. | |||||
offset = match[1] | |||||
// if match[0] == match[1], it means the current match does not advance | |||||
// the search. we need to exit the loop to avoid getting stuck here. | |||||
if match[0] == match[1] { | |||||
if offset < n && offset >= 0 { | |||||
//there are more bytes, so move offset by a word | |||||
_, width := utf8.DecodeRune(b[offset:]) | |||||
offset += width | |||||
} else { | |||||
//search is over, exit loop | |||||
break | |||||
} | |||||
} | } | ||||
} | } | ||||
return | |||||
return capture | |||||
} | } | ||||
func (re *Regexp) FindIndex(b []byte) []int { | func (re *Regexp) FindIndex(b []byte) []int { | ||||
re.ClearMatchData() | |||||
match := re.find(b, len(b), 0) | match := re.find(b, len(b), 0) | ||||
if len(match) == 0 { | if len(match) == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
return match[:2] | return match[:2] | ||||
} | } | ||||
if loc == nil { | if loc == nil { | ||||
return nil | return nil | ||||
} | } | ||||
return getCapture(b, loc[0], loc[1]) | return getCapture(b, loc[0], loc[1]) | ||||
} | } | ||||
func (re *Regexp) FindString(s string) string { | func (re *Regexp) FindString(s string) string { | ||||
b := []byte(s) | |||||
mb := re.Find(b) | |||||
mb := re.Find([]byte(s)) | |||||
if mb == nil { | if mb == nil { | ||||
return "" | return "" | ||||
} | } | ||||
return string(mb) | return string(mb) | ||||
} | } | ||||
func (re *Regexp) FindStringIndex(s string) []int { | func (re *Regexp) FindStringIndex(s string) []int { | ||||
b := []byte(s) | |||||
return re.FindIndex(b) | |||||
return re.FindIndex([]byte(s)) | |||||
} | } | ||||
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { | func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { | ||||
if len(matches) == 0 { | if len(matches) == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
return matches | return matches | ||||
} | } | ||||
if matches == nil { | if matches == nil { | ||||
return nil | return nil | ||||
} | } | ||||
matchBytes := make([][]byte, 0, len(matches)) | matchBytes := make([][]byte, 0, len(matches)) | ||||
for _, match := range matches { | for _, match := range matches { | ||||
matchBytes = append(matchBytes, getCapture(b, match[0], match[1])) | matchBytes = append(matchBytes, getCapture(b, match[0], match[1])) | ||||
} | } | ||||
return matchBytes | return matchBytes | ||||
} | } | ||||
if matches == nil { | if matches == nil { | ||||
return nil | return nil | ||||
} | } | ||||
matchStrings := make([]string, 0, len(matches)) | matchStrings := make([]string, 0, len(matches)) | ||||
for _, match := range matches { | for _, match := range matches { | ||||
m := getCapture(b, match[0], match[1]) | m := getCapture(b, match[0], match[1]) | ||||
matchStrings = append(matchStrings, string(m)) | matchStrings = append(matchStrings, string(m)) | ||||
} | } | ||||
} | } | ||||
return matchStrings | return matchStrings | ||||
} | } | ||||
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { | func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { | ||||
b := []byte(s) | |||||
return re.FindAllIndex(b, n) | |||||
} | |||||
func (re *Regexp) findSubmatchIndex(b []byte) (match []int) { | |||||
re.ClearMatchData() | |||||
match = re.find(b, len(b), 0) | |||||
return | |||||
return re.FindAllIndex([]byte(s), n) | |||||
} | } | ||||
func (re *Regexp) FindSubmatchIndex(b []byte) []int { | func (re *Regexp) FindSubmatchIndex(b []byte) []int { | ||||
match := re.findSubmatchIndex(b) | |||||
match := re.find(b, len(b), 0) | |||||
if len(match) == 0 { | if len(match) == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
return match | return match | ||||
} | } | ||||
func (re *Regexp) FindSubmatch(b []byte) [][]byte { | func (re *Regexp) FindSubmatch(b []byte) [][]byte { | ||||
match := re.findSubmatchIndex(b) | |||||
match := re.FindSubmatchIndex(b) | |||||
if match == nil { | if match == nil { | ||||
return nil | return nil | ||||
} | } | ||||
length := len(match) / 2 | length := len(match) / 2 | ||||
if length == 0 { | if length == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
results := make([][]byte, 0, length) | results := make([][]byte, 0, length) | ||||
for i := 0; i < length; i++ { | for i := 0; i < length; i++ { | ||||
results = append(results, getCapture(b, match[2*i], match[2*i+1])) | results = append(results, getCapture(b, match[2*i], match[2*i+1])) | ||||
} | } | ||||
return results | return results | ||||
} | } | ||||
func (re *Regexp) FindStringSubmatch(s string) []string { | func (re *Regexp) FindStringSubmatch(s string) []string { | ||||
b := []byte(s) | b := []byte(s) | ||||
match := re.findSubmatchIndex(b) | |||||
match := re.FindSubmatchIndex(b) | |||||
if match == nil { | if match == nil { | ||||
return nil | return nil | ||||
} | } | ||||
length := len(match) / 2 | length := len(match) / 2 | ||||
if length == 0 { | if length == 0 { | ||||
return nil | return nil | ||||
results = append(results, string(cap)) | results = append(results, string(cap)) | ||||
} | } | ||||
} | } | ||||
return results | return results | ||||
} | } | ||||
func (re *Regexp) FindStringSubmatchIndex(s string) []int { | func (re *Regexp) FindStringSubmatchIndex(s string) []int { | ||||
b := []byte(s) | |||||
return re.FindSubmatchIndex(b) | |||||
return re.FindSubmatchIndex([]byte(s)) | |||||
} | } | ||||
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { | func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { | ||||
if len(matches) == 0 { | if len(matches) == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
return matches | return matches | ||||
} | } | ||||
if len(matches) == 0 { | if len(matches) == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
allCapturedBytes := make([][][]byte, 0, len(matches)) | allCapturedBytes := make([][][]byte, 0, len(matches)) | ||||
for _, match := range matches { | for _, match := range matches { | ||||
length := len(match) / 2 | length := len(match) / 2 | ||||
for i := 0; i < length; i++ { | for i := 0; i < length; i++ { | ||||
capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1])) | capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1])) | ||||
} | } | ||||
allCapturedBytes = append(allCapturedBytes, capturedBytes) | allCapturedBytes = append(allCapturedBytes, capturedBytes) | ||||
} | } | ||||
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { | func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { | ||||
b := []byte(s) | b := []byte(s) | ||||
matches := re.findAll(b, n) | matches := re.findAll(b, n) | ||||
if len(matches) == 0 { | if len(matches) == 0 { | ||||
return nil | return nil | ||||
} | } | ||||
allCapturedStrings := make([][]string, 0, len(matches)) | allCapturedStrings := make([][]string, 0, len(matches)) | ||||
for _, match := range matches { | for _, match := range matches { | ||||
length := len(match) / 2 | length := len(match) / 2 | ||||
capturedStrings = append(capturedStrings, string(cap)) | capturedStrings = append(capturedStrings, string(cap)) | ||||
} | } | ||||
} | } | ||||
allCapturedStrings = append(allCapturedStrings, capturedStrings) | allCapturedStrings = append(allCapturedStrings, capturedStrings) | ||||
} | } | ||||
return allCapturedStrings | return allCapturedStrings | ||||
} | } | ||||
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { | func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { | ||||
b := []byte(s) | |||||
return re.FindAllSubmatchIndex(b, n) | |||||
return re.FindAllSubmatchIndex([]byte(s), n) | |||||
} | } | ||||
func (re *Regexp) Match(b []byte) bool { | func (re *Regexp) Match(b []byte) bool { | ||||
} | } | ||||
func (re *Regexp) MatchString(s string) bool { | func (re *Regexp) MatchString(s string) bool { | ||||
b := []byte(s) | |||||
return re.Match(b) | |||||
return re.Match([]byte(s)) | |||||
} | } | ||||
// NumSubexp returns the number of capture groups declared in the pattern,
// as reported by Oniguruma. The implicit whole-match group is not counted
// (elsewhere in this file the total capture count is computed as
// onig_number_of_captures + 1).
func (re *Regexp) NumSubexp() int {
	return (int)(C.onig_number_of_captures(re.regex))
}
// getNamedCapture returns the bytes captured by the named group, resolving
// the name to a group index via groupNameToId. It panics when the name
// does not map to a valid index into capturedBytes.
func (re *Regexp) getNamedCapture(name []byte, capturedBytes [][]byte) []byte {
	nameStr := string(name)
	capNum := re.groupNameToId(nameStr)
	// An out-of-range index means the name is unknown for this pattern
	// (or the capture set is smaller than expected) — treated as a
	// programming error rather than a soft failure.
	if capNum < 0 || capNum >= len(capturedBytes) {
		panic(fmt.Sprintf("capture group name (%q) has error\n", nameStr))
	}
	return capturedBytes[capNum]
}
func (re *Regexp) getNumberedCapture(num int, capturedBytes [][]byte) []byte { | |||||
//when named capture groups exist, numbered capture groups returns "" | |||||
if re.namedGroupInfo == nil && num <= (len(capturedBytes)-1) && num >= 0 { | |||||
return capturedBytes[num] | |||||
} | |||||
return ([]byte)("") | |||||
} | |||||
// fillCapturedValues expands \k<name> group references in repl using the
// entries of capturedBytes, returning the substituted replacement text.
//
// NOTE(review): this span appears to be unresolved merge residue — both the
// old-style flag initializations and the new `var` declaration are present
// (duplicating inGroupNameMode/inEscapeMode and the loop header), and
// several branch bodies look truncated. Verify against the upstream
// go-oniguruma source before relying on this function as written.
func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte {
	replLen := len(repl)
	newRepl := make([]byte, 0, replLen*3)
	inEscapeMode := false
	inGroupNameMode := false
	groupName := make([]byte, 0, replLen)
	for index := 0; index < replLen; index += 1 {
	var inGroupNameMode, inEscapeMode bool
	for index := 0; index < replLen; index++ {
		ch := repl[index]
		// Inside \k<...>: a stray '<' is ignored.
		if inGroupNameMode && ch == byte('<') {
		} else if inGroupNameMode && ch == byte('>') {
			// '>' terminates the group name: emit that group's bytes.
			inGroupNameMode = false
			groupNameStr := string(groupName)
			capBytes := capturedBytes[groupNameStr]
			capBytes := capturedBytes[string(groupName)]
			newRepl = append(newRepl, capBytes...)
			groupName = groupName[:0] //reset the name
		} else if inGroupNameMode {
		} else if inEscapeMode && ch == byte('k') && (index+1) < replLen && repl[index+1] == byte('<') {
			// "\k<" begins a named-group reference.
			inGroupNameMode = true
			inEscapeMode = false
			index += 1 //bypass the next char '<'
			index++ //bypass the next char '<'
		} else if inEscapeMode {
			// An escape that is not \k<...>: emit it literally.
			newRepl = append(newRepl, '\\')
			newRepl = append(newRepl, ch)
			inEscapeMode = !inEscapeMode
		}
	}
	return newRepl
}
if len(matches) == 0 { | if len(matches) == 0 { | ||||
return src | return src | ||||
} | } | ||||
dest := make([]byte, 0, srcLen) | dest := make([]byte, 0, srcLen) | ||||
for i, match := range matches { | for i, match := range matches { | ||||
length := len(match) / 2 | length := len(match) / 2 | ||||
capturedBytes := make(map[string][]byte) | capturedBytes := make(map[string][]byte) | ||||
if re.namedGroupInfo == nil { | if re.namedGroupInfo == nil { | ||||
for j := 0; j < length; j++ { | for j := 0; j < length; j++ { | ||||
capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1]) | capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1]) | ||||
capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1]) | capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1]) | ||||
} | } | ||||
} | } | ||||
matchBytes := getCapture(src, match[0], match[1]) | matchBytes := getCapture(src, match[0], match[1]) | ||||
newRepl := replFunc(repl, matchBytes, capturedBytes) | newRepl := replFunc(repl, matchBytes, capturedBytes) | ||||
prevEnd := 0 | prevEnd := 0 | ||||
prevMatch := matches[i-1][:2] | prevMatch := matches[i-1][:2] | ||||
prevEnd = prevMatch[1] | prevEnd = prevMatch[1] | ||||
} | } | ||||
if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen { | if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen { | ||||
dest = append(dest, src[prevEnd:match[0]]...) | dest = append(dest, src[prevEnd:match[0]]...) | ||||
} | } | ||||
dest = append(dest, newRepl...) | dest = append(dest, newRepl...) | ||||
} | } | ||||
lastEnd := matches[len(matches)-1][1] | lastEnd := matches[len(matches)-1][1] | ||||
if lastEnd < srcLen && lastEnd >= 0 { | if lastEnd < srcLen && lastEnd >= 0 { | ||||
dest = append(dest, src[lastEnd:]...) | dest = append(dest, src[lastEnd:]...) | ||||
} | } | ||||
return dest | return dest | ||||
} | } | ||||
} | } | ||||
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { | func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { | ||||
return re.replaceAll(src, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { | |||||
return re.replaceAll(src, nil, func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { | |||||
return repl(matchBytes) | return repl(matchBytes) | ||||
}) | }) | ||||
} | } | ||||
} | } | ||||
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { | func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { | ||||
srcB := []byte(src) | |||||
destB := re.replaceAll(srcB, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { | |||||
return string(re.replaceAll([]byte(src), nil, func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { | |||||
return []byte(repl(string(matchBytes))) | return []byte(repl(string(matchBytes))) | ||||
}) | |||||
return string(destB) | |||||
})) | |||||
} | } | ||||
// String returns the source text of the pattern this Regexp was compiled
// from, satisfying fmt.Stringer.
func (re *Regexp) String() string {
	return re.pattern
}
// growBuffer returns a slice whose capacity can hold n more bytes past
// offset, copying the first offset bytes into any newly allocated buffer.
// When the existing capacity suffices, b is returned unchanged.
//
// The span contained two merged signatures (old grow_buffer and new
// growBuffer); the Go-idiomatic growBuffer name is kept, matching the call
// site in fromReader's updated body.
func growBuffer(b []byte, offset int, n int) []byte {
	if offset+n > cap(b) {
		// Grow geometrically so repeated appends stay amortized O(1).
		buf := make([]byte, 2*cap(b)+n)
		copy(buf, b[:offset])
		return buf
	}
	return b
}
func fromReader(r io.RuneReader) []byte { | func fromReader(r io.RuneReader) []byte { | ||||
b := make([]byte, numReadBufferStartSize) | b := make([]byte, numReadBufferStartSize) | ||||
offset := 0 | |||||
var err error = nil | |||||
for err == nil { | |||||
var offset int | |||||
for { | |||||
rune, runeWidth, err := r.ReadRune() | rune, runeWidth, err := r.ReadRune() | ||||
if err == nil { | |||||
b = grow_buffer(b, offset, runeWidth) | |||||
writeWidth := utf8.EncodeRune(b[offset:], rune) | |||||
if runeWidth != writeWidth { | |||||
panic("reading rune width not equal to the written rune width") | |||||
} | |||||
offset += writeWidth | |||||
} else { | |||||
if err != nil { | |||||
break | break | ||||
} | } | ||||
b = growBuffer(b, offset, runeWidth) | |||||
writeWidth := utf8.EncodeRune(b[offset:], rune) | |||||
if runeWidth != writeWidth { | |||||
panic("reading rune width not equal to the written rune width") | |||||
} | |||||
offset += writeWidth | |||||
} | } | ||||
return b[:offset] | return b[:offset] | ||||
} | } | ||||
if err != nil { | if err != nil { | ||||
return false, err | return false, err | ||||
} | } | ||||
return re.MatchString(s), nil | return re.MatchString(s), nil | ||||
} | } | ||||
func (re *Regexp) Gsub(src, repl string) string { | func (re *Regexp) Gsub(src, repl string) string { | ||||
srcBytes := ([]byte)(src) | |||||
replBytes := ([]byte)(repl) | |||||
replaced := re.replaceAll(srcBytes, replBytes, fillCapturedValues) | |||||
return string(replaced) | |||||
return string(re.replaceAll([]byte(src), []byte(repl), fillCapturedValues)) | |||||
} | } | ||||
func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string { | func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string { | ||||
srcBytes := ([]byte)(src) | |||||
replaced := re.replaceAll(srcBytes, nil, func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte { | |||||
capturedStrings := make(map[string]string) | |||||
for name, capBytes := range capturedBytes { | |||||
capturedStrings[name] = string(capBytes) | |||||
} | |||||
matchString := string(matchBytes) | |||||
return ([]byte)(replFunc(matchString, capturedStrings)) | |||||
}) | |||||
replaced := re.replaceAll([]byte(src), nil, | |||||
func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte { | |||||
capturedStrings := make(map[string]string) | |||||
for name, capBytes := range capturedBytes { | |||||
capturedStrings[name] = string(capBytes) | |||||
} | |||||
matchString := string(matchBytes) | |||||
return ([]byte)(replFunc(matchString, capturedStrings)) | |||||
}, | |||||
) | |||||
return string(replaced) | return string(replaced) | ||||
} | } |
Copyright (c) 2013 Caleb Spare | |||||
MIT License | |||||
Permission is hereby granted, free of charge, to any person obtaining | |||||
a copy of this software and associated documentation files (the | |||||
"Software"), to deal in the Software without restriction, including | |||||
without limitation the rights to use, copy, modify, merge, publish, | |||||
distribute, sublicense, and/or sell copies of the Software, and to | |||||
permit persons to whom the Software is furnished to do so, subject to | |||||
the following conditions: | |||||
The above copyright notice and this permission notice shall be | |||||
included in all copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | |||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
# Trie | |||||
[![GoDoc](http://godoc.org/github.com/toqueteos/trie?status.png)](http://godoc.org/github.com/toqueteos/trie) | |||||
This is a fork of https://github.com/cespare/go-trie that adds the `PrefixIndex` method. | |||||
It's required for https://github.com/toqueteos/substring. |
module github.com/toqueteos/trie |
// Package trie is an implementation of a trie (prefix tree) data structure over byte slices. It provides a | |||||
// small and simple API for usage as a set as well as a 'Node' API for walking the trie. | |||||
package trie | |||||
// A Trie is a prefix tree over byte slices.
type Trie struct {
	root *Node
}

// New constructs a new, empty Trie ready for use.
func New() *Trie {
	return &Trie{root: &Node{}}
}

// Insert puts b into the Trie. It returns true if the element was not
// previously in t.
func (t *Trie) Insert(b []byte) bool {
	node := t.root
	for _, c := range b {
		child, ok := node.Walk(c)
		if !ok {
			// Create the missing edge on the way down.
			child = &Node{}
			node.branches[c] = child
			node.hasChildren = true
		}
		node = child
	}
	if node.terminal {
		return false // already a member
	}
	node.terminal = true
	return true
}

// Contains checks t for membership of b.
func (t *Trie) Contains(b []byte) bool {
	node := t.root
	for _, c := range b {
		child, ok := node.Walk(c)
		if !ok {
			return false
		}
		node = child
	}
	return node.terminal
}

// PrefixIndex walks through b until a stored element is found (a terminal
// node) or b is exhausted. It returns the index of the final byte of the
// matched prefix, or -1 when no element of the trie prefixes b.
func (t *Trie) PrefixIndex(b []byte) int {
	idx := 0
	node := t.root
	for _, c := range b {
		child, ok := node.Walk(c)
		if !ok {
			return -1
		}
		if child.terminal {
			return idx
		}
		node = child
		idx++
	}
	if node.terminal {
		return idx
	}
	return -1
}

// Root returns the root node of a Trie. A valid Trie (i.e., constructed
// with New) always has a non-nil root node.
func (t *Trie) Root() *Node {
	return t.root
}

// A Node represents a logical vertex in the trie structure.
type Node struct {
	branches    [256]*Node
	terminal    bool
	hasChildren bool
}

// Walk returns the node reached along edge c, if one exists. The ok value
// indicates whether such a node exists.
func (n *Node) Walk(c byte) (next *Node, ok bool) {
	next = n.branches[c]
	return next, next != nil
}

// Terminal indicates whether n is terminal in the trie (that is, whether
// the path from the root to n represents an element in the set). For
// instance, if the root node is terminal, then []byte{} is in the trie.
func (n *Node) Terminal() bool {
	return n.terminal
}

// Leaf indicates whether n is a leaf node in the trie (that is, whether it
// has no children). A leaf node must be terminal (else it would not
// exist). Logically, if n is a leaf node then the []byte represented by
// the path from the root to n is not a proper prefix of any element of the
// trie.
func (n *Node) Leaf() bool {
	return !n.hasChildren
}
# Compiled Object files, Static and Dynamic libs (Shared Objects) | |||||
*.o | |||||
*.a | |||||
*.so | |||||
# Folders | |||||
_obj | |||||
_test | |||||
# Architecture specific extensions/prefixes | |||||
*.[568vq] | |||||
[568vq].out | |||||
*.cgo1.go | |||||
*.cgo2.c | |||||
_cgo_defun.c | |||||
_cgo_gotypes.go | |||||
_cgo_export.* | |||||
_testmain.go | |||||
*.exe | |||||
*.test | |||||
*.prof |
language: go | |||||
go: | |||||
- 1.2 | |||||
- 1.3 | |||||
- 1.4 | |||||
- tip | |||||
script: | |||||
- go get launchpad.net/gocheck | |||||
- go test |
The MIT License (MIT) | |||||
Copyright (c) 2015 Carlos Cobo | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
of this software and associated documentation files (the "Software"), to deal | |||||
in the Software without restriction, including without limitation the rights | |||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
copies of the Software, and to permit persons to whom the Software is | |||||
furnished to do so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. | |||||
# substring [![Build Status](https://travis-ci.org/toqueteos/substring.png?branch=master)](https://travis-ci.org/toqueteos/substring) [![GoDoc](http://godoc.org/github.com/toqueteos/substring?status.png)](http://godoc.org/github.com/toqueteos/substring) [![GitHub release](https://img.shields.io/github/release/toqueteos/substring.svg)](https://github.com/toqueteos/substring/releases) | |||||
Simple and composable alternative to [regexp](http://golang.org/pkg/regexp/) package for fast substring searches. | |||||
## Installation | |||||
The recommended way to install substring | |||||
``` | |||||
go get -t gopkg.in/toqueteos/substring.v1 | |||||
``` | |||||
The `-t` flag is for fetching [gocheck](https://gopkg.in/check.v1), required for tests and benchmarks. | |||||
## Examples | |||||
A basic example with two matchers: | |||||
```go | |||||
package main | |||||
import ( | |||||
"fmt" | |||||
"regexp" | |||||
"gopkg.in/toqueteos/substring.v1" | |||||
) | |||||
func main() { | |||||
m1 := substring.After("assets/", substring.Or( | |||||
substring.Has("jquery"), | |||||
substring.Has("angular"), | |||||
substring.Suffixes(".js", ".css", ".html"), | |||||
)) | |||||
fmt.Println(m1.Match("assets/angular/foo/bar")) //Prints: true | |||||
fmt.Println(m1.Match("assets/js/file.js")) //Prints: true | |||||
fmt.Println(m1.Match("assets/style/bar.css")) //Prints: true | |||||
fmt.Println(m1.Match("assets/foo/bar.html")) //Prints: false | |||||
fmt.Println(m1.Match("assets/js/qux.json")) //Prints: false | |||||
fmt.Println(m1.Match("core/file.html")) //Prints: false | |||||
fmt.Println(m1.Match("foobar/that.jsx")) //Prints: false | |||||
m2 := substring.After("vendor/", substring.Suffixes(".css", ".js", ".less")) | |||||
fmt.Println(m2.Match("foo/vendor/bar/qux.css")) //Prints: true | |||||
fmt.Println(m2.Match("foo/var/qux.less")) //Prints: false | |||||
re := regexp.MustCompile(`vendor\/.*\.(css|js|less)$`) | |||||
fmt.Println(re.MatchString("foo/vendor/bar/qux.css")) //Prints: true | |||||
fmt.Println(re.MatchString("foo/var/qux.less")) //Prints: false | |||||
} | |||||
``` | |||||
## How fast? | |||||
Performance varies with your use case, but speedups of 1–2 orders of magnitude over `regexp` are common.
Test it out for yourself by running `go test -check.b`! | |||||
``` | |||||
$ go test -check.b | |||||
PASS: lib_test.go:18: LibSuite.BenchmarkExample1 10000000 221 ns/op | |||||
PASS: lib_test.go:23: LibSuite.BenchmarkExample2 10000000 229 ns/op | |||||
PASS: lib_test.go:28: LibSuite.BenchmarkExample3 10000000 216 ns/op | |||||
PASS: lib_test.go:33: LibSuite.BenchmarkExample4 10000000 208 ns/op | |||||
PASS: lib_test.go:38: LibSuite.BenchmarkExample5 20000000 82.1 ns/op | |||||
PASS: lib_test.go:48: LibSuite.BenchmarkExampleRe1 500000 4136 ns/op | |||||
PASS: lib_test.go:53: LibSuite.BenchmarkExampleRe2 500000 5222 ns/op | |||||
PASS: lib_test.go:58: LibSuite.BenchmarkExampleRe3 500000 5116 ns/op | |||||
PASS: lib_test.go:63: LibSuite.BenchmarkExampleRe4 500000 4020 ns/op | |||||
PASS: lib_test.go:68: LibSuite.BenchmarkExampleRe5 10000000 226 ns/op | |||||
OK: 10 passed | |||||
PASS | |||||
ok gopkg.in/toqueteos/substring.v1 23.471s | |||||
``` | |||||
License | |||||
------- | |||||
MIT, see [LICENSE](LICENSE) |
package substring | |||||
import ( | |||||
"bytes" | |||||
"regexp" | |||||
"github.com/toqueteos/trie" | |||||
) | |||||
// BytesMatcher is the byte-slice matching contract shared by all the
// substring matchers in this file.
type BytesMatcher interface {
	// Match reports whether b satisfies the matcher.
	Match(b []byte) bool
	// MatchIndex returns an end position for the match within b,
	// or -1 when b does not match.
	MatchIndex(b []byte) int
}
// regexp
type regexpBytes struct{ re *regexp.Regexp }

// BytesRegexp builds a matcher backed by a compiled regular expression.
// It panics when pat is not a valid pattern.
func BytesRegexp(pat string) *regexpBytes { return &regexpBytes{re: regexp.MustCompile(pat)} }

// Match reports whether the pattern matches anywhere in b.
func (m *regexpBytes) Match(b []byte) bool { return m.re.Match(b) }

// MatchIndex returns the end offset of the leftmost match in b, or -1
// when there is no match.
func (m *regexpBytes) MatchIndex(b []byte) int {
	loc := m.re.FindIndex(b)
	if loc == nil {
		return -1
	}
	return loc[1]
}
// exact
type exactBytes struct{ pat []byte }

// BytesExact builds a matcher that matches only inputs exactly equal to pat.
func BytesExact(pat string) *exactBytes { return &exactBytes{[]byte(pat)} }

// Match reports whether b is byte-for-byte equal to the pattern.
// The original hand-rolled length check plus byte loop is replaced by the
// equivalent stdlib bytes.Equal.
func (m *exactBytes) Match(b []byte) bool {
	return bytes.Equal(b, m.pat)
}

// MatchIndex returns len(b) on an exact match, or -1 otherwise.
func (m *exactBytes) MatchIndex(b []byte) int {
	if m.Match(b) {
		return len(b)
	}
	return -1
}
// any, search `s` in `.Match(pat)`
type anyBytes struct {
	pat []byte
}

// BytesAny builds a matcher that reports whether its *input* occurs inside
// pat — note the reversed roles compared to hasBytes.
func BytesAny(pat string) *anyBytes { return &anyBytes{[]byte(pat)} }

// Match reports whether b occurs somewhere within the pattern.
func (m *anyBytes) Match(b []byte) bool { return bytes.Contains(m.pat, b) }

// MatchIndex returns the end offset of b's first occurrence within the
// pattern, or -1 when b does not occur.
func (m *anyBytes) MatchIndex(b []byte) int {
	idx := bytes.Index(m.pat, b)
	if idx < 0 {
		return -1
	}
	return idx + len(b)
}
// has, search `pat` in `.Match(s)`
type hasBytes struct {
	pat []byte
}

// BytesHas builds a matcher that reports whether pat occurs inside the input.
func BytesHas(pat string) *hasBytes { return &hasBytes{[]byte(pat)} }

// Match reports whether the pattern occurs somewhere in b.
func (m *hasBytes) Match(b []byte) bool { return bytes.Contains(b, m.pat) }

// MatchIndex returns the end offset of the pattern's first occurrence in
// b, or -1 when it does not occur.
func (m *hasBytes) MatchIndex(b []byte) int {
	idx := bytes.Index(b, m.pat)
	if idx < 0 {
		return -1
	}
	return idx + len(m.pat)
}
// prefix
type prefixBytes struct{ pat []byte }

// BytesPrefix builds a matcher that requires the input to start with pat.
func BytesPrefix(pat string) *prefixBytes { return &prefixBytes{[]byte(pat)} }

// Match reports whether b starts with the pattern.
func (m *prefixBytes) Match(b []byte) bool { return bytes.HasPrefix(b, m.pat) }

// MatchIndex returns the pattern length when b starts with it, or -1.
func (m *prefixBytes) MatchIndex(b []byte) int {
	if !bytes.HasPrefix(b, m.pat) {
		return -1
	}
	return len(m.pat)
}
// prefixes
// prefixesBytes matches inputs that start with any of a fixed set of
// patterns, stored in a trie so the input is scanned once.
type prefixesBytes struct {
	t *trie.Trie
}
// BytesPrefixes builds a matcher over all the given prefix patterns.
func BytesPrefixes(pats ...string) *prefixesBytes {
	t := trie.New()
	for _, pat := range pats {
		t.Insert([]byte(pat))
	}
	return &prefixesBytes{t}
}
// Match reports whether some stored pattern is a prefix of b.
func (m *prefixesBytes) Match(b []byte) bool { return m.t.PrefixIndex(b) >= 0 }
// MatchIndex returns the trie's prefix index for b, or -1 when no stored
// pattern prefixes b.
// NOTE(review): presumably PrefixIndex yields the end offset of the matched
// prefix — confirm against the toqueteos/trie documentation.
func (m *prefixesBytes) MatchIndex(b []byte) int {
	if idx := m.t.PrefixIndex(b); idx >= 0 {
		return idx
	}
	return -1
}
// suffix | |||||
type suffixBytes struct{ pat []byte } | |||||
func BytesSuffix(pat string) *suffixBytes { return &suffixBytes{[]byte(pat)} } | |||||
func (m *suffixBytes) Match(b []byte) bool { return bytes.HasSuffix(b, m.pat) } | |||||
func (m *suffixBytes) MatchIndex(b []byte) int { | |||||
if bytes.HasSuffix(b, m.pat) { | |||||
return len(m.pat) | |||||
} | |||||
return -1 | |||||
} | |||||
// suffixes | |||||
type suffixesBytes struct { | |||||
t *trie.Trie | |||||
} | |||||
func BytesSuffixes(pats ...string) *suffixesBytes { | |||||
t := trie.New() | |||||
for _, pat := range pats { | |||||
t.Insert(reverse([]byte(pat))) | |||||
} | |||||
return &suffixesBytes{t} | |||||
} | |||||
func (m *suffixesBytes) Match(b []byte) bool { | |||||
return m.t.PrefixIndex(reverse(b)) >= 0 | |||||
} | |||||
func (m *suffixesBytes) MatchIndex(b []byte) int { | |||||
if idx := m.t.PrefixIndex(reverse(b)); idx >= 0 { | |||||
return idx | |||||
} | |||||
return -1 | |||||
} | |||||
// after | |||||
type afterBytes struct { | |||||
first []byte | |||||
matcher BytesMatcher | |||||
} | |||||
func BytesAfter(first string, m BytesMatcher) *afterBytes { return &afterBytes{[]byte(first), m} } | |||||
func (a *afterBytes) Match(b []byte) bool { | |||||
if idx := bytes.Index(b, a.first); idx >= 0 { | |||||
return a.matcher.Match(b[idx+len(a.first):]) | |||||
} | |||||
return false | |||||
} | |||||
func (a *afterBytes) MatchIndex(b []byte) int { | |||||
if idx := bytes.Index(b, a.first); idx >= 0 { | |||||
return idx + a.matcher.MatchIndex(b[idx:]) | |||||
} | |||||
return -1 | |||||
} | |||||
// and, returns true iff all matchers return true | |||||
type andBytes struct{ matchers []BytesMatcher } | |||||
func BytesAnd(m ...BytesMatcher) *andBytes { return &andBytes{m} } | |||||
func (a *andBytes) Match(b []byte) bool { | |||||
for _, m := range a.matchers { | |||||
if !m.Match(b) { | |||||
return false | |||||
} | |||||
} | |||||
return true | |||||
} | |||||
func (a *andBytes) MatchIndex(b []byte) int { | |||||
longest := 0 | |||||
for _, m := range a.matchers { | |||||
if idx := m.MatchIndex(b); idx < 0 { | |||||
return -1 | |||||
} else if idx > longest { | |||||
longest = idx | |||||
} | |||||
} | |||||
return longest | |||||
} | |||||
// or, returns true iff any matcher returns true | |||||
type orBytes struct{ matchers []BytesMatcher } | |||||
func BytesOr(m ...BytesMatcher) *orBytes { return &orBytes{m} } | |||||
func (o *orBytes) Match(b []byte) bool { | |||||
for _, m := range o.matchers { | |||||
if m.Match(b) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func (o *orBytes) MatchIndex(b []byte) int { | |||||
for _, m := range o.matchers { | |||||
if idx := m.MatchIndex(b); idx >= 0 { | |||||
return idx | |||||
} | |||||
} | |||||
return -1 | |||||
} | |||||
// suffixGroupBytes gates a group of matchers behind a suffix check: the
// group is only consulted when the input ends with the given suffix.
type suffixGroupBytes struct {
	suffix BytesMatcher
	matchers []BytesMatcher
}
// BytesSuffixGroup returns a matcher that requires suffix s and then any
// of the matchers m to accept.
func BytesSuffixGroup(s string, m ...BytesMatcher) *suffixGroupBytes {
	return &suffixGroupBytes{BytesSuffix(s), m}
}
// Match reports whether b ends with the suffix AND any grouped matcher
// accepts b. Note: a fresh BytesOr is allocated on every call; hoisting it
// into the struct would avoid that.
func (sg *suffixGroupBytes) Match(b []byte) bool {
	if sg.suffix.Match(b) {
		return BytesOr(sg.matchers...).Match(b)
	}
	return false
}
// MatchIndex returns the grouped-or end offset when the suffix matches,
// -1 otherwise. Same per-call BytesOr allocation note as Match.
func (sg *suffixGroupBytes) MatchIndex(b []byte) int {
	if sg.suffix.MatchIndex(b) >= 0 {
		return BytesOr(sg.matchers...).MatchIndex(b)
	}
	return -1
}
package substring | |||||
// reverse is a helper fn for Suffixes.
// It reverses b IN PLACE and returns the same slice; callers that do not
// own the backing array must pass a copy.
func reverse(b []byte) []byte {
	for i, j := 0, len(b)-1; i < j; i, j = i+1, j-1 {
		b[i], b[j] = b[j], b[i]
	}
	return b
}
package substring | |||||
import ( | |||||
"regexp" | |||||
"strings" | |||||
"github.com/toqueteos/trie" | |||||
) | |||||
// StringsMatcher is the common interface implemented by every string
// matcher in this package.
type StringsMatcher interface {
	// Match reports whether s satisfies the matcher.
	Match(s string) bool
	// MatchIndex returns the end offset of the match within s, or -1.
	MatchIndex(s string) int
}
// regexp
// regexpString wraps a compiled regular expression as a StringsMatcher.
type regexpString struct{ re *regexp.Regexp }

// Regexp compiles pat and returns the matcher; like regexp.MustCompile it
// panics on an invalid pattern.
func Regexp(pat string) *regexpString { return &regexpString{regexp.MustCompile(pat)} }

// Match reports whether the pattern matches anywhere in s.
func (m *regexpString) Match(s string) bool { return m.re.MatchString(s) }

// MatchIndex returns the end offset of the leftmost match, or -1.
func (m *regexpString) MatchIndex(s string) int {
	loc := m.re.FindStringIndex(s)
	if loc == nil {
		return -1
	}
	return loc[1]
}
// exact
// exactString matches only inputs equal to pat.
type exactString struct{ pat string }

// Exact returns a matcher accepting exactly the string pat.
func Exact(pat string) *exactString { return &exactString{pat} }

// Match reports whether s equals the pattern.
func (m *exactString) Match(s string) bool { return s == m.pat }

// MatchIndex returns len(s) on an exact match, -1 otherwise.
func (m *exactString) MatchIndex(s string) int {
	if s != m.pat {
		return -1
	}
	return len(s)
}
// any, search `s` in `.Match(pat)`
// anyString has reversed roles: the INPUT is searched for inside pat.
type anyString struct{ pat string }

// Any returns a matcher that accepts any substring of pat.
func Any(pat string) *anyString { return &anyString{pat} }

// Match reports whether s occurs anywhere inside the pattern.
// strings.Contains replaces the original `strings.Index(...) >= 0`.
func (m *anyString) Match(s string) bool {
	return strings.Contains(m.pat, s)
}

// MatchIndex returns the offset just past the first occurrence of s inside
// the pattern, or -1 when s is not a substring of it.
func (m *anyString) MatchIndex(s string) int {
	if idx := strings.Index(m.pat, s); idx >= 0 {
		return idx + len(s)
	}
	return -1
}
// has, search `pat` in `.Match(s)`
// hasString is the conventional "contains" matcher: pat is searched inside
// the input.
type hasString struct{ pat string }

// Has returns a matcher accepting any input that contains pat.
func Has(pat string) *hasString { return &hasString{pat} }

// Match reports whether the pattern occurs anywhere in s.
// strings.Contains replaces the original `strings.Index(...) >= 0`.
func (m *hasString) Match(s string) bool {
	return strings.Contains(s, m.pat)
}

// MatchIndex returns the offset just past the first occurrence of the
// pattern in s, or -1 when s does not contain it.
func (m *hasString) MatchIndex(s string) int {
	if idx := strings.Index(s, m.pat); idx >= 0 {
		return idx + len(m.pat)
	}
	return -1
}
// prefix
// prefixString matches inputs that begin with pat.
type prefixString struct{ pat string }

// Prefix returns a matcher accepting inputs that start with pat.
func Prefix(pat string) *prefixString { return &prefixString{pat} }

// Match reports whether s starts with the pattern.
func (m *prefixString) Match(s string) bool { return strings.HasPrefix(s, m.pat) }

// MatchIndex returns the length of the matched prefix, or -1 when s does
// not start with the pattern.
func (m *prefixString) MatchIndex(s string) int {
	if !m.Match(s) {
		return -1
	}
	return len(m.pat)
}
// prefixes
// prefixesString matches inputs that start with any of a fixed set of
// patterns, stored in a trie so the input is scanned once.
type prefixesString struct{ t *trie.Trie }
// Prefixes builds a matcher over all the given prefix patterns.
func Prefixes(pats ...string) *prefixesString {
	t := trie.New()
	for _, pat := range pats {
		t.Insert([]byte(pat))
	}
	return &prefixesString{t}
}
// Match reports whether some stored pattern is a prefix of s.
func (m *prefixesString) Match(s string) bool { return m.t.PrefixIndex([]byte(s)) >= 0 }
// MatchIndex returns the trie's prefix index for s, or -1 when no stored
// pattern prefixes s.
// NOTE(review): presumably PrefixIndex yields the end offset of the matched
// prefix — confirm against the toqueteos/trie documentation.
func (m *prefixesString) MatchIndex(s string) int {
	if idx := m.t.PrefixIndex([]byte(s)); idx >= 0 {
		return idx
	}
	return -1
}
// suffix | |||||
type suffixString struct{ pat string } | |||||
func Suffix(pat string) *suffixString { return &suffixString{pat} } | |||||
func (m *suffixString) Match(s string) bool { return strings.HasSuffix(s, m.pat) } | |||||
func (m *suffixString) MatchIndex(s string) int { | |||||
if strings.HasSuffix(s, m.pat) { | |||||
return len(m.pat) | |||||
} | |||||
return -1 | |||||
} | |||||
// suffixes
// suffixesString matches inputs that end with any of a fixed set of
// patterns. Patterns are stored reversed in a trie, so a suffix test
// becomes a prefix test on the reversed input.
type suffixesString struct{ t *trie.Trie }
// Suffixes builds a matcher over all the given suffix patterns.
func Suffixes(pats ...string) *suffixesString {
	t := trie.New()
	for _, pat := range pats {
		// []byte(pat) allocates a fresh copy, so in-place reverse is safe.
		t.Insert(reverse([]byte(pat)))
	}
	return &suffixesString{t}
}
// Match reports whether s ends with any stored pattern. The []byte(s)
// conversion copies, so reverse's in-place mutation cannot affect s.
func (m *suffixesString) Match(s string) bool {
	return m.t.PrefixIndex(reverse([]byte(s))) >= 0
}
// MatchIndex returns the trie's prefix index on the reversed input, or -1
// when s ends with none of the stored patterns.
func (m *suffixesString) MatchIndex(s string) int {
	if idx := m.t.PrefixIndex(reverse([]byte(s))); idx >= 0 {
		return idx
	}
	return -1
}
// after | |||||
type afterString struct { | |||||
first string | |||||
matcher StringsMatcher | |||||
} | |||||
func After(first string, m StringsMatcher) *afterString { | |||||
return &afterString{first, m} | |||||
} | |||||
func (a *afterString) Match(s string) bool { | |||||
if idx := strings.Index(s, a.first); idx >= 0 { | |||||
return a.matcher.Match(s[idx+len(a.first):]) | |||||
} | |||||
return false | |||||
} | |||||
func (a *afterString) MatchIndex(s string) int { | |||||
if idx := strings.Index(s, a.first); idx >= 0 { | |||||
return idx + a.matcher.MatchIndex(s[idx+len(a.first):]) | |||||
} | |||||
return -1 | |||||
} | |||||
// and, returns true iff all matchers return true | |||||
type andString struct{ matchers []StringsMatcher } | |||||
func And(m ...StringsMatcher) *andString { return &andString{m} } | |||||
func (a *andString) Match(s string) bool { | |||||
for _, m := range a.matchers { | |||||
if !m.Match(s) { | |||||
return false | |||||
} | |||||
} | |||||
return true | |||||
} | |||||
func (a *andString) MatchIndex(s string) int { | |||||
longest := 0 | |||||
for _, m := range a.matchers { | |||||
if idx := m.MatchIndex(s); idx < 0 { | |||||
return -1 | |||||
} else if idx > longest { | |||||
longest = idx | |||||
} | |||||
} | |||||
return longest | |||||
} | |||||
// or, returns true iff any matcher returns true | |||||
type orString struct{ matchers []StringsMatcher } | |||||
func Or(m ...StringsMatcher) *orString { return &orString{m} } | |||||
func (o *orString) Match(s string) bool { | |||||
for _, m := range o.matchers { | |||||
if m.Match(s) { | |||||
return true | |||||
} | |||||
} | |||||
return false | |||||
} | |||||
func (o *orString) MatchIndex(s string) int { | |||||
for _, m := range o.matchers { | |||||
if idx := m.MatchIndex(s); idx >= 0 { | |||||
return idx | |||||
} | |||||
} | |||||
return -1 | |||||
} | |||||
// suffixGroupString gates a group of matchers behind a suffix check: the
// group is only consulted when the input ends with the given suffix.
type suffixGroupString struct {
	suffix StringsMatcher
	matchers []StringsMatcher
}
// SuffixGroup returns a matcher that requires suffix s and then any of the
// matchers m to accept.
func SuffixGroup(s string, m ...StringsMatcher) *suffixGroupString {
	return &suffixGroupString{Suffix(s), m}
}
// Match reports whether s ends with the suffix AND any grouped matcher
// accepts s. Note: a fresh Or is allocated on every call; hoisting it into
// the struct would avoid that.
func (sg *suffixGroupString) Match(s string) bool {
	if sg.suffix.Match(s) {
		return Or(sg.matchers...).Match(s)
	}
	return false
}
// MatchIndex returns the grouped-or end offset when the suffix matches,
// -1 otherwise. Same per-call Or allocation note as Match.
func (sg *suffixGroupString) MatchIndex(s string) int {
	if sg.suffix.MatchIndex(s) >= 0 {
		return Or(sg.matchers...).MatchIndex(s)
	}
	return -1
}
# github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a | # github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a | ||||
## explicit | ## explicit | ||||
github.com/glycerine/go-unsnap-stream | github.com/glycerine/go-unsnap-stream | ||||
# github.com/go-enry/go-enry/v2 v2.3.0 | |||||
# github.com/go-enry/go-enry/v2 v2.5.2 | |||||
## explicit | ## explicit | ||||
github.com/go-enry/go-enry/v2 | github.com/go-enry/go-enry/v2 | ||||
github.com/go-enry/go-enry/v2/data | github.com/go-enry/go-enry/v2/data | ||||
github.com/go-enry/go-enry/v2/internal/tokenizer | github.com/go-enry/go-enry/v2/internal/tokenizer | ||||
github.com/go-enry/go-enry/v2/internal/tokenizer/flex | github.com/go-enry/go-enry/v2/internal/tokenizer/flex | ||||
github.com/go-enry/go-enry/v2/regex | github.com/go-enry/go-enry/v2/regex | ||||
# github.com/go-enry/go-oniguruma v1.2.0 | |||||
# github.com/go-enry/go-oniguruma v1.2.1 | |||||
github.com/go-enry/go-oniguruma | github.com/go-enry/go-oniguruma | ||||
# github.com/go-git/gcfg v1.5.0 | # github.com/go-git/gcfg v1.5.0 | ||||
github.com/go-git/gcfg | github.com/go-git/gcfg | ||||
# github.com/tinylib/msgp v1.1.2 | # github.com/tinylib/msgp v1.1.2 | ||||
## explicit | ## explicit | ||||
github.com/tinylib/msgp/msgp | github.com/tinylib/msgp/msgp | ||||
# github.com/toqueteos/trie v1.0.0 | |||||
github.com/toqueteos/trie | |||||
# github.com/toqueteos/webbrowser v1.2.0 | # github.com/toqueteos/webbrowser v1.2.0 | ||||
github.com/toqueteos/webbrowser | github.com/toqueteos/webbrowser | ||||
# github.com/tstranex/u2f v1.0.0 | # github.com/tstranex/u2f v1.0.0 | ||||
# gopkg.in/testfixtures.v2 v2.5.0 | # gopkg.in/testfixtures.v2 v2.5.0 | ||||
## explicit | ## explicit | ||||
gopkg.in/testfixtures.v2 | gopkg.in/testfixtures.v2 | ||||
# gopkg.in/toqueteos/substring.v1 v1.0.2 | |||||
gopkg.in/toqueteos/substring.v1 | |||||
# gopkg.in/warnings.v0 v0.1.2 | # gopkg.in/warnings.v0 v0.1.2 | ||||
gopkg.in/warnings.v0 | gopkg.in/warnings.v0 | ||||
# gopkg.in/yaml.v2 v2.2.8 | # gopkg.in/yaml.v2 v2.2.8 |