mirror of
https://github.com/chainreactors/spray.git
synced 2025-09-15 11:40:13 +00:00
进一步优化crawl的正则与特殊情况处理
This commit is contained in:
parent
033f3acdd7
commit
599118284c
@ -506,7 +506,7 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if parsed.Host != bl.Url.Host {
|
if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
u = parsed.Path
|
u = parsed.Path
|
||||||
@ -534,18 +534,18 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if parsed.Host != bl.Url.Host {
|
if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
u = parsed.Path
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pool.locker.Lock()
|
||||||
if _, ok := pool.urls[u]; ok {
|
if _, ok := pool.urls[u]; ok {
|
||||||
pool.urls[u]++
|
pool.urls[u]++
|
||||||
} else {
|
} else {
|
||||||
// 通过map去重, 只有新的url才会进入到该逻辑
|
// 通过map去重, 只有新的url才会进入到该逻辑
|
||||||
pool.locker.Lock()
|
|
||||||
pool.urls[u] = 1
|
pool.urls[u] = 1
|
||||||
pool.locker.Unlock()
|
|
||||||
if bl.ReqDepth < maxCrawl {
|
if bl.ReqDepth < maxCrawl {
|
||||||
pool.wg.Add(1)
|
pool.wg.Add(1)
|
||||||
pool.addAddition(&Unit{
|
pool.addAddition(&Unit{
|
||||||
@ -555,6 +555,7 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
pool.locker.Unlock()
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
@ -13,35 +13,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func GetSourceName(s int) string {
|
|
||||||
switch s {
|
|
||||||
case 1:
|
|
||||||
return "check"
|
|
||||||
case 2:
|
|
||||||
return "random"
|
|
||||||
case 3:
|
|
||||||
return "index"
|
|
||||||
case 4:
|
|
||||||
return "redirect"
|
|
||||||
case 5:
|
|
||||||
return "crawl"
|
|
||||||
case 6:
|
|
||||||
return "active"
|
|
||||||
case 7:
|
|
||||||
return "word"
|
|
||||||
case 8:
|
|
||||||
return "waf"
|
|
||||||
case 9:
|
|
||||||
return "rule"
|
|
||||||
case 10:
|
|
||||||
return "bak"
|
|
||||||
case 11:
|
|
||||||
return "common"
|
|
||||||
default:
|
|
||||||
return "unknown"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
|
func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
|
||||||
bl := &Baseline{
|
bl := &Baseline{
|
||||||
UrlString: u,
|
UrlString: u,
|
||||||
@ -176,8 +147,9 @@ func (bl *Baseline) CollectURL() {
|
|||||||
for _, reg := range JSRegexps {
|
for _, reg := range JSRegexps {
|
||||||
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
||||||
for _, u := range urls {
|
for _, u := range urls {
|
||||||
|
u[1] = formatURL(u[1])
|
||||||
if !filterJs(u[1]) {
|
if !filterJs(u[1]) {
|
||||||
bl.URLs = append(bl.URLs, formatURL(u[1]))
|
bl.URLs = append(bl.URLs, u[1])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -185,8 +157,9 @@ func (bl *Baseline) CollectURL() {
|
|||||||
for _, reg := range URLRegexps {
|
for _, reg := range URLRegexps {
|
||||||
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
||||||
for _, u := range urls {
|
for _, u := range urls {
|
||||||
|
u[1] = formatURL(u[1])
|
||||||
if !filterUrl(u[1]) {
|
if !filterUrl(u[1]) {
|
||||||
bl.URLs = append(bl.URLs, formatURL(u[1]))
|
bl.URLs = append(bl.URLs, u[1])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
29
pkg/types.go
29
pkg/types.go
@ -27,3 +27,32 @@ func (es Extracteds) String() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var Extractors = make(fingers.Extractors)
|
var Extractors = make(fingers.Extractors)
|
||||||
|
|
||||||
|
// GetSourceName returns the textual label for the numeric source code s.
// Codes 1 through 11 each map to a fixed name; any other value yields
// "unknown".
func GetSourceName(s int) string {
	switch s {
	case 1:
		return "check"
	case 2:
		return "random"
	case 3:
		return "index"
	case 4:
		return "redirect"
	case 5:
		return "crawl"
	case 6:
		return "active"
	case 7:
		return "word"
	case 8:
		return "waf"
	case 9:
		return "rule"
	case 10:
		return "bak"
	case 11:
		return "common"
	default:
		return "unknown"
	}
}
|
||||||
|
35
pkg/utils.go
35
pkg/utils.go
@ -23,16 +23,17 @@ var (
|
|||||||
ActivePath []string
|
ActivePath []string
|
||||||
Fingers fingers.Fingers
|
Fingers fingers.Fingers
|
||||||
// JSRegexps holds the patterns that CollectURL runs over a response body to
// find references to .js files. Capture group 1 of every pattern is the
// script URL or path; callers read only that group.
JSRegexps []*regexp.Regexp = []*regexp.Regexp{
	// Absolute http(s) URL ending in ".js". The tail must be exactly
	// `{3}\.js`: the previous form `{3}\[]\.js` demanded a literal "[]"
	// before ".js" and therefore never matched a real script URL.
	regexp.MustCompile(`.(https{0,1}:[^\s',’"”><;()|*\[]{2,250}?[^=*\s'’"”><:;|()\[]{3}\.js)`),
	// Quoted path ending in ".js". A single escaped dot is required; the
	// previous `\.\.js` only matched names ending in "..js".
	regexp.MustCompile(`["']\s{0,6}(/{0,1}[^\s',’"”><;()|*:\[]{2,250}?[^=*\s'’|"”><^:;()\[]{3}\.js)`),
	// Path ending in ".js" that follows "=", with an optional opening quote.
	regexp.MustCompile(`=\s{0,6}["']{0,1}\s{0,6}(/{0,1}[^\s^',’><;()|*\[]{2,250}?[^=,\s'’"”>|<:;*()\[]{3}\.js)`),
}
|
||||||
// URLRegexps holds the patterns that CollectURL runs over a response body to
// find URLs and paths. CollectURL reads only capture group 1 of each match,
// so every pattern here must expose its result in group 1.
URLRegexps []*regexp.Regexp = []*regexp.Regexp{
	// Quoted absolute http(s) URL, allowing padding inside the quotes.
	regexp.MustCompile(`["']\s{0,6}(https{0,1}:[^\s,'’"”><)^(]{2,250}?)\s{0,6}["']`),
	// Absolute http(s) URL following "=".
	regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s',’"”><;()|*\[]{2,250})`),
	// Quoted token ending in a 2-4 character extension that starts with a letter.
	regexp.MustCompile(`["']([^\s',’"”><;()|*\[]{2,250}\.[a-zA-Z]\w{1,3})["']`),
	// Quoted absolute http(s) URL with no padding.
	regexp.MustCompile(`["'](https?:[^\s',’"”><;()|*\[]{2,250}?)["']`),
	// Quoted path containing "/" with up to two leading '#', ',' or '.' characters.
	regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s',’"”><;()|*\[]{2,250}?)\s{0,6}["']`),
	// href and action attribute values. These were previously one pattern of
	// the form `href…(…)|action…(…)`, which put action targets in capture
	// group 2 where group-1-only callers never saw them. Splitting the
	// alternation gives each attribute its own group 1.
	regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',’"”><;()|*\[]{2,250})`),
	regexp.MustCompile(`action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'’"“><)(]{2,250})`),
}
|
||||||
|
|
||||||
ContentTypeMap = map[string]string{
|
ContentTypeMap = map[string]string{
|
||||||
@ -226,8 +227,8 @@ func FingerDetect(content string) Frameworks {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var (
	// BadExt lists file-extension suffixes treated as uninteresting.
	// NOTE(review): the code that consults this list is not visible here;
	// presumably the URL filters use it — confirm against filterUrl.
	BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4"}
	// BadURL lists substrings that disqualify a candidate URL: commonFilter
	// reports true for any URL containing one of them.
	BadURL = []string{";", "}", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
)
|
||||||
|
|
||||||
func filterJs(u string) bool {
|
func filterJs(u string) bool {
|
||||||
@ -259,6 +260,16 @@ func filterUrl(u string) bool {
|
|||||||
|
|
||||||
func formatURL(u string) string {
|
func formatURL(u string) string {
|
||||||
// 去掉frag与params, 节约url.parse性能, 防止带参数造成意外的影响
|
// 去掉frag与params, 节约url.parse性能, 防止带参数造成意外的影响
|
||||||
|
if strings.Contains(u, "2f") || strings.Contains(u, "2F") {
|
||||||
|
u = strings.ReplaceAll(u, "\\u002F", "/")
|
||||||
|
u = strings.ReplaceAll(u, "\\u002f", "/")
|
||||||
|
u = strings.ReplaceAll(u, "%252F", "/")
|
||||||
|
u = strings.ReplaceAll(u, "%252f", "/")
|
||||||
|
u = strings.ReplaceAll(u, "%2f", "/")
|
||||||
|
u = strings.ReplaceAll(u, "%2F", "/")
|
||||||
|
}
|
||||||
|
|
||||||
|
u = strings.TrimRight(u, "\\")
|
||||||
if i := strings.Index(u, "?"); i != -1 {
|
if i := strings.Index(u, "?"); i != -1 {
|
||||||
return u[:i]
|
return u[:i]
|
||||||
}
|
}
|
||||||
@ -273,8 +284,8 @@ func commonFilter(u string) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, scoop := range BadURL {
|
for _, bad := range BadURL {
|
||||||
if strings.Contains(u, scoop) {
|
if strings.Contains(u, bad) {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user