mirror of
https://github.com/chainreactors/spray.git
synced 2025-09-15 11:40:13 +00:00
进一步优化crawl的正则与特殊情况处理
This commit is contained in:
parent
033f3acdd7
commit
599118284c
@ -506,7 +506,7 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if parsed.Host != bl.Url.Host {
|
||||
if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 {
|
||||
continue
|
||||
}
|
||||
u = parsed.Path
|
||||
@ -534,18 +534,18 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if parsed.Host != bl.Url.Host {
|
||||
if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 {
|
||||
continue
|
||||
}
|
||||
u = parsed.Path
|
||||
}
|
||||
|
||||
pool.locker.Lock()
|
||||
if _, ok := pool.urls[u]; ok {
|
||||
pool.urls[u]++
|
||||
} else {
|
||||
// 通过map去重, 只有新的url才会进入到该逻辑
|
||||
pool.locker.Lock()
|
||||
pool.urls[u] = 1
|
||||
pool.locker.Unlock()
|
||||
if bl.ReqDepth < maxCrawl {
|
||||
pool.wg.Add(1)
|
||||
pool.addAddition(&Unit{
|
||||
@ -555,6 +555,7 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
||||
})
|
||||
}
|
||||
}
|
||||
pool.locker.Unlock()
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -13,35 +13,6 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GetSourceName returns the label associated with the numeric source
// identifier s. Identifiers 1 through 11 map to "check", "random", "index",
// "redirect", "crawl", "active", "word", "waf", "rule", "bak" and "common"
// respectively; every other value yields "unknown".
func GetSourceName(s int) string {
	// Index 0 is intentionally left empty so that identifiers can be used
	// directly as array indices.
	names := [...]string{
		1:  "check",
		2:  "random",
		3:  "index",
		4:  "redirect",
		5:  "crawl",
		6:  "active",
		7:  "word",
		8:  "waf",
		9:  "rule",
		10: "bak",
		11: "common",
	}
	if s < 1 || s >= len(names) {
		return "unknown"
	}
	return names[s]
}
|
||||
|
||||
func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
|
||||
bl := &Baseline{
|
||||
UrlString: u,
|
||||
@ -176,8 +147,9 @@ func (bl *Baseline) CollectURL() {
|
||||
for _, reg := range JSRegexps {
|
||||
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
||||
for _, u := range urls {
|
||||
u[1] = formatURL(u[1])
|
||||
if !filterJs(u[1]) {
|
||||
bl.URLs = append(bl.URLs, formatURL(u[1]))
|
||||
bl.URLs = append(bl.URLs, u[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -185,8 +157,9 @@ func (bl *Baseline) CollectURL() {
|
||||
for _, reg := range URLRegexps {
|
||||
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
||||
for _, u := range urls {
|
||||
u[1] = formatURL(u[1])
|
||||
if !filterUrl(u[1]) {
|
||||
bl.URLs = append(bl.URLs, formatURL(u[1]))
|
||||
bl.URLs = append(bl.URLs, u[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
29
pkg/types.go
29
pkg/types.go
@ -27,3 +27,32 @@ func (es Extracteds) String() string {
|
||||
}
|
||||
|
||||
var Extractors = make(fingers.Extractors)
|
||||
|
||||
// GetSourceName returns the label associated with the numeric source
// identifier s. Identifiers 1 through 11 map to "check", "random", "index",
// "redirect", "crawl", "active", "word", "waf", "rule", "bak" and "common"
// respectively; every other value yields "unknown".
func GetSourceName(s int) string {
	// Index 0 is intentionally left empty so that identifiers can be used
	// directly as array indices.
	names := [...]string{
		1:  "check",
		2:  "random",
		3:  "index",
		4:  "redirect",
		5:  "crawl",
		6:  "active",
		7:  "word",
		8:  "waf",
		9:  "rule",
		10: "bak",
		11: "common",
	}
	if s < 1 || s >= len(names) {
		return "unknown"
	}
	return names[s]
}
|
||||
|
35
pkg/utils.go
35
pkg/utils.go
@ -23,16 +23,17 @@ var (
|
||||
ActivePath []string
|
||||
Fingers fingers.Fingers
|
||||
JSRegexps []*regexp.Regexp = []*regexp.Regexp{
|
||||
regexp.MustCompile(`.(https{0,1}:[^\s^'^,^’^"^”^>^<^;^(^)^|^*^\[]{2,250}?[^=^*^\s^'^’^"^”^>^<^:^;^*^|^(^)^\[]{3}[.]js)`),
|
||||
regexp.MustCompile(`["'‘“]\s{0,6}(/{0,1}[^\s^,^'^’^"^”^|^>^<^:^;^*^(^\)^\[]{2,250}?[^=^*^\s^'^’^|^"^”^>^<^:^;^*^(^)^\[]{3}[.]js)`),
|
||||
regexp.MustCompile(`=\s{0,6}["'’”]{0,1}\s{0,6}(/{0,1}[^\s^'^,^’^"^”^>^<^;^(^)^|^*^\[]{2,250}?[^=^,^*^\s^'^’^"^”^>^|^<^:^;^*^(^)^\[]{3}[.]js)`),
|
||||
regexp.MustCompile(`.(https{0,1}:[^\s',’"”><;()|*\[]{2,250}?[^=*\s'’><:;|()[]{3}\[]\.js)`),
|
||||
regexp.MustCompile(`["']\s{0,6}(/{0,1}[^\s',’"”><;()|*:\[]{2,250}?[^=*\s'’|"”><^:;()\[]{3}\.\.js)`),
|
||||
regexp.MustCompile(`=\s{0,6}["']{0,1}\s{0,6}(/{0,1}[^\s^',’><;()|*\[]{2,250}?[^=,\s'’"”>|<:;*()\[]{3}\.js)`),
|
||||
}
|
||||
URLRegexps []*regexp.Regexp = []*regexp.Regexp{
|
||||
regexp.MustCompile(`["'‘“]\s{0,6}(https{0,1}:[^\s^,^'^’^"^”^>^<^),^(]{2,250}?)\s{0,6}["'‘“]`),
|
||||
regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s^'^,^’^"^”^>^<^;^(^)^|^*^\[]{2,250})`),
|
||||
regexp.MustCompile(`["']([\w/]{2,250}?\.\w{2,4}?)["']`),
|
||||
regexp.MustCompile(`["'‘“]\s{0,6}([#,.]{0,2}/[^\s^'^,^’^"^”^>^<^;^(^)^|^*^\[]{2,250}?)\s{0,6}["'‘“]`),
|
||||
regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s^'^,^’^"^”^>^<^;^(^)^|^*^\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s^'^’^"^“^>^<^)^(]{2,250})`),
|
||||
regexp.MustCompile(`["']\s{0,6}(https{0,1}:[^\s,'’"”><)^(]{2,250}?)\s{0,6}["']`),
|
||||
regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s',’"”><;()|*\[]{2,250})`),
|
||||
regexp.MustCompile(`["']([^\s',’"”><;()|*\[]{2,250}\.[a-zA-Z]\w{1,3})["']`),
|
||||
regexp.MustCompile(`["'](https?:[^\s',’"”><;()|*\[]{2,250}?)["']`),
|
||||
regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s',’"”><;()|*\[]{2,250}?)\s{0,6}["']`),
|
||||
regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',’"”><;()|*\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'’"“><)(]{2,250})`),
|
||||
}
|
||||
|
||||
ContentTypeMap = map[string]string{
|
||||
@ -226,8 +227,8 @@ func FingerDetect(content string) Frameworks {
|
||||
}
|
||||
|
||||
var (
|
||||
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf"}
|
||||
BadURL = []string{";", "}", "{", "www.w3.org", "example.com", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
|
||||
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4"}
|
||||
BadURL = []string{";", "}", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
|
||||
)
|
||||
|
||||
func filterJs(u string) bool {
|
||||
@ -259,6 +260,16 @@ func filterUrl(u string) bool {
|
||||
|
||||
func formatURL(u string) string {
|
||||
// 去掉frag与params, 节约url.parse性能, 防止带参数造成意外的影响
|
||||
if strings.Contains(u, "2f") || strings.Contains(u, "2F") {
|
||||
u = strings.ReplaceAll(u, "\\u002F", "/")
|
||||
u = strings.ReplaceAll(u, "\\u002f", "/")
|
||||
u = strings.ReplaceAll(u, "%252F", "/")
|
||||
u = strings.ReplaceAll(u, "%252f", "/")
|
||||
u = strings.ReplaceAll(u, "%2f", "/")
|
||||
u = strings.ReplaceAll(u, "%2F", "/")
|
||||
}
|
||||
|
||||
u = strings.TrimRight(u, "\\")
|
||||
if i := strings.Index(u, "?"); i != -1 {
|
||||
return u[:i]
|
||||
}
|
||||
@ -273,8 +284,8 @@ func commonFilter(u string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
for _, scoop := range BadURL {
|
||||
if strings.Contains(u, scoop) {
|
||||
for _, bad := range BadURL {
|
||||
if strings.Contains(u, bad) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user