进一步优化crawl的正则与特殊情况处理

This commit is contained in:
M09Ic 2023-01-10 00:58:16 +08:00
parent 033f3acdd7
commit 599118284c
4 changed files with 61 additions and 47 deletions

View File

@ -506,7 +506,7 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
if err != nil { if err != nil {
continue continue
} }
if parsed.Host != bl.Url.Host { if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 {
continue continue
} }
u = parsed.Path u = parsed.Path
@ -534,18 +534,18 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
if err != nil { if err != nil {
continue continue
} }
if parsed.Host != bl.Url.Host { if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 {
continue continue
} }
u = parsed.Path
} }
pool.locker.Lock()
if _, ok := pool.urls[u]; ok { if _, ok := pool.urls[u]; ok {
pool.urls[u]++ pool.urls[u]++
} else { } else {
// 通过map去重, 只有新的url才会进入到该逻辑 // 通过map去重, 只有新的url才会进入到该逻辑
pool.locker.Lock()
pool.urls[u] = 1 pool.urls[u] = 1
pool.locker.Unlock()
if bl.ReqDepth < maxCrawl { if bl.ReqDepth < maxCrawl {
pool.wg.Add(1) pool.wg.Add(1)
pool.addAddition(&Unit{ pool.addAddition(&Unit{
@ -555,6 +555,7 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
}) })
} }
} }
pool.locker.Unlock()
} }
}() }()

View File

@ -13,35 +13,6 @@ import (
"strings" "strings"
) )
func GetSourceName(s int) string {
switch s {
case 1:
return "check"
case 2:
return "random"
case 3:
return "index"
case 4:
return "redirect"
case 5:
return "crawl"
case 6:
return "active"
case 7:
return "word"
case 8:
return "waf"
case 9:
return "rule"
case 10:
return "bak"
case 11:
return "common"
default:
return "unknown"
}
}
func NewBaseline(u, host string, resp *ihttp.Response) *Baseline { func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
bl := &Baseline{ bl := &Baseline{
UrlString: u, UrlString: u,
@ -176,8 +147,9 @@ func (bl *Baseline) CollectURL() {
for _, reg := range JSRegexps { for _, reg := range JSRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
u[1] = formatURL(u[1])
if !filterJs(u[1]) { if !filterJs(u[1]) {
bl.URLs = append(bl.URLs, formatURL(u[1])) bl.URLs = append(bl.URLs, u[1])
} }
} }
} }
@ -185,8 +157,9 @@ func (bl *Baseline) CollectURL() {
for _, reg := range URLRegexps { for _, reg := range URLRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
u[1] = formatURL(u[1])
if !filterUrl(u[1]) { if !filterUrl(u[1]) {
bl.URLs = append(bl.URLs, formatURL(u[1])) bl.URLs = append(bl.URLs, u[1])
} }
} }
} }

View File

@ -27,3 +27,32 @@ func (es Extracteds) String() string {
} }
var Extractors = make(fingers.Extractors) var Extractors = make(fingers.Extractors)
// GetSourceName maps a numeric source identifier to its human-readable
// name (e.g. 1 -> "check", 5 -> "crawl"). Any identifier outside the
// known 1..11 range yields "unknown".
func GetSourceName(s int) string {
	names := []string{
		"check",
		"random",
		"index",
		"redirect",
		"crawl",
		"active",
		"word",
		"waf",
		"rule",
		"bak",
		"common",
	}
	// Identifiers are 1-based; translate to a 0-based table index.
	if s >= 1 && s <= len(names) {
		return names[s-1]
	}
	return "unknown"
}

View File

@ -23,16 +23,17 @@ var (
ActivePath []string ActivePath []string
Fingers fingers.Fingers Fingers fingers.Fingers
JSRegexps []*regexp.Regexp = []*regexp.Regexp{ JSRegexps []*regexp.Regexp = []*regexp.Regexp{
regexp.MustCompile(`.(https{0,1}:[^\s^'^,^^"^^>^<^;^(^)^|^*^\[]{2,250}?[^=^*^\s^'^^"^”^>^<^:^;^*^|^(^)^\[]{3}[.]js)`), regexp.MustCompile(`.(https{0,1}:[^\s',"”><;()|*\[]{2,250}?[^=*\s'><:;|()[]{3}\[]\.js)`),
regexp.MustCompile(`["'‘“]\s{0,6}(/{0,1}[^\s^,^'^^"^”^|^>^<^:^;^*^(^\)^\[]{2,250}?[^=^*^\s^'^^|^"^”^>^<^:^;^*^(^)^\[]{3}[.]js)`), regexp.MustCompile(`["']\s{0,6}(/{0,1}[^\s',"”><;()|*:\[]{2,250}?[^=*\s'|"”><^:;()\[]{3}\.\.js)`),
regexp.MustCompile(`=\s{0,6}["'’”]{0,1}\s{0,6}(/{0,1}[^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250}?[^=^,^*^\s^'^^"^”^>^|^<^:^;^*^(^)^\[]{3}[.]js)`), regexp.MustCompile(`=\s{0,6}["']{0,1}\s{0,6}(/{0,1}[^\s^',><;()|*\[]{2,250}?[^=,\s'"”>|<:;*()\[]{3}\.js)`),
} }
URLRegexps []*regexp.Regexp = []*regexp.Regexp{ URLRegexps []*regexp.Regexp = []*regexp.Regexp{
regexp.MustCompile(`["'‘“]\s{0,6}(https{0,1}:[^\s^,^'^^"^”^>^<^),^(]{2,250}?)\s{0,6}["'‘“]`), regexp.MustCompile(`["']\s{0,6}(https{0,1}:[^\s,'"”><)^(]{2,250}?)\s{0,6}["']`),
regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250})`), regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s',"”><;()|*\[]{2,250})`),
regexp.MustCompile(`["']([\w/]{2,250}?\.\w{2,4}?)["']`), regexp.MustCompile(`["']([^\s',"”><;()|*\[]{2,250}\.[a-zA-Z]\w{1,3})["']`),
regexp.MustCompile(`["'‘“]\s{0,6}([#,.]{0,2}/[^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250}?)\s{0,6}["'‘“]`), regexp.MustCompile(`["'](https?:[^\s',"”><;()|*\[]{2,250}?)["']`),
regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s^'^^"^“^>^<^)^(]{2,250})`), regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s',"”><;()|*\[]{2,250}?)\s{0,6}["']`),
regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',"”><;()|*\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'"“><)(]{2,250})`),
} }
ContentTypeMap = map[string]string{ ContentTypeMap = map[string]string{
@ -226,8 +227,8 @@ func FingerDetect(content string) Frameworks {
} }
var ( var (
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf"} BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4"}
BadURL = []string{";", "}", "{", "www.w3.org", "example.com", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"} BadURL = []string{";", "}", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
) )
func filterJs(u string) bool { func filterJs(u string) bool {
@ -259,6 +260,16 @@ func filterUrl(u string) bool {
func formatURL(u string) string { func formatURL(u string) string {
// 去掉frag与params, 节约url.parse性能, 防止带参数造成意外的影响 // 去掉frag与params, 节约url.parse性能, 防止带参数造成意外的影响
if strings.Contains(u, "2f") || strings.Contains(u, "2F") {
u = strings.ReplaceAll(u, "\\u002F", "/")
u = strings.ReplaceAll(u, "\\u002f", "/")
u = strings.ReplaceAll(u, "%252F", "/")
u = strings.ReplaceAll(u, "%252f", "/")
u = strings.ReplaceAll(u, "%2f", "/")
u = strings.ReplaceAll(u, "%2F", "/")
}
u = strings.TrimRight(u, "\\")
if i := strings.Index(u, "?"); i != -1 { if i := strings.Index(u, "?"); i != -1 {
return u[:i] return u[:i]
} }
@ -273,8 +284,8 @@ func commonFilter(u string) bool {
return true return true
} }
for _, scoop := range BadURL { for _, bad := range BadURL {
if strings.Contains(u, scoop) { if strings.Contains(u, bad) {
return true return true
} }
} }