进一步优化crawl的正则与特殊情况

This commit is contained in:
M09Ic 2023-01-09 22:41:05 +08:00
parent a23643ebf0
commit 171786c51e
2 changed files with 6 additions and 4 deletions

View File

@ -520,6 +520,8 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
} else {
u = pkg.URLJoin(path.Dir(bl.Url.Path), u[2:])
}
} else if strings.HasPrefix(u, "../") {
u = path.Join(path.Dir(bl.Url.Path), u)
} else if !strings.HasPrefix(u, "http") {
// 相对目录拼接
if bl.Dir {

View File

@ -30,7 +30,7 @@ var (
// URLRegexps holds the patterns, compiled with regexp.MustCompile when the
// variable is initialised. Every pattern captures the candidate URL or path in
// a parenthesised group; the last pattern has two alternative groups, one for
// href and one for action.
// NOTE(review): the code that consumes these patterns is not visible in this
// excerpt — presumably the crawler runs them over response bodies; confirm.
URLRegexps []*regexp.Regexp = []*regexp.Regexp{
// Quoted absolute URLs starting with "http:" or "https:".
regexp.MustCompile(`["'‘“]\s{0,6}(https{0,1}:[^\s^,^'^^"^”^>^<^),^(]{2,250}?)\s{0,6}["'‘“]`),
// "http:" / "https:" URLs that follow an '=' sign.
regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250})`),
// Quoted "name.ext" tokens. This commit view lists the previous line followed
// by its replacement; the replacement additionally allows '/' in the name part.
regexp.MustCompile(`["'](\w{2,250}?\.\w{2,4}?)["']`),
regexp.MustCompile(`["']([\w/]{2,250}?\.\w{2,4}?)["']`),
// Quoted paths containing a '/', optionally preceded by up to two of '#', ',' or '.'.
regexp.MustCompile(`["'‘“]\s{0,6}([#,.]{0,2}/[^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250}?)\s{0,6}["'‘“]`),
// Values of href= or action= attributes, with or without an opening quote.
regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s^'^,^^"^”^>^<^;^(^)^|^*^\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s^'^^"^“^>^<^)^(]{2,250})`),
}
@ -226,7 +226,7 @@ func FingerDetect(content string) Frameworks {
}
var (
// BadExt lists file extensions that filterUrl compares against
// path.Ext(parsed.Path); a match makes filterUrl return true.
// This commit view lists the previous line followed by its replacement: the
// replacement changes "," to ".," and adds ".swf" and ".pdf".
BadExt = []string{".js", ".css", ".scss", ",", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts"}
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf"}
// BadURL lists string fragments associated with unwanted URLs.
// NOTE(review): its use is not visible in this excerpt — presumably a URL
// containing any of these fragments is filtered out; verify against the caller.
BadURL = []string{";", "}", "{", "www.w3.org", "example.com", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
)
@ -249,7 +249,7 @@ func filterUrl(u string) bool {
} else {
ext := path.Ext(parsed.Path)
for _, e := range BadExt {
if e == ext {
if strings.EqualFold(e, ext) {
return true
}
}
@ -269,7 +269,7 @@ func formatURL(u string) string {
}
func commonFilter(u string) bool {
if strings.HasPrefix(u, "http") && len(u) < 9 {
if strings.HasPrefix(u, "http") && len(u) < 15 {
return true
}