优化crawl的正则表达式

This commit is contained in:
M09Ic 2023-01-10 01:30:05 +08:00
parent f87acdf657
commit b5da1eb45f

View File

@@ -28,10 +28,9 @@ var (
regexp.MustCompile(`=\s{0,6}["']{0,1}\s{0,6}(/{0,1}[^\s^',><;()|*\[]{2,250}?[^=,\s'"”>|<:;*()\[]{3}\.js)`),
}
URLRegexps []*regexp.Regexp = []*regexp.Regexp{
regexp.MustCompile(`["']\s{0,6}(https{0,1}:[^\s,'"”><)^(]{2,250}?)\s{0,6}["']`),
regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s',"”><;()|*\[]{2,250})`),
regexp.MustCompile(`["']([^\s',"”><;()|*\[]{2,250}\.[a-zA-Z]\w{1,3})["']`),
regexp.MustCompile(`["'](https?:[^\s',"”><;()|*\[]{2,250}?)["']`),
regexp.MustCompile(`["'](https?:[^\s',"”><;()|*\[]{2,250}?\.[^\s',"”><;()|*\[]{2,250}?)["']`),
regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s',"”><;()|*\[]{2,250}?)\s{0,6}["']`),
regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',"”><;()|*\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'"“><)(]{2,250})`),
}