对爬虫进行大量优化, 包括优化性能, 去重, 优化目录拼接, 适配"./"相对目录

fuzzy会在开启debug时自动启用.
This commit is contained in:
M09Ic 2023-01-09 21:33:05 +08:00
parent f8b84c733b
commit f24c7b3bc6
6 changed files with 105 additions and 68 deletions

View File

@ -232,14 +232,15 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
logs.Log.Importantf("Loaded %d word from %s", len(dicts[i]), f) logs.Log.Importantf("Loaded %d word from %s", len(dicts[i]), f)
} }
if len(opt.Dictionaries) > 0 && opt.Word == "" { if len(opt.Dictionaries) == 0 && opt.Word == "" {
// 用来仅使用高级功能下, 防止无字典报错.
opt.Word = "/"
} else {
opt.Word = "{?" opt.Word = "{?"
for i, _ := range dicts { for i, _ := range dicts {
opt.Word += strconv.Itoa(i) opt.Word += strconv.Itoa(i)
} }
opt.Word += "}" opt.Word += "}"
} else {
opt.Word = "/"
} }
if opt.Suffixes != nil { if opt.Suffixes != nil {
@ -376,7 +377,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
if opt.RemoveExtensions != "" { if opt.RemoveExtensions != "" {
rexts := strings.Split(opt.ExcludeExtensions, ",") rexts := strings.Split(opt.ExcludeExtensions, ",")
r.Fns = append(r.Fns, func(s string) string { r.Fns = append(r.Fns, func(s string) string {
if ext := parseExtension(s); StringsContains(rexts, ext) { if ext := parseExtension(s); pkg.StringsContains(rexts, ext) {
return strings.TrimSuffix(s, "."+ext) return strings.TrimSuffix(s, "."+ext)
} }
return s return s
@ -386,7 +387,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
if opt.ExcludeExtensions != "" { if opt.ExcludeExtensions != "" {
exexts := strings.Split(opt.ExcludeExtensions, ",") exexts := strings.Split(opt.ExcludeExtensions, ",")
r.Fns = append(r.Fns, func(s string) string { r.Fns = append(r.Fns, func(s string) string {
if ext := parseExtension(s); StringsContains(exexts, ext) { if ext := parseExtension(s); pkg.StringsContains(exexts, ext) {
return "" return ""
} }
return s return s

View File

@ -221,12 +221,6 @@ func (pool *Pool) Run(ctx context.Context, offset, limit int) {
} }
closeCh := make(chan struct{}) closeCh := make(chan struct{})
//go func() {
// select {
// case <-worderDone:
// closeCh <- struct{}{}
// }
//}()
var worderDone bool var worderDone bool
wait := func() { wait := func() {
if !worderDone { if !worderDone {
@ -279,9 +273,6 @@ Loop:
} }
pool.wg.Wait() pool.wg.Wait()
for pool.analyzeDone {
time.Sleep(time.Duration(100) * time.Millisecond)
}
pool.Statistor.EndTime = time.Now().Unix() pool.Statistor.EndTime = time.Now().Unix()
pool.Close() pool.Close()
} }
@ -345,8 +336,8 @@ func (pool *Pool) Invoke(v interface{}) {
bl.Collect() bl.Collect()
pool.locker.Lock() pool.locker.Lock()
pool.random = bl pool.random = bl
pool.locker.Unlock()
pool.addFuzzyBaseline(bl) pool.addFuzzyBaseline(bl)
pool.locker.Unlock()
pool.initwg.Done() pool.initwg.Done()
case InitIndexSource: case InitIndexSource:
bl.Collect() bl.Collect()
@ -354,10 +345,10 @@ func (pool *Pool) Invoke(v interface{}) {
pool.index = bl pool.index = bl
pool.locker.Unlock() pool.locker.Unlock()
pool.wg.Add(1) pool.wg.Add(1)
pool.doCrawl(bl)
if bl.Status == 200 || (bl.Status/100) == 3 { if bl.Status == 200 || (bl.Status/100) == 3 {
pool.OutputCh <- bl pool.OutputCh <- bl
} }
pool.doCrawl(bl)
pool.initwg.Done() pool.initwg.Done()
case CheckSource: case CheckSource:
if bl.ErrString != "" { if bl.ErrString != "" {
@ -399,7 +390,7 @@ func (pool *Pool) Invoke(v interface{}) {
func (pool *Pool) PreCompare(resp *ihttp.Response) error { func (pool *Pool) PreCompare(resp *ihttp.Response) error {
status := resp.StatusCode() status := resp.StatusCode()
if IntsContains(WhiteStatus, status) { if pkg.IntsContains(WhiteStatus, status) {
// 如果为白名单状态码则直接返回 // 如果为白名单状态码则直接返回
return nil return nil
} }
@ -407,11 +398,11 @@ func (pool *Pool) PreCompare(resp *ihttp.Response) error {
return ErrSameStatus return ErrSameStatus
} }
if IntsContains(BlackStatus, status) { if pkg.IntsContains(BlackStatus, status) {
return ErrBadStatus return ErrBadStatus
} }
if IntsContains(WAFStatus, status) { if pkg.IntsContains(WAFStatus, status) {
return ErrWaf return ErrWaf
} }
@ -505,17 +496,47 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
return return
} }
bl.CollectURL() bl.CollectURL()
if bl.URLs == nil {
pool.wg.Done()
return
}
go func() { go func() {
defer pool.wg.Done() defer pool.wg.Done()
for _, u := range bl.URLs { for _, u := range bl.URLs {
if strings.HasPrefix(u, "//") { if strings.HasPrefix(u, "//") {
u = bl.Url.Scheme + u parsed, err := url.Parse(u)
if err != nil {
continue
}
if parsed.Host != bl.Url.Host {
continue
}
u = parsed.Path
} else if strings.HasPrefix(u, "/") { } else if strings.HasPrefix(u, "/") {
// 绝对目录拼接 // 绝对目录拼接
u = pkg.URLJoin(pool.BaseURL, u) // 不需要进行处理, 用来跳过下面的判断
} else if strings.HasPrefix(u, "./") {
// "./"相对目录拼接
if bl.Dir {
u = pkg.URLJoin(bl.Url.Path, u[2:])
} else {
u = pkg.URLJoin(path.Dir(bl.Url.Path), u[2:])
}
} else if !strings.HasPrefix(u, "http") { } else if !strings.HasPrefix(u, "http") {
// 相对目录拼接 // 相对目录拼接
u = pkg.URLJoin(pool.BaseURL, u) if bl.Dir {
u = pkg.URLJoin(bl.Url.Path, u)
} else {
u = pkg.URLJoin(path.Dir(bl.Url.Path), u)
}
} else {
parsed, err := url.Parse(u)
if err != nil {
continue
}
if parsed.Host != bl.Url.Host {
continue
}
} }
if _, ok := pool.urls[u]; ok { if _, ok := pool.urls[u]; ok {
@ -526,17 +547,9 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
pool.urls[u] = 1 pool.urls[u] = 1
pool.locker.Unlock() pool.locker.Unlock()
if bl.ReqDepth < maxCrawl { if bl.ReqDepth < maxCrawl {
parsed, err := url.Parse(u)
if err != nil {
continue
}
if parsed.Host != bl.Url.Host {
// 自动限定scoop, 防止爬到其他网站
continue
}
pool.wg.Add(1) pool.wg.Add(1)
pool.addAddition(&Unit{ pool.addAddition(&Unit{
path: parsed.Path, path: u[1:],
source: CrawlSource, source: CrawlSource,
depth: bl.ReqDepth + 1, depth: bl.ReqDepth + 1,
}) })
@ -645,7 +658,7 @@ func (pool *Pool) addAddition(u *Unit) {
} }
func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) { func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
if _, ok := pool.baselines[bl.Status]; !ok && IntsContains(FuzzyStatus, bl.Status) { if _, ok := pool.baselines[bl.Status]; !ok && pkg.IntsContains(FuzzyStatus, bl.Status) {
bl.Collect() bl.Collect()
pool.wg.Add(1) pool.wg.Add(1)
pool.doCrawl(bl) pool.doCrawl(bl)

View File

@ -326,6 +326,13 @@ func (r *Runner) Done() {
} }
func (r *Runner) Outputting() { func (r *Runner) Outputting() {
debugPrint := func(bl *pkg.Baseline) {
if r.Color {
logs.Log.Debug(bl.ColorString())
} else {
logs.Log.Debug(bl.String())
}
}
go func() { go func() {
var saveFunc func(*pkg.Baseline) var saveFunc func(*pkg.Baseline)
@ -355,7 +362,6 @@ func (r *Runner) Outputting() {
logs.Log.Console("[+] " + bl.String() + "\n") logs.Log.Console("[+] " + bl.String() + "\n")
} }
} }
} }
} }
@ -375,11 +381,7 @@ func (r *Runner) Outputting() {
r.AddPool(&Task{baseUrl: bl.UrlString, depth: bl.RecuDepth + 1}) r.AddPool(&Task{baseUrl: bl.UrlString, depth: bl.RecuDepth + 1})
} }
} else { } else {
if r.Color { debugPrint(bl)
logs.Log.Debug(bl.ColorString())
} else {
logs.Log.Debug(bl.String())
}
} }
} }
} }
@ -411,6 +413,8 @@ func (r *Runner) Outputting() {
} }
if r.Fuzzy { if r.Fuzzy {
fuzzySaveFunc(bl) fuzzySaveFunc(bl)
} else {
debugPrint(bl)
} }
} }
} }

View File

@ -16,24 +16,6 @@ func parseExtension(s string) string {
return "" return ""
} }
// StringsContains reports whether e occurs in s.
// A nil or empty slice contains nothing.
func StringsContains(s []string, e string) bool {
	for i := range s {
		if s[i] == e {
			return true
		}
	}
	return false
}
// IntsContains reports whether e occurs in s.
// A nil or empty slice contains nothing.
func IntsContains(s []int, e int) bool {
	for i := range s {
		if s[i] == e {
			return true
		}
	}
	return false
}
func loadFileToSlice(filename string) ([]string, error) { func loadFileToSlice(filename string) ([]string, error) {
var ss []string var ss []string
content, err := ioutil.ReadFile(filename) content, err := ioutil.ReadFile(filename)

View File

@ -18,9 +18,9 @@ func GetSourceName(s int) string {
case 1: case 1:
return "check" return "check"
case 2: case 2:
return "index"
case 3:
return "random" return "random"
case 3:
return "index"
case 4: case 4:
return "redirect" return "redirect"
case 5: case 5:
@ -177,7 +177,7 @@ func (bl *Baseline) CollectURL() {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
if !filterJs(u[1]) { if !filterJs(u[1]) {
bl.URLs = append(bl.URLs, u[1]) bl.URLs = append(bl.URLs, formatURL(u[1]))
} }
} }
} }
@ -186,7 +186,7 @@ func (bl *Baseline) CollectURL() {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
if !filterUrl(u[1]) { if !filterUrl(u[1]) {
bl.URLs = append(bl.URLs, u[1]) bl.URLs = append(bl.URLs, formatURL(u[1]))
} }
} }
} }
@ -194,7 +194,7 @@ func (bl *Baseline) CollectURL() {
if bl.URLs != nil { if bl.URLs != nil {
bl.Extracteds = append(bl.Extracteds, &fingers.Extracted{ bl.Extracteds = append(bl.Extracteds, &fingers.Extracted{
Name: "crawl", Name: "crawl",
ExtractResult: bl.URLs, ExtractResult: RemoveDuplication(bl.URLs),
}) })
} }
} }

View File

@ -77,6 +77,22 @@ func IntsContains(s []int, e int) bool {
return false return false
} }
// RemoveDuplication removes repeated strings from arr in place,
// keeping the first occurrence of each value and preserving order.
// The returned slice reuses arr's backing array.
func RemoveDuplication(arr []string) []string {
	seen := make(map[string]struct{}, len(arr))
	out := arr[:0]
	for _, v := range arr {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		out = append(out, v)
	}
	return out
}
func HasStdin() bool { func HasStdin() bool {
stat, err := os.Stdin.Stat() stat, err := os.Stdin.Stat()
if err != nil { if err != nil {
@ -210,20 +226,22 @@ func FingerDetect(content string) Frameworks {
var ( var (
BadExt = []string{".js", ".css", ".scss", ",", ".jpeg", ".jpg", ".png", ".gif", ".ico", ".svg", ".vue", ".ts"} BadExt = []string{".js", ".css", ".scss", ",", ".jpeg", ".jpg", ".png", ".gif", ".ico", ".svg", ".vue", ".ts"}
//BadURL = []string{".js?", ".css?", ".jpeg?", ".jpg?", ".png?", ".gif?", "github.com", "www.w3.org", "example.com", "<", ">", "{", "}", "[", "]", "|", "^", ";", "/js/", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*", "\\n"} BadURL = []string{"www.w3.org", "example.com", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
BadScoop = []string{"www.w3.org", "example.com"}
) )
func filterJs(u string) bool { func filterJs(u string) bool {
for _, scoop := range BadScoop { if commonFilter(u) {
if strings.Contains(u, scoop) { return true
return true
}
} }
return false return false
} }
func filterUrl(u string) bool { func filterUrl(u string) bool {
if commonFilter(u) {
return true
}
parsed, err := url.Parse(u) parsed, err := url.Parse(u)
if err != nil { if err != nil {
return true return true
@ -235,7 +253,26 @@ func filterUrl(u string) bool {
} }
} }
} }
for _, scoop := range BadScoop { return false
}
// formatURL strips the query string and fragment from u, returning
// only the part before the first '?' or '#'. This avoids a later
// url.Parse on every candidate and prevents parameters from causing
// unexpected matches during crawling.
func formatURL(u string) string {
	// Cut at whichever of '?' or '#' appears first. Checking them
	// independently in a fixed order (the previous behavior) left the
	// fragment attached when '#' preceded '?', e.g. "/a#f?x" -> "/a#f".
	if i := strings.IndexAny(u, "?#"); i != -1 {
		return u[:i]
	}
	return u
}
func commonFilter(u string) bool {
if strings.HasPrefix(u, "http") && len(u) < 9 {
return true
}
for _, scoop := range BadURL {
if strings.Contains(u, scoop) { if strings.Contains(u, scoop) {
return true return true
} }