From a3082d3f1af7f0f4f0e6a8e631eb04bec7dddd0d Mon Sep 17 00:00:00 2001 From: M09Ic Date: Fri, 6 Jan 2023 04:18:21 +0800 Subject: [PATCH] =?UTF-8?q?=E9=80=9A=E8=BF=87fasthttp=E5=A4=8D=E7=94=A8buf?= =?UTF-8?q?=E5=AF=BC=E8=87=B4=E7=9A=84=E6=95=B0=E6=8D=AE=E5=BC=95=E7=94=A8?= =?UTF-8?q?=E9=94=99=E8=AF=AF=E7=9A=84bug=20=E4=BC=98=E5=8C=96=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- go.mod | 5 +- go.sum | 33 ++--- internal/pool.go | 306 +++++++++++++++++++++++---------------------- internal/runner.go | 1 - pkg/baseline.go | 4 +- 5 files changed, 171 insertions(+), 178 deletions(-) diff --git a/go.mod b/go.mod index b41ead6..a8cb9a7 100644 --- a/go.mod +++ b/go.mod @@ -14,10 +14,9 @@ require ( require ( github.com/antonmedv/expr v1.9.0 - github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c github.com/gosuri/uiprogress v0.0.1 github.com/jessevdk/go-flags v1.5.0 - github.com/panjf2000/ants/v2 v2.5.0 + github.com/panjf2000/ants/v2 v2.7.0 github.com/valyala/fasthttp v1.43.0 sigs.k8s.io/yaml v1.3.0 ) @@ -25,6 +24,7 @@ require ( require ( github.com/andybalholm/brotli v1.0.4 // indirect github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5 // indirect + github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c // indirect github.com/go-dedup/text v0.0.0-20170907015346-8bb1b95e3cb7 // indirect github.com/gosuri/uilive v0.0.4 // indirect github.com/klauspost/compress v1.15.10 // indirect @@ -33,5 +33,4 @@ require ( github.com/valyala/bytebufferpool v1.0.0 // indirect golang.org/x/sys v0.2.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 0c2bf43..6a14b1e 100644 --- a/go.sum +++ b/go.sum @@ -17,32 +17,10 @@ github.com/chainreactors/ipcs v0.0.9/go.mod h1:E9M3Ohyq0TYQLlV4i2dbM9ThBZB1Nnd7O github.com/chainreactors/ipcs v0.0.13 h1:TZww7XRr4qZPWqy9DjBzcJgxtSUwT4TAbcho4156bRI= github.com/chainreactors/ipcs v0.0.13/go.mod h1:E9M3Ohyq0TYQLlV4i2dbM9ThBZB1Nnd7Oexoie2xLII= github.com/chainreactors/logs v0.6.1/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA= -github.com/chainreactors/logs v0.6.2 h1:Yz5oayjwxO6KkjfjnmtT5WKbWjTaBdttFcneaFTpBe0= -github.com/chainreactors/logs v0.6.2/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA= -github.com/chainreactors/logs v0.7.1-0.20221214130332-9bc5319887fe h1:FRMBKyuuh6EoHefqprP+pSblHrUxTaSp9GPJahYa+Fc= -github.com/chainreactors/logs v0.7.1-0.20221214130332-9bc5319887fe/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA= -github.com/chainreactors/logs v0.7.1-0.20221214130646-2e08f98a1f71 h1:SpyPYjRihGyBiqoMUggXzCc4t9A0tmAvYdjghDG8s+M= -github.com/chainreactors/logs v0.7.1-0.20221214130646-2e08f98a1f71/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA= -github.com/chainreactors/logs v0.7.1-0.20221214152543-60422cf64610 h1:ErODIlY9NmlrwEi6np3bm7HmuRZSaH3+ID2fJ2ViUpM= -github.com/chainreactors/logs v0.7.1-0.20221214152543-60422cf64610/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA= github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580 h1:28gbL1t+Mm4AoP1MeKM9oeSHoPcUwIrzrLtmdusHMIo= github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA= github.com/chainreactors/parsers v0.2.9-0.20221210155102-cc0814762410 h1:K7EV0wtUuN6Rvh/MgqaBXyElD3guPsgNR5kF8nrV7iw= github.com/chainreactors/parsers v0.2.9-0.20221210155102-cc0814762410/go.mod h1:Z9weht+lnFCk7UcwqFu6lXpS7u5vttiy0AJYOAyCCLA= -github.com/chainreactors/words v0.3.2-0.20221212161820-bae5f18558db h1:Rv6mcLAKXRXoZuifCwGTlXnuDbDpbDKC0JsTI1op/OA= -github.com/chainreactors/words v0.3.2-0.20221212161820-bae5f18558db/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= -github.com/chainreactors/words v0.3.2-0.20221214061028-a7cf9f9f8ddb h1:9AV8SH+SvEqmcylzZMeWei5NYIhl/0hMR7Y269M0Eqw= -github.com/chainreactors/words v0.3.2-0.20221214061028-a7cf9f9f8ddb/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= -github.com/chainreactors/words v0.3.2-0.20221214062855-48dff09b01ad h1:uL3TIQgvFY7dLoX0tAzIIXilCPIcNeLz/124gs+SA/Q= -github.com/chainreactors/words v0.3.2-0.20221214062855-48dff09b01ad/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= -github.com/chainreactors/words v0.3.2-0.20221214154622-381fc37abdf9 h1:IUNopSuorfINmn4pOuSwZtxJbg8zsRIZ67a33SiYoQ0= -github.com/chainreactors/words v0.3.2-0.20221214154622-381fc37abdf9/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= -github.com/chainreactors/words v0.3.2-0.20230105095023-67f7d4e9186a h1:NoFfxJfPXiS2fzdmRIzWj4K+V7BRC2BAXlxQfckTeN0= -github.com/chainreactors/words v0.3.2-0.20230105095023-67f7d4e9186a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= -github.com/chainreactors/words v0.3.2-0.20230105160347-858217c41ce5 h1:uTiOYpgf4jz+/uwp+kAliLrOkVXjsC51pNmd4xH0uB4= -github.com/chainreactors/words v0.3.2-0.20230105160347-858217c41ce5/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= -github.com/chainreactors/words v0.3.2-0.20230105161438-ec98bdc6906d h1:vWZwr3IaoEGEGE/IB0Im4gDqrOHpGK3szKOFDG4GFrc= -github.com/chainreactors/words v0.3.2-0.20230105161438-ec98bdc6906d/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a h1:vRAMDJ6UQV73uyiRBQnuE/+S7Q7JTpfubSpyRlooZ2U= github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= github.com/davecgh/go-spew v0.0.0-20161028175848-04cdfd42973b/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -72,8 +50,9 @@ github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peK github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-runewidth v0.0.8/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= -github.com/panjf2000/ants/v2 v2.5.0 h1:1rWGWSnxCsQBga+nQbA4/iY6VMeNoOIAM0ZWh9u3q2Q= github.com/panjf2000/ants/v2 v2.5.0/go.mod h1:cU93usDlihJZ5CfRGNDYsiBYvoilLvBF5Qp/BT2GNRE= +github.com/panjf2000/ants/v2 v2.7.0 h1:Y3Bgpfo9HDkBoHNVFbMfY5mAvi5TAA17y3HbzQ74p5Y= +github.com/panjf2000/ants/v2 v2.7.0/go.mod h1:KIBmYG9QQX5U2qzFP/yQJaq/nSb6rahS9iEHkrCMgM8= github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= @@ -81,10 +60,14 @@ github.com/rivo/tview v0.0.0-20200219210816-cd38d7432498/go.mod h1:6lkG1x+13OShE github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v0.0.0-20161117074351-18a02ba4a312/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/twmb/murmur3 v1.1.6 h1:mqrRot1BRxm+Yct+vavLMou2/iJt0tNVTTC0QoIjaZg= github.com/twmb/murmur3 v1.1.6/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ= github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= @@ -99,6 +82,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190626150813-e07cf5db2756/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/internal/pool.go b/internal/pool.go index 935cac8..796b98a 100644 --- a/internal/pool.go +++ b/internal/pool.go @@ -47,111 +47,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { failedCount: 1, } - p, _ := ants.NewPoolWithFunc(config.Thread, func(i interface{}) { - atomic.AddInt32(&pool.Statistor.ReqTotal, 1) - unit := i.(*Unit) - req, err := pool.genReq(unit.path) - if err != nil { - logs.Log.Error(err.Error()) - return - } - req.SetHeaders(pool.Headers) - - start := time.Now() - resp, reqerr := pool.client.Do(pctx, req) - if pool.ClientType == ihttp.FAST { - defer fasthttp.ReleaseResponse(resp.FastResponse) - defer fasthttp.ReleaseRequest(req.FastRequest) - } - - // compare与各种错误处理 - var bl *pkg.Baseline - if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge { - pool.failedCount++ - atomic.AddInt32(&pool.Statistor.FailedNumber, 1) - bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()} - pool.failedBaselines = append(pool.failedBaselines, bl) - } else { - if unit.source <= 3 || unit.source == CrawlSource { - bl = pkg.NewBaseline(req.URI(), req.Host(), resp) - } else { - if pool.MatchExpr != nil { - // 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中 - bl = pkg.NewBaseline(req.URI(), req.Host(), resp) - } else if err = pool.PreCompare(resp); err == nil { - // 通过预对比跳过一些无用数据, 减少性能消耗 - bl = pkg.NewBaseline(req.URI(), req.Host(), resp) - if err != ErrRedirect && bl.RedirectURL != "" { - if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") { - bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/") - bl.RedirectURL = pool.BaseURL + bl.RedirectURL - } - pool.wg.Add(1) - pool.doRedirect(bl, unit.depth) - } - pool.addFuzzyBaseline(bl) - } else { - bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error()) - } - } - } - - if bl.BodyLength > ihttp.DefaultMaxBodySize { - bl.ExceedLength = true - } - bl.Source = int(unit.source) - bl.ReqDepth = unit.depth - bl.Spended = time.Since(start).Milliseconds() - switch unit.source { - case InitRandomSource: - bl.Collect() - pool.random = bl - pool.addFuzzyBaseline(bl) - pool.initwg.Done() - case InitIndexSource: - bl.Collect() - pool.index = bl - pool.wg.Add(1) - pool.doCrawl(bl) - pool.initwg.Done() - case CheckSource: - if bl.ErrString != "" { - logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString) - } else if i := pool.random.Compare(bl); i < 1 { - if i == 0 { - if pool.Fuzzy { - logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String()) - } - } else { - pool.failedCount += 2 - logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String()) - pool.failedBaselines = append(pool.failedBaselines, bl) - } - } else { - pool.resetFailed() // 如果后续访问正常, 重置错误次数 - logs.Log.Debug("[check.pass] " + bl.String()) - } - - case WordSource: - // 异步进行性能消耗较大的深度对比 - pool.tempCh <- bl - pool.reqCount++ - if pool.reqCount%pool.CheckPeriod == 0 { - pool.reqCount++ - pool.doCheck() - } else if pool.failedCount%pool.ErrPeriod == 0 { - pool.failedCount++ - pool.doCheck() - } - pool.bar.Done() - case RedirectSource: - bl.FrontURL = unit.frontUrl - pool.tempCh <- bl - case CrawlSource, ActiveSource, RuleSource, BakSource: - pool.tempCh <- bl - } - - }) + p, _ := ants.NewPoolWithFunc(config.Thread, pool.Invoke) pool.reqPool = p // 挂起一个异步的处理结果线程, 不干扰主线程的请求并发 @@ -319,7 +215,6 @@ func (pool *Pool) Run(ctx context.Context, offset, limit int) { pool.closeCh <- struct{}{} } }() - Loop: for { select { @@ -370,6 +265,111 @@ Loop: pool.Close() } +func (pool *Pool) Invoke(v interface{}) { + atomic.AddInt32(&pool.Statistor.ReqTotal, 1) + unit := v.(*Unit) + req, err := pool.genReq(unit.path) + if err != nil { + logs.Log.Error(err.Error()) + return + } + req.SetHeaders(pool.Headers) + + start := time.Now() + resp, reqerr := pool.client.Do(pool.ctx, req) + if pool.ClientType == ihttp.FAST { + defer fasthttp.ReleaseResponse(resp.FastResponse) + defer fasthttp.ReleaseRequest(req.FastRequest) + } + + // compare与各种错误处理 + var bl *pkg.Baseline + if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge { + pool.failedCount++ + atomic.AddInt32(&pool.Statistor.FailedNumber, 1) + bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()} + pool.failedBaselines = append(pool.failedBaselines, bl) + } else { + if unit.source <= 3 || unit.source == CrawlSource { + bl = pkg.NewBaseline(req.URI(), req.Host(), resp) + } else { + if pool.MatchExpr != nil { + // 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中 + bl = pkg.NewBaseline(req.URI(), req.Host(), resp) + } else if err = pool.PreCompare(resp); err == nil { + // 通过预对比跳过一些无用数据, 减少性能消耗 + bl = pkg.NewBaseline(req.URI(), req.Host(), resp) + if err != ErrRedirect && bl.RedirectURL != "" { + if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") { + bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/") + bl.RedirectURL = pool.BaseURL + bl.RedirectURL + } + pool.wg.Add(1) + pool.doRedirect(bl, unit.depth) + } + pool.addFuzzyBaseline(bl) + } else { + bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error()) + } + } + } + + if bl.BodyLength > ihttp.DefaultMaxBodySize { + bl.ExceedLength = true + } + bl.Source = int(unit.source) + bl.ReqDepth = unit.depth + bl.Spended = time.Since(start).Milliseconds() + switch unit.source { + case InitRandomSource: + bl.Collect() + pool.random = bl + pool.addFuzzyBaseline(bl) + pool.initwg.Done() + case InitIndexSource: + bl.Collect() + pool.index = bl + pool.wg.Add(1) + pool.doCrawl(bl) + pool.initwg.Done() + case CheckSource: + if bl.ErrString != "" { + logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString) + } else if i := pool.random.Compare(bl); i < 1 { + if i == 0 { + if pool.Fuzzy { + logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String()) + } + } else { + pool.failedCount += 2 + logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String()) + pool.failedBaselines = append(pool.failedBaselines, bl) + } + } else { + pool.resetFailed() // 如果后续访问正常, 重置错误次数 + logs.Log.Debug("[check.pass] " + bl.String()) + } + + case WordSource: + // 异步进行性能消耗较大的深度对比 + pool.tempCh <- bl + pool.reqCount++ + if pool.reqCount%pool.CheckPeriod == 0 { + pool.reqCount++ + pool.doCheck() + } else if pool.failedCount%pool.ErrPeriod == 0 { + pool.failedCount++ + pool.doCheck() + } + pool.bar.Done() + case RedirectSource: + bl.FrontURL = unit.frontUrl + pool.tempCh <- bl + case CrawlSource, ActiveSource, RuleSource, BakSource: + pool.tempCh <- bl + } +} + func (pool *Pool) PreCompare(resp *ihttp.Response) error { status := resp.StatusCode() if IntsContains(WhiteStatus, status) { @@ -463,75 +463,83 @@ func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) { if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() { pool.wg.Add(1) - pool.additionCh <- &Unit{ + go pool.addAddition(&Unit{ path: uu.Path, source: RedirectSource, frontUrl: bl.UrlString, depth: depth + 1, - } + }) } } func (pool *Pool) doCrawl(bl *pkg.Baseline) { - defer pool.wg.Done() if !pool.Crawl { + pool.wg.Done() return } bl.CollectURL() - for _, u := range bl.URLs { - if strings.HasPrefix(u, "//") { - u = bl.Url.Scheme + u - } else if strings.HasPrefix(u, "/") { - // 绝对目录拼接 - u = pkg.URLJoin(pool.BaseURL, u) - } else if !strings.HasPrefix(u, "http") { - // 相对目录拼接 - u = pkg.URLJoin(pool.BaseURL, u) - } + go func() { + defer pool.wg.Done() + for _, u := range bl.URLs { + if strings.HasPrefix(u, "//") { + u = bl.Url.Scheme + u + } else if strings.HasPrefix(u, "/") { + // 绝对目录拼接 + u = pkg.URLJoin(pool.BaseURL, u) + } else if !strings.HasPrefix(u, "http") { + // 相对目录拼接 + u = pkg.URLJoin(pool.BaseURL, u) + } - if _, ok := pool.urls[u]; ok { - pool.urls[u]++ - } else { - // 通过map去重, 只有新的url才会进入到该逻辑 - pool.locker.Lock() - pool.urls[u] = 1 - pool.locker.Unlock() - if bl.ReqDepth < maxCrawl { - parsed, err := url.Parse(u) - if err != nil { - continue + if _, ok := pool.urls[u]; ok { + pool.urls[u]++ + } else { + // 通过map去重, 只有新的url才会进入到该逻辑 + pool.locker.Lock() + pool.urls[u] = 1 + pool.locker.Unlock() + if bl.ReqDepth < maxCrawl { + parsed, err := url.Parse(u) + if err != nil { + continue + } + if parsed.Host != bl.Url.Host { + // 自动限定scoop, 防止爬到其他网站 + continue + } + pool.wg.Add(1) + pool.addAddition(&Unit{ + path: parsed.Path, + source: CrawlSource, + depth: bl.ReqDepth + 1, + }) } - if parsed.Host != bl.Url.Host { - // 自动限定scoop, 防止爬到其他网站 - continue - } - pool.wg.Add(1) - go pool.addAddition(&Unit{ - path: parsed.Path, - source: CrawlSource, - depth: bl.ReqDepth + 1, - }) } } - } + }() + } func (pool *Pool) doRule(bl *pkg.Baseline) { - defer pool.wg.Done() if pool.AppendRule == nil { + pool.wg.Done() return } if bl.Source == int(RuleSource) || bl.Dir { + pool.wg.Done() return } - for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) { - pool.wg.Add(1) - go pool.addAddition(&Unit{ - path: path.Join(path.Dir(bl.Path), u), - source: RuleSource, - }) - } + go func() { + defer pool.wg.Done() + for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) { + pool.wg.Add(1) + pool.addAddition(&Unit{ + path: path.Join(path.Dir(bl.Path), u), + source: RuleSource, + }) + } + }() } func (pool *Pool) doActive() { diff --git a/internal/runner.go b/internal/runner.go index 30b2053..f332581 100644 --- a/internal/runner.go +++ b/internal/runner.go @@ -228,7 +228,6 @@ func (r *Runner) Prepare(ctx context.Context) error { } r.Done() }) - } if err != nil { diff --git a/pkg/baseline.go b/pkg/baseline.go index 0ca9035..06d0fbc 100644 --- a/pkg/baseline.go +++ b/pkg/baseline.go @@ -55,7 +55,9 @@ func NewBaseline(u, host string, resp *ihttp.Response) *Baseline { if resp.ClientType == ihttp.STANDARD { bl.Host = host } - bl.Body = resp.Body() + body := resp.Body() + bl.Body = make([]byte, len(body)) + copy(bl.Body, body) bl.BodyLength = resp.ContentLength() bl.Header = resp.Header() bl.HeaderLength = len(bl.Header)