修复fasthttp复用buf导致的数据引用错误的bug

优化代码结构
This commit is contained in:
M09Ic 2023-01-06 04:18:21 +08:00
parent 806f6355d0
commit a3082d3f1a
5 changed files with 171 additions and 178 deletions

5
go.mod
View File

@ -14,10 +14,9 @@ require (
require (
github.com/antonmedv/expr v1.9.0
github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c
github.com/gosuri/uiprogress v0.0.1
github.com/jessevdk/go-flags v1.5.0
github.com/panjf2000/ants/v2 v2.5.0
github.com/panjf2000/ants/v2 v2.7.0
github.com/valyala/fasthttp v1.43.0
sigs.k8s.io/yaml v1.3.0
)
@ -25,6 +24,7 @@ require (
require (
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5 // indirect
github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c // indirect
github.com/go-dedup/text v0.0.0-20170907015346-8bb1b95e3cb7 // indirect
github.com/gosuri/uilive v0.0.4 // indirect
github.com/klauspost/compress v1.15.10 // indirect
@ -33,5 +33,4 @@ require (
github.com/valyala/bytebufferpool v1.0.0 // indirect
golang.org/x/sys v0.2.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

33
go.sum
View File

@ -17,32 +17,10 @@ github.com/chainreactors/ipcs v0.0.9/go.mod h1:E9M3Ohyq0TYQLlV4i2dbM9ThBZB1Nnd7O
github.com/chainreactors/ipcs v0.0.13 h1:TZww7XRr4qZPWqy9DjBzcJgxtSUwT4TAbcho4156bRI=
github.com/chainreactors/ipcs v0.0.13/go.mod h1:E9M3Ohyq0TYQLlV4i2dbM9ThBZB1Nnd7Oexoie2xLII=
github.com/chainreactors/logs v0.6.1/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
github.com/chainreactors/logs v0.6.2 h1:Yz5oayjwxO6KkjfjnmtT5WKbWjTaBdttFcneaFTpBe0=
github.com/chainreactors/logs v0.6.2/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
github.com/chainreactors/logs v0.7.1-0.20221214130332-9bc5319887fe h1:FRMBKyuuh6EoHefqprP+pSblHrUxTaSp9GPJahYa+Fc=
github.com/chainreactors/logs v0.7.1-0.20221214130332-9bc5319887fe/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
github.com/chainreactors/logs v0.7.1-0.20221214130646-2e08f98a1f71 h1:SpyPYjRihGyBiqoMUggXzCc4t9A0tmAvYdjghDG8s+M=
github.com/chainreactors/logs v0.7.1-0.20221214130646-2e08f98a1f71/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
github.com/chainreactors/logs v0.7.1-0.20221214152543-60422cf64610 h1:ErODIlY9NmlrwEi6np3bm7HmuRZSaH3+ID2fJ2ViUpM=
github.com/chainreactors/logs v0.7.1-0.20221214152543-60422cf64610/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580 h1:28gbL1t+Mm4AoP1MeKM9oeSHoPcUwIrzrLtmdusHMIo=
github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
github.com/chainreactors/parsers v0.2.9-0.20221210155102-cc0814762410 h1:K7EV0wtUuN6Rvh/MgqaBXyElD3guPsgNR5kF8nrV7iw=
github.com/chainreactors/parsers v0.2.9-0.20221210155102-cc0814762410/go.mod h1:Z9weht+lnFCk7UcwqFu6lXpS7u5vttiy0AJYOAyCCLA=
github.com/chainreactors/words v0.3.2-0.20221212161820-bae5f18558db h1:Rv6mcLAKXRXoZuifCwGTlXnuDbDpbDKC0JsTI1op/OA=
github.com/chainreactors/words v0.3.2-0.20221212161820-bae5f18558db/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20221214061028-a7cf9f9f8ddb h1:9AV8SH+SvEqmcylzZMeWei5NYIhl/0hMR7Y269M0Eqw=
github.com/chainreactors/words v0.3.2-0.20221214061028-a7cf9f9f8ddb/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20221214062855-48dff09b01ad h1:uL3TIQgvFY7dLoX0tAzIIXilCPIcNeLz/124gs+SA/Q=
github.com/chainreactors/words v0.3.2-0.20221214062855-48dff09b01ad/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20221214154622-381fc37abdf9 h1:IUNopSuorfINmn4pOuSwZtxJbg8zsRIZ67a33SiYoQ0=
github.com/chainreactors/words v0.3.2-0.20221214154622-381fc37abdf9/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20230105095023-67f7d4e9186a h1:NoFfxJfPXiS2fzdmRIzWj4K+V7BRC2BAXlxQfckTeN0=
github.com/chainreactors/words v0.3.2-0.20230105095023-67f7d4e9186a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20230105160347-858217c41ce5 h1:uTiOYpgf4jz+/uwp+kAliLrOkVXjsC51pNmd4xH0uB4=
github.com/chainreactors/words v0.3.2-0.20230105160347-858217c41ce5/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20230105161438-ec98bdc6906d h1:vWZwr3IaoEGEGE/IB0Im4gDqrOHpGK3szKOFDG4GFrc=
github.com/chainreactors/words v0.3.2-0.20230105161438-ec98bdc6906d/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a h1:vRAMDJ6UQV73uyiRBQnuE/+S7Q7JTpfubSpyRlooZ2U=
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/davecgh/go-spew v0.0.0-20161028175848-04cdfd42973b/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -72,8 +50,9 @@ github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peK
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mattn/go-runewidth v0.0.8/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/panjf2000/ants/v2 v2.5.0 h1:1rWGWSnxCsQBga+nQbA4/iY6VMeNoOIAM0ZWh9u3q2Q=
github.com/panjf2000/ants/v2 v2.5.0/go.mod h1:cU93usDlihJZ5CfRGNDYsiBYvoilLvBF5Qp/BT2GNRE=
github.com/panjf2000/ants/v2 v2.7.0 h1:Y3Bgpfo9HDkBoHNVFbMfY5mAvi5TAA17y3HbzQ74p5Y=
github.com/panjf2000/ants/v2 v2.7.0/go.mod h1:KIBmYG9QQX5U2qzFP/yQJaq/nSb6rahS9iEHkrCMgM8=
github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@ -81,10 +60,14 @@ github.com/rivo/tview v0.0.0-20200219210816-cd38d7432498/go.mod h1:6lkG1x+13OShE
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v0.0.0-20161117074351-18a02ba4a312/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/twmb/murmur3 v1.1.6 h1:mqrRot1BRxm+Yct+vavLMou2/iJt0tNVTTC0QoIjaZg=
github.com/twmb/murmur3 v1.1.6/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
@ -99,6 +82,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190626150813-e07cf5db2756/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=

View File

@ -47,111 +47,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
failedCount: 1,
}
p, _ := ants.NewPoolWithFunc(config.Thread, func(i interface{}) {
atomic.AddInt32(&pool.Statistor.ReqTotal, 1)
unit := i.(*Unit)
req, err := pool.genReq(unit.path)
if err != nil {
logs.Log.Error(err.Error())
return
}
req.SetHeaders(pool.Headers)
start := time.Now()
resp, reqerr := pool.client.Do(pctx, req)
if pool.ClientType == ihttp.FAST {
defer fasthttp.ReleaseResponse(resp.FastResponse)
defer fasthttp.ReleaseRequest(req.FastRequest)
}
// compare与各种错误处理
var bl *pkg.Baseline
if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge {
pool.failedCount++
atomic.AddInt32(&pool.Statistor.FailedNumber, 1)
bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()}
pool.failedBaselines = append(pool.failedBaselines, bl)
} else {
if unit.source <= 3 || unit.source == CrawlSource {
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
} else {
if pool.MatchExpr != nil {
// 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
} else if err = pool.PreCompare(resp); err == nil {
// 通过预对比跳过一些无用数据, 减少性能消耗
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
if err != ErrRedirect && bl.RedirectURL != "" {
if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") {
bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/")
bl.RedirectURL = pool.BaseURL + bl.RedirectURL
}
pool.wg.Add(1)
pool.doRedirect(bl, unit.depth)
}
pool.addFuzzyBaseline(bl)
} else {
bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
}
}
}
if bl.BodyLength > ihttp.DefaultMaxBodySize {
bl.ExceedLength = true
}
bl.Source = int(unit.source)
bl.ReqDepth = unit.depth
bl.Spended = time.Since(start).Milliseconds()
switch unit.source {
case InitRandomSource:
bl.Collect()
pool.random = bl
pool.addFuzzyBaseline(bl)
pool.initwg.Done()
case InitIndexSource:
bl.Collect()
pool.index = bl
pool.wg.Add(1)
pool.doCrawl(bl)
pool.initwg.Done()
case CheckSource:
if bl.ErrString != "" {
logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString)
} else if i := pool.random.Compare(bl); i < 1 {
if i == 0 {
if pool.Fuzzy {
logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String())
}
} else {
pool.failedCount += 2
logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String())
pool.failedBaselines = append(pool.failedBaselines, bl)
}
} else {
pool.resetFailed() // 如果后续访问正常, 重置错误次数
logs.Log.Debug("[check.pass] " + bl.String())
}
case WordSource:
// 异步进行性能消耗较大的深度对比
pool.tempCh <- bl
pool.reqCount++
if pool.reqCount%pool.CheckPeriod == 0 {
pool.reqCount++
pool.doCheck()
} else if pool.failedCount%pool.ErrPeriod == 0 {
pool.failedCount++
pool.doCheck()
}
pool.bar.Done()
case RedirectSource:
bl.FrontURL = unit.frontUrl
pool.tempCh <- bl
case CrawlSource, ActiveSource, RuleSource, BakSource:
pool.tempCh <- bl
}
})
p, _ := ants.NewPoolWithFunc(config.Thread, pool.Invoke)
pool.reqPool = p
// 挂起一个异步的处理结果线程, 不干扰主线程的请求并发
@ -319,7 +215,6 @@ func (pool *Pool) Run(ctx context.Context, offset, limit int) {
pool.closeCh <- struct{}{}
}
}()
Loop:
for {
select {
@ -370,6 +265,111 @@ Loop:
pool.Close()
}
// Invoke performs one request described by v (which must be a *Unit; any
// other dynamic type makes the type assertion below panic), converts the
// response into a pkg.Baseline and then dispatches that baseline according to
// unit.source. It is the function handed to ants.NewPoolWithFunc when the
// request pool is created.
//
// NOTE(review): failedCount, reqCount and failedBaselines are modified here
// without any locking visible in this function — confirm whether concurrent
// Invoke calls can touch them at the same time.
func (pool *Pool) Invoke(v interface{}) {
// Count every attempted request, including ones that fail later.
atomic.AddInt32(&pool.Statistor.ReqTotal, 1)
unit := v.(*Unit)
req, err := pool.genReq(unit.path)
if err != nil {
// NOTE(review): returning here skips the initwg.Done()/bar.Done() calls in
// the switch at the end of this function — confirm waiters cannot hang.
logs.Log.Error(err.Error())
return
}
req.SetHeaders(pool.Headers)
// start is used to compute bl.Spended once the baseline has been built.
start := time.Now()
resp, reqerr := pool.client.Do(pool.ctx, req)
if pool.ClientType == ihttp.FAST {
// Release the fasthttp request/response objects when Invoke returns.
// NOTE(review): resp.FastResponse is evaluated when the defer statement
// runs, so a nil resp returned alongside an error would panic here —
// confirm client.Do never returns a nil response.
defer fasthttp.ReleaseResponse(resp.FastResponse)
defer fasthttp.ReleaseRequest(req.FastRequest)
}
// Comparison and handling of the various error cases.
var bl *pkg.Baseline
if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge {
// Any request error other than ErrBodyTooLarge is recorded as a failed,
// invalid baseline.
pool.failedCount++
atomic.AddInt32(&pool.Statistor.FailedNumber, 1)
bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()}
pool.failedBaselines = append(pool.failedBaselines, bl)
} else {
if unit.source <= 3 || unit.source == CrawlSource {
// Sources whose value is <= 3, and crawl results, always get a full
// baseline without going through PreCompare.
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
} else {
if pool.MatchExpr != nil {
// If this is not a word source, or a custom match function is defined,
// all data is sent on to tempCh.
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
} else if err = pool.PreCompare(resp); err == nil {
// Pre-comparison lets us skip useless data and reduce the performance cost.
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
// err is nil in this branch, so the err != ErrRedirect test is always true.
if err != ErrRedirect && bl.RedirectURL != "" {
if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") {
// Redirect targets that do not start with "http" are normalised to a
// single leading slash and prefixed with the pool's base URL.
bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/")
bl.RedirectURL = pool.BaseURL + bl.RedirectURL
}
// presumably doRedirect balances this Add with wg.Done — verify.
pool.wg.Add(1)
pool.doRedirect(bl, unit.depth)
}
pool.addFuzzyBaseline(bl)
} else {
// PreCompare rejected the response; keep its error text as the reason.
bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
}
}
}
// Flag bodies larger than the configured default maximum body size.
if bl.BodyLength > ihttp.DefaultMaxBodySize {
bl.ExceedLength = true
}
bl.Source = int(unit.source)
bl.ReqDepth = unit.depth
bl.Spended = time.Since(start).Milliseconds()
// Dispatch the finished baseline according to where the unit came from.
switch unit.source {
case InitRandomSource:
// Store the baseline as pool.random and signal initwg.
bl.Collect()
pool.random = bl
pool.addFuzzyBaseline(bl)
pool.initwg.Done()
case InitIndexSource:
// Store the baseline as pool.index, hand it to doCrawl, and signal initwg.
bl.Collect()
pool.index = bl
pool.wg.Add(1)
pool.doCrawl(bl)
pool.initwg.Done()
case CheckSource:
if bl.ErrString != "" {
logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString)
} else if i := pool.random.Compare(bl); i < 1 {
if i == 0 {
// A result of exactly 0 is only reported when Fuzzy is enabled.
if pool.Fuzzy {
logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String())
}
} else {
// A negative result counts double towards the failure counter.
pool.failedCount += 2
logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String())
pool.failedBaselines = append(pool.failedBaselines, bl)
}
} else {
pool.resetFailed() // reset the failure count once a later request succeeds
logs.Log.Debug("[check.pass] " + bl.String())
}
case WordSource:
// Perform the more expensive deep comparison asynchronously.
pool.tempCh <- bl
pool.reqCount++
// Trigger doCheck every CheckPeriod requests, or whenever failedCount is a
// multiple of ErrPeriod; the extra increment moves the counter off the
// multiple that triggered this check.
if pool.reqCount%pool.CheckPeriod == 0 {
pool.reqCount++
pool.doCheck()
} else if pool.failedCount%pool.ErrPeriod == 0 {
pool.failedCount++
pool.doCheck()
}
pool.bar.Done()
case RedirectSource:
// Record the URL that produced this redirect before queueing the result.
bl.FrontURL = unit.frontUrl
pool.tempCh <- bl
case CrawlSource, ActiveSource, RuleSource, BakSource:
pool.tempCh <- bl
}
}
func (pool *Pool) PreCompare(resp *ihttp.Response) error {
status := resp.StatusCode()
if IntsContains(WhiteStatus, status) {
@ -463,75 +463,83 @@ func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) {
if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() {
pool.wg.Add(1)
pool.additionCh <- &Unit{
go pool.addAddition(&Unit{
path: uu.Path,
source: RedirectSource,
frontUrl: bl.UrlString,
depth: depth + 1,
}
})
}
}
func (pool *Pool) doCrawl(bl *pkg.Baseline) {
defer pool.wg.Done()
if !pool.Crawl {
pool.wg.Done()
return
}
bl.CollectURL()
for _, u := range bl.URLs {
if strings.HasPrefix(u, "//") {
u = bl.Url.Scheme + u
} else if strings.HasPrefix(u, "/") {
// 绝对目录拼接
u = pkg.URLJoin(pool.BaseURL, u)
} else if !strings.HasPrefix(u, "http") {
// 相对目录拼接
u = pkg.URLJoin(pool.BaseURL, u)
}
go func() {
defer pool.wg.Done()
for _, u := range bl.URLs {
if strings.HasPrefix(u, "//") {
u = bl.Url.Scheme + u
} else if strings.HasPrefix(u, "/") {
// 绝对目录拼接
u = pkg.URLJoin(pool.BaseURL, u)
} else if !strings.HasPrefix(u, "http") {
// 相对目录拼接
u = pkg.URLJoin(pool.BaseURL, u)
}
if _, ok := pool.urls[u]; ok {
pool.urls[u]++
} else {
// 通过map去重, 只有新的url才会进入到该逻辑
pool.locker.Lock()
pool.urls[u] = 1
pool.locker.Unlock()
if bl.ReqDepth < maxCrawl {
parsed, err := url.Parse(u)
if err != nil {
continue
if _, ok := pool.urls[u]; ok {
pool.urls[u]++
} else {
// 通过map去重, 只有新的url才会进入到该逻辑
pool.locker.Lock()
pool.urls[u] = 1
pool.locker.Unlock()
if bl.ReqDepth < maxCrawl {
parsed, err := url.Parse(u)
if err != nil {
continue
}
if parsed.Host != bl.Url.Host {
// 自动限定scoop, 防止爬到其他网站
continue
}
pool.wg.Add(1)
pool.addAddition(&Unit{
path: parsed.Path,
source: CrawlSource,
depth: bl.ReqDepth + 1,
})
}
if parsed.Host != bl.Url.Host {
// 自动限定scoop, 防止爬到其他网站
continue
}
pool.wg.Add(1)
go pool.addAddition(&Unit{
path: parsed.Path,
source: CrawlSource,
depth: bl.ReqDepth + 1,
})
}
}
}
}()
}
func (pool *Pool) doRule(bl *pkg.Baseline) {
defer pool.wg.Done()
if pool.AppendRule == nil {
pool.wg.Done()
return
}
if bl.Source == int(RuleSource) || bl.Dir {
pool.wg.Done()
return
}
for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) {
pool.wg.Add(1)
go pool.addAddition(&Unit{
path: path.Join(path.Dir(bl.Path), u),
source: RuleSource,
})
}
go func() {
defer pool.wg.Done()
for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) {
pool.wg.Add(1)
pool.addAddition(&Unit{
path: path.Join(path.Dir(bl.Path), u),
source: RuleSource,
})
}
}()
}
func (pool *Pool) doActive() {

View File

@ -228,7 +228,6 @@ func (r *Runner) Prepare(ctx context.Context) error {
}
r.Done()
})
}
if err != nil {

View File

@ -55,7 +55,9 @@ func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
if resp.ClientType == ihttp.STANDARD {
bl.Host = host
}
bl.Body = resp.Body()
body := resp.Body()
bl.Body = make([]byte, len(body))
copy(bl.Body, body)
bl.BodyLength = resp.ContentLength()
bl.Header = resp.Header()
bl.HeaderLength = len(bl.Header)