mirror of
https://github.com/chainreactors/spray.git
synced 2025-05-06 02:31:21 +00:00
修复因fasthttp复用buf导致的数据引用错误的bug
优化代码结构
This commit is contained in:
parent
806f6355d0
commit
a3082d3f1a
5
go.mod
5
go.mod
@ -14,10 +14,9 @@ require (
|
||||
|
||||
require (
|
||||
github.com/antonmedv/expr v1.9.0
|
||||
github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c
|
||||
github.com/gosuri/uiprogress v0.0.1
|
||||
github.com/jessevdk/go-flags v1.5.0
|
||||
github.com/panjf2000/ants/v2 v2.5.0
|
||||
github.com/panjf2000/ants/v2 v2.7.0
|
||||
github.com/valyala/fasthttp v1.43.0
|
||||
sigs.k8s.io/yaml v1.3.0
|
||||
)
|
||||
@ -25,6 +24,7 @@ require (
|
||||
require (
|
||||
github.com/andybalholm/brotli v1.0.4 // indirect
|
||||
github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5 // indirect
|
||||
github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c // indirect
|
||||
github.com/go-dedup/text v0.0.0-20170907015346-8bb1b95e3cb7 // indirect
|
||||
github.com/gosuri/uilive v0.0.4 // indirect
|
||||
github.com/klauspost/compress v1.15.10 // indirect
|
||||
@ -33,5 +33,4 @@ require (
|
||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||
golang.org/x/sys v0.2.0 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
33
go.sum
33
go.sum
@ -17,32 +17,10 @@ github.com/chainreactors/ipcs v0.0.9/go.mod h1:E9M3Ohyq0TYQLlV4i2dbM9ThBZB1Nnd7O
|
||||
github.com/chainreactors/ipcs v0.0.13 h1:TZww7XRr4qZPWqy9DjBzcJgxtSUwT4TAbcho4156bRI=
|
||||
github.com/chainreactors/ipcs v0.0.13/go.mod h1:E9M3Ohyq0TYQLlV4i2dbM9ThBZB1Nnd7Oexoie2xLII=
|
||||
github.com/chainreactors/logs v0.6.1/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
|
||||
github.com/chainreactors/logs v0.6.2 h1:Yz5oayjwxO6KkjfjnmtT5WKbWjTaBdttFcneaFTpBe0=
|
||||
github.com/chainreactors/logs v0.6.2/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214130332-9bc5319887fe h1:FRMBKyuuh6EoHefqprP+pSblHrUxTaSp9GPJahYa+Fc=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214130332-9bc5319887fe/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214130646-2e08f98a1f71 h1:SpyPYjRihGyBiqoMUggXzCc4t9A0tmAvYdjghDG8s+M=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214130646-2e08f98a1f71/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214152543-60422cf64610 h1:ErODIlY9NmlrwEi6np3bm7HmuRZSaH3+ID2fJ2ViUpM=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214152543-60422cf64610/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580 h1:28gbL1t+Mm4AoP1MeKM9oeSHoPcUwIrzrLtmdusHMIo=
|
||||
github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580/go.mod h1:Y0EtAnoF0kiASIJUnXN0pcOt420iRpHOAnOhEphzRHA=
|
||||
github.com/chainreactors/parsers v0.2.9-0.20221210155102-cc0814762410 h1:K7EV0wtUuN6Rvh/MgqaBXyElD3guPsgNR5kF8nrV7iw=
|
||||
github.com/chainreactors/parsers v0.2.9-0.20221210155102-cc0814762410/go.mod h1:Z9weht+lnFCk7UcwqFu6lXpS7u5vttiy0AJYOAyCCLA=
|
||||
github.com/chainreactors/words v0.3.2-0.20221212161820-bae5f18558db h1:Rv6mcLAKXRXoZuifCwGTlXnuDbDpbDKC0JsTI1op/OA=
|
||||
github.com/chainreactors/words v0.3.2-0.20221212161820-bae5f18558db/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20221214061028-a7cf9f9f8ddb h1:9AV8SH+SvEqmcylzZMeWei5NYIhl/0hMR7Y269M0Eqw=
|
||||
github.com/chainreactors/words v0.3.2-0.20221214061028-a7cf9f9f8ddb/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20221214062855-48dff09b01ad h1:uL3TIQgvFY7dLoX0tAzIIXilCPIcNeLz/124gs+SA/Q=
|
||||
github.com/chainreactors/words v0.3.2-0.20221214062855-48dff09b01ad/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20221214154622-381fc37abdf9 h1:IUNopSuorfINmn4pOuSwZtxJbg8zsRIZ67a33SiYoQ0=
|
||||
github.com/chainreactors/words v0.3.2-0.20221214154622-381fc37abdf9/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105095023-67f7d4e9186a h1:NoFfxJfPXiS2fzdmRIzWj4K+V7BRC2BAXlxQfckTeN0=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105095023-67f7d4e9186a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105160347-858217c41ce5 h1:uTiOYpgf4jz+/uwp+kAliLrOkVXjsC51pNmd4xH0uB4=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105160347-858217c41ce5/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105161438-ec98bdc6906d h1:vWZwr3IaoEGEGE/IB0Im4gDqrOHpGK3szKOFDG4GFrc=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105161438-ec98bdc6906d/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a h1:vRAMDJ6UQV73uyiRBQnuE/+S7Q7JTpfubSpyRlooZ2U=
|
||||
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
|
||||
github.com/davecgh/go-spew v0.0.0-20161028175848-04cdfd42973b/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
@ -72,8 +50,9 @@ github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peK
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
|
||||
github.com/mattn/go-runewidth v0.0.8/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
|
||||
github.com/panjf2000/ants/v2 v2.5.0 h1:1rWGWSnxCsQBga+nQbA4/iY6VMeNoOIAM0ZWh9u3q2Q=
|
||||
github.com/panjf2000/ants/v2 v2.5.0/go.mod h1:cU93usDlihJZ5CfRGNDYsiBYvoilLvBF5Qp/BT2GNRE=
|
||||
github.com/panjf2000/ants/v2 v2.7.0 h1:Y3Bgpfo9HDkBoHNVFbMfY5mAvi5TAA17y3HbzQ74p5Y=
|
||||
github.com/panjf2000/ants/v2 v2.7.0/go.mod h1:KIBmYG9QQX5U2qzFP/yQJaq/nSb6rahS9iEHkrCMgM8=
|
||||
github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
@ -81,10 +60,14 @@ github.com/rivo/tview v0.0.0-20200219210816-cd38d7432498/go.mod h1:6lkG1x+13OShE
|
||||
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v0.0.0-20161117074351-18a02ba4a312/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
|
||||
github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/twmb/murmur3 v1.1.6 h1:mqrRot1BRxm+Yct+vavLMou2/iJt0tNVTTC0QoIjaZg=
|
||||
github.com/twmb/murmur3 v1.1.6/go.mod h1:Qq/R7NUyOfr65zD+6Q5IHKsJLwP7exErjN6lyyq3OSQ=
|
||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||
@ -99,6 +82,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
|
||||
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
|
||||
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||
golang.org/x/net v0.0.0-20220906165146-f3363e06e74c/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
|
||||
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20190626150813-e07cf5db2756/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
|
306
internal/pool.go
306
internal/pool.go
@ -47,111 +47,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
|
||||
failedCount: 1,
|
||||
}
|
||||
|
||||
p, _ := ants.NewPoolWithFunc(config.Thread, func(i interface{}) {
|
||||
atomic.AddInt32(&pool.Statistor.ReqTotal, 1)
|
||||
unit := i.(*Unit)
|
||||
req, err := pool.genReq(unit.path)
|
||||
if err != nil {
|
||||
logs.Log.Error(err.Error())
|
||||
return
|
||||
}
|
||||
req.SetHeaders(pool.Headers)
|
||||
|
||||
start := time.Now()
|
||||
resp, reqerr := pool.client.Do(pctx, req)
|
||||
if pool.ClientType == ihttp.FAST {
|
||||
defer fasthttp.ReleaseResponse(resp.FastResponse)
|
||||
defer fasthttp.ReleaseRequest(req.FastRequest)
|
||||
}
|
||||
|
||||
// compare与各种错误处理
|
||||
var bl *pkg.Baseline
|
||||
if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge {
|
||||
pool.failedCount++
|
||||
atomic.AddInt32(&pool.Statistor.FailedNumber, 1)
|
||||
bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()}
|
||||
pool.failedBaselines = append(pool.failedBaselines, bl)
|
||||
} else {
|
||||
if unit.source <= 3 || unit.source == CrawlSource {
|
||||
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
|
||||
} else {
|
||||
if pool.MatchExpr != nil {
|
||||
// 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中
|
||||
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
|
||||
} else if err = pool.PreCompare(resp); err == nil {
|
||||
// 通过预对比跳过一些无用数据, 减少性能消耗
|
||||
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
|
||||
if err != ErrRedirect && bl.RedirectURL != "" {
|
||||
if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") {
|
||||
bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/")
|
||||
bl.RedirectURL = pool.BaseURL + bl.RedirectURL
|
||||
}
|
||||
pool.wg.Add(1)
|
||||
pool.doRedirect(bl, unit.depth)
|
||||
}
|
||||
pool.addFuzzyBaseline(bl)
|
||||
} else {
|
||||
bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if bl.BodyLength > ihttp.DefaultMaxBodySize {
|
||||
bl.ExceedLength = true
|
||||
}
|
||||
bl.Source = int(unit.source)
|
||||
bl.ReqDepth = unit.depth
|
||||
bl.Spended = time.Since(start).Milliseconds()
|
||||
switch unit.source {
|
||||
case InitRandomSource:
|
||||
bl.Collect()
|
||||
pool.random = bl
|
||||
pool.addFuzzyBaseline(bl)
|
||||
pool.initwg.Done()
|
||||
case InitIndexSource:
|
||||
bl.Collect()
|
||||
pool.index = bl
|
||||
pool.wg.Add(1)
|
||||
pool.doCrawl(bl)
|
||||
pool.initwg.Done()
|
||||
case CheckSource:
|
||||
if bl.ErrString != "" {
|
||||
logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString)
|
||||
} else if i := pool.random.Compare(bl); i < 1 {
|
||||
if i == 0 {
|
||||
if pool.Fuzzy {
|
||||
logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String())
|
||||
}
|
||||
} else {
|
||||
pool.failedCount += 2
|
||||
logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String())
|
||||
pool.failedBaselines = append(pool.failedBaselines, bl)
|
||||
}
|
||||
} else {
|
||||
pool.resetFailed() // 如果后续访问正常, 重置错误次数
|
||||
logs.Log.Debug("[check.pass] " + bl.String())
|
||||
}
|
||||
|
||||
case WordSource:
|
||||
// 异步进行性能消耗较大的深度对比
|
||||
pool.tempCh <- bl
|
||||
pool.reqCount++
|
||||
if pool.reqCount%pool.CheckPeriod == 0 {
|
||||
pool.reqCount++
|
||||
pool.doCheck()
|
||||
} else if pool.failedCount%pool.ErrPeriod == 0 {
|
||||
pool.failedCount++
|
||||
pool.doCheck()
|
||||
}
|
||||
pool.bar.Done()
|
||||
case RedirectSource:
|
||||
bl.FrontURL = unit.frontUrl
|
||||
pool.tempCh <- bl
|
||||
case CrawlSource, ActiveSource, RuleSource, BakSource:
|
||||
pool.tempCh <- bl
|
||||
}
|
||||
|
||||
})
|
||||
p, _ := ants.NewPoolWithFunc(config.Thread, pool.Invoke)
|
||||
|
||||
pool.reqPool = p
|
||||
// 挂起一个异步的处理结果线程, 不干扰主线程的请求并发
|
||||
@ -319,7 +215,6 @@ func (pool *Pool) Run(ctx context.Context, offset, limit int) {
|
||||
pool.closeCh <- struct{}{}
|
||||
}
|
||||
}()
|
||||
|
||||
Loop:
|
||||
for {
|
||||
select {
|
||||
@ -370,6 +265,111 @@ Loop:
|
||||
pool.Close()
|
||||
}
|
||||
|
||||
func (pool *Pool) Invoke(v interface{}) {
|
||||
atomic.AddInt32(&pool.Statistor.ReqTotal, 1)
|
||||
unit := v.(*Unit)
|
||||
req, err := pool.genReq(unit.path)
|
||||
if err != nil {
|
||||
logs.Log.Error(err.Error())
|
||||
return
|
||||
}
|
||||
req.SetHeaders(pool.Headers)
|
||||
|
||||
start := time.Now()
|
||||
resp, reqerr := pool.client.Do(pool.ctx, req)
|
||||
if pool.ClientType == ihttp.FAST {
|
||||
defer fasthttp.ReleaseResponse(resp.FastResponse)
|
||||
defer fasthttp.ReleaseRequest(req.FastRequest)
|
||||
}
|
||||
|
||||
// compare与各种错误处理
|
||||
var bl *pkg.Baseline
|
||||
if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge {
|
||||
pool.failedCount++
|
||||
atomic.AddInt32(&pool.Statistor.FailedNumber, 1)
|
||||
bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()}
|
||||
pool.failedBaselines = append(pool.failedBaselines, bl)
|
||||
} else {
|
||||
if unit.source <= 3 || unit.source == CrawlSource {
|
||||
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
|
||||
} else {
|
||||
if pool.MatchExpr != nil {
|
||||
// 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中
|
||||
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
|
||||
} else if err = pool.PreCompare(resp); err == nil {
|
||||
// 通过预对比跳过一些无用数据, 减少性能消耗
|
||||
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
|
||||
if err != ErrRedirect && bl.RedirectURL != "" {
|
||||
if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") {
|
||||
bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/")
|
||||
bl.RedirectURL = pool.BaseURL + bl.RedirectURL
|
||||
}
|
||||
pool.wg.Add(1)
|
||||
pool.doRedirect(bl, unit.depth)
|
||||
}
|
||||
pool.addFuzzyBaseline(bl)
|
||||
} else {
|
||||
bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if bl.BodyLength > ihttp.DefaultMaxBodySize {
|
||||
bl.ExceedLength = true
|
||||
}
|
||||
bl.Source = int(unit.source)
|
||||
bl.ReqDepth = unit.depth
|
||||
bl.Spended = time.Since(start).Milliseconds()
|
||||
switch unit.source {
|
||||
case InitRandomSource:
|
||||
bl.Collect()
|
||||
pool.random = bl
|
||||
pool.addFuzzyBaseline(bl)
|
||||
pool.initwg.Done()
|
||||
case InitIndexSource:
|
||||
bl.Collect()
|
||||
pool.index = bl
|
||||
pool.wg.Add(1)
|
||||
pool.doCrawl(bl)
|
||||
pool.initwg.Done()
|
||||
case CheckSource:
|
||||
if bl.ErrString != "" {
|
||||
logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString)
|
||||
} else if i := pool.random.Compare(bl); i < 1 {
|
||||
if i == 0 {
|
||||
if pool.Fuzzy {
|
||||
logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String())
|
||||
}
|
||||
} else {
|
||||
pool.failedCount += 2
|
||||
logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String())
|
||||
pool.failedBaselines = append(pool.failedBaselines, bl)
|
||||
}
|
||||
} else {
|
||||
pool.resetFailed() // 如果后续访问正常, 重置错误次数
|
||||
logs.Log.Debug("[check.pass] " + bl.String())
|
||||
}
|
||||
|
||||
case WordSource:
|
||||
// 异步进行性能消耗较大的深度对比
|
||||
pool.tempCh <- bl
|
||||
pool.reqCount++
|
||||
if pool.reqCount%pool.CheckPeriod == 0 {
|
||||
pool.reqCount++
|
||||
pool.doCheck()
|
||||
} else if pool.failedCount%pool.ErrPeriod == 0 {
|
||||
pool.failedCount++
|
||||
pool.doCheck()
|
||||
}
|
||||
pool.bar.Done()
|
||||
case RedirectSource:
|
||||
bl.FrontURL = unit.frontUrl
|
||||
pool.tempCh <- bl
|
||||
case CrawlSource, ActiveSource, RuleSource, BakSource:
|
||||
pool.tempCh <- bl
|
||||
}
|
||||
}
|
||||
|
||||
func (pool *Pool) PreCompare(resp *ihttp.Response) error {
|
||||
status := resp.StatusCode()
|
||||
if IntsContains(WhiteStatus, status) {
|
||||
@ -463,75 +463,83 @@ func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) {
|
||||
|
||||
if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() {
|
||||
pool.wg.Add(1)
|
||||
pool.additionCh <- &Unit{
|
||||
go pool.addAddition(&Unit{
|
||||
path: uu.Path,
|
||||
source: RedirectSource,
|
||||
frontUrl: bl.UrlString,
|
||||
depth: depth + 1,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
||||
defer pool.wg.Done()
|
||||
if !pool.Crawl {
|
||||
pool.wg.Done()
|
||||
return
|
||||
}
|
||||
bl.CollectURL()
|
||||
for _, u := range bl.URLs {
|
||||
if strings.HasPrefix(u, "//") {
|
||||
u = bl.Url.Scheme + u
|
||||
} else if strings.HasPrefix(u, "/") {
|
||||
// 绝对目录拼接
|
||||
u = pkg.URLJoin(pool.BaseURL, u)
|
||||
} else if !strings.HasPrefix(u, "http") {
|
||||
// 相对目录拼接
|
||||
u = pkg.URLJoin(pool.BaseURL, u)
|
||||
}
|
||||
go func() {
|
||||
defer pool.wg.Done()
|
||||
for _, u := range bl.URLs {
|
||||
if strings.HasPrefix(u, "//") {
|
||||
u = bl.Url.Scheme + u
|
||||
} else if strings.HasPrefix(u, "/") {
|
||||
// 绝对目录拼接
|
||||
u = pkg.URLJoin(pool.BaseURL, u)
|
||||
} else if !strings.HasPrefix(u, "http") {
|
||||
// 相对目录拼接
|
||||
u = pkg.URLJoin(pool.BaseURL, u)
|
||||
}
|
||||
|
||||
if _, ok := pool.urls[u]; ok {
|
||||
pool.urls[u]++
|
||||
} else {
|
||||
// 通过map去重, 只有新的url才会进入到该逻辑
|
||||
pool.locker.Lock()
|
||||
pool.urls[u] = 1
|
||||
pool.locker.Unlock()
|
||||
if bl.ReqDepth < maxCrawl {
|
||||
parsed, err := url.Parse(u)
|
||||
if err != nil {
|
||||
continue
|
||||
if _, ok := pool.urls[u]; ok {
|
||||
pool.urls[u]++
|
||||
} else {
|
||||
// 通过map去重, 只有新的url才会进入到该逻辑
|
||||
pool.locker.Lock()
|
||||
pool.urls[u] = 1
|
||||
pool.locker.Unlock()
|
||||
if bl.ReqDepth < maxCrawl {
|
||||
parsed, err := url.Parse(u)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if parsed.Host != bl.Url.Host {
|
||||
// 自动限定scoop, 防止爬到其他网站
|
||||
continue
|
||||
}
|
||||
pool.wg.Add(1)
|
||||
pool.addAddition(&Unit{
|
||||
path: parsed.Path,
|
||||
source: CrawlSource,
|
||||
depth: bl.ReqDepth + 1,
|
||||
})
|
||||
}
|
||||
if parsed.Host != bl.Url.Host {
|
||||
// 自动限定scoop, 防止爬到其他网站
|
||||
continue
|
||||
}
|
||||
pool.wg.Add(1)
|
||||
go pool.addAddition(&Unit{
|
||||
path: parsed.Path,
|
||||
source: CrawlSource,
|
||||
depth: bl.ReqDepth + 1,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
}
|
||||
|
||||
func (pool *Pool) doRule(bl *pkg.Baseline) {
|
||||
defer pool.wg.Done()
|
||||
if pool.AppendRule == nil {
|
||||
pool.wg.Done()
|
||||
return
|
||||
}
|
||||
if bl.Source == int(RuleSource) || bl.Dir {
|
||||
pool.wg.Done()
|
||||
return
|
||||
}
|
||||
|
||||
for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) {
|
||||
pool.wg.Add(1)
|
||||
go pool.addAddition(&Unit{
|
||||
path: path.Join(path.Dir(bl.Path), u),
|
||||
source: RuleSource,
|
||||
})
|
||||
}
|
||||
go func() {
|
||||
defer pool.wg.Done()
|
||||
for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) {
|
||||
pool.wg.Add(1)
|
||||
pool.addAddition(&Unit{
|
||||
path: path.Join(path.Dir(bl.Path), u),
|
||||
source: RuleSource,
|
||||
})
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (pool *Pool) doActive() {
|
||||
|
@ -228,7 +228,6 @@ func (r *Runner) Prepare(ctx context.Context) error {
|
||||
}
|
||||
r.Done()
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
|
@ -55,7 +55,9 @@ func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
|
||||
if resp.ClientType == ihttp.STANDARD {
|
||||
bl.Host = host
|
||||
}
|
||||
bl.Body = resp.Body()
|
||||
body := resp.Body()
|
||||
bl.Body = make([]byte, len(body))
|
||||
copy(bl.Body, body)
|
||||
bl.BodyLength = resp.ContentLength()
|
||||
bl.Header = resp.Header()
|
||||
bl.HeaderLength = len(bl.Header)
|
||||
|
Loading…
x
Reference in New Issue
Block a user