添加--scope与--no-scope参数, 用来指定爬虫的作用范围

修复recursive中的多个bug
This commit is contained in:
M09Ic 2023-04-14 20:05:21 +08:00
parent 1bcf2f297d
commit 3698d01903
8 changed files with 210 additions and 96 deletions

2
go.mod
View File

@ -12,7 +12,7 @@ require (
)
require (
github.com/antonmedv/expr v1.9.0
github.com/antonmedv/expr v1.12.5
github.com/chainreactors/ipcs v0.0.13
github.com/chainreactors/utils v0.0.14-0.20230314084720-a4d745cabc56
github.com/gosuri/uiprogress v0.0.1

24
go.sum
View File

@ -1,9 +1,8 @@
github.com/DATA-DOG/go-sqlmock v1.3.3/go.mod h1:f/Ixk793poVmq4qj/V1dPUg2JEAKC73Q5eFN3EC/SaM=
github.com/M09ic/go-ntlmssp v0.0.0-20230312133735-dcccd454dfe0/go.mod h1:yMNEF6ulbFipt3CakMhcmcNVACshPRG4Ap4l00V+mMs=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/antonmedv/expr v1.9.0 h1:j4HI3NHEdgDnN9p6oI6Ndr0G5QryMY0FNxT4ONrFDGU=
github.com/antonmedv/expr v1.9.0/go.mod h1:5qsM3oLGDND7sDmQGDXHkYfkjYMUX14qsgqmHhwGEk8=
github.com/antonmedv/expr v1.12.5 h1:Fq4okale9swwL3OeLLs9WD9H6GbgBLJyN/NUHRv+n0E=
github.com/antonmedv/expr v1.12.5/go.mod h1:FPC8iWArxls7axbVLsW+kpg1mz29A1b2M6jt+hZfDkU=
github.com/chainreactors/files v0.2.0/go.mod h1:/Xa9YXhjBlaC33JTD6ZTJFig6pcplak2IDcovf42/6A=
github.com/chainreactors/files v0.2.3/go.mod h1:/Xa9YXhjBlaC33JTD6ZTJFig6pcplak2IDcovf42/6A=
github.com/chainreactors/files v0.2.5-0.20230310102018-3d10f74c7d6b h1:FRKGDHJrXrYfHnoehgE98vBoKvMpa/8/+d4wG0Zgpg4=
@ -22,8 +21,6 @@ github.com/chainreactors/logs v0.7.1-0.20230316032643-ed7d85ca234f/go.mod h1:Y0E
github.com/chainreactors/neutron v0.0.0-20230227122754-80dc76323a1c/go.mod h1:GjZPKmcyVoQvngG+GBHxXbpXBcjIcvHGO9xj/VXRf3w=
github.com/chainreactors/parsers v0.3.0/go.mod h1:Z9weht+lnFCk7UcwqFu6lXpS7u5vttiy0AJYOAyCCLA=
github.com/chainreactors/parsers v0.3.1-0.20230313041950-25d5f9059c79/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/parsers v0.3.1-0.20230327070646-7dbe644d2b3b h1:EubRBdVAj9COEmfkCB2yseejcAhDgndRipp/zDzJ0FU=
github.com/chainreactors/parsers v0.3.1-0.20230327070646-7dbe644d2b3b/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/parsers v0.3.1-0.20230403160559-9ed502452575 h1:uHE9O8x70FXwge5p68U/lGC9Xs8Leg8hWJR9PHKGzsk=
github.com/chainreactors/parsers v0.3.1-0.20230403160559-9ed502452575/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/utils v0.0.14-0.20230314084720-a4d745cabc56 h1:1uhvEh7Of4fQJXRMsfGEZGy5NcETsM2yataQ0oYSw0k=
@ -31,15 +28,12 @@ github.com/chainreactors/utils v0.0.14-0.20230314084720-a4d745cabc56/go.mod h1:N
github.com/chainreactors/words v0.4.1-0.20230327065326-448a905ac8c2 h1:/v8gTORQIRJl2lgNt82OOeP/04QZyNTGKcmjfstVN5E=
github.com/chainreactors/words v0.4.1-0.20230327065326-448a905ac8c2/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v0.0.0-20161028175848-04cdfd42973b/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
github.com/frankban/quicktest v1.14.4/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
github.com/gdamore/encoding v1.0.0/go.mod h1:alR0ol34c49FCSBLjhosxzcPHQbf2trDkoo5dl+VrEg=
github.com/gdamore/tcell v1.3.0/go.mod h1:Hjvr+Ofd+gLglo7RYKxxnzCBmev3BzsS67MebKS4zMM=
github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5 h1:4U+x+EB1P66zwYgTjxWXSOT8vF+651Ksr1lojiCZnT8=
github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5/go.mod h1:poR/Cp00iqtqu9ltFwl6C00sKC0HY13u/Gh05ZBmP54=
github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c h1:mucYYQn+sMGNSxidhleonzAdwL203RxhjJGnxQU4NWU=
@ -62,12 +56,8 @@ github.com/klauspost/compress v1.15.10/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrD
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lucasb-eyer/go-colorful v1.0.2/go.mod h1:0MS4r+7BZKSJ5mw4/S5MPN+qHFF1fYclkSPilDOKW0s=
github.com/lucasb-eyer/go-colorful v1.0.3/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mattn/go-runewidth v0.0.8/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mholt/archiver v3.1.1+incompatible/go.mod h1:Dh2dOXnSdiLxRiPoVfIr/fI1TwETms9B8CTWfeh7ROU=
github.com/nwaples/rardecode v1.1.3/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
github.com/panjf2000/ants/v2 v2.5.0/go.mod h1:cU93usDlihJZ5CfRGNDYsiBYvoilLvBF5Qp/BT2GNRE=
@ -75,18 +65,12 @@ github.com/panjf2000/ants/v2 v2.7.0 h1:Y3Bgpfo9HDkBoHNVFbMfY5mAvi5TAA17y3HbzQ74p
github.com/panjf2000/ants/v2 v2.7.0/go.mod h1:KIBmYG9QQX5U2qzFP/yQJaq/nSb6rahS9iEHkrCMgM8=
github.com/pierrec/lz4 v2.6.1+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/tview v0.0.0-20200219210816-cd38d7432498/go.mod h1:6lkG1x+13OShEf0EaOCaTQYyB7d5nSbb181KtjlS+84=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sanity-io/litter v1.2.0/go.mod h1:JF6pZUFgu2Q0sBZ+HSV35P8TVPI1TTzEwyu9FXAw2W4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v0.0.0-20161117074351-18a02ba4a312/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
@ -112,8 +96,6 @@ golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190626150813-e07cf5db2756/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@ -126,7 +108,6 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
@ -134,7 +115,6 @@ golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -99,19 +99,21 @@ type PluginOptions struct {
}
type ModeOptions struct {
RateLimit int `long:"rate-limit" default:"0" description:"Int, request rate limit (rate/s), e.g.: --rate-limit 100"`
Force bool `long:"force" description:"Bool, skip error break"`
CheckOnly bool `long:"check-only" description:"Bool, check only"`
Recursive string `long:"recursive" default:"current.IsDir()" description:"String,custom recursive rule, e.g.: --recursive current.IsDir()"`
Depth int `long:"depth" default:"0" description:"Int, recursive depth"`
CheckPeriod int `long:"check-period" default:"200" description:"Int, check period when request"`
ErrPeriod int `long:"error-period" default:"10" description:"Int, check period when error"`
BreakThreshold int `long:"error-threshold" default:"20" description:"Int, break when the error exceeds the threshold "`
BlackStatus string `long:"black-status" default:"400,410" description:"Strings (comma split),custom black status, "`
WhiteStatus string `long:"white-status" default:"200" description:"Strings (comma split), custom white status"`
FuzzyStatus string `long:"fuzzy-status" default:"404,403,500,501,502,503" description:"Strings (comma split), custom fuzzy status"`
UniqueStatus string `long:"unique-status" default:"403" description:"Strings (comma split), custom unique status"`
Unique bool `long:"unique" description:"Bool, unique response"`
RateLimit int `long:"rate-limit" default:"0" description:"Int, request rate limit (rate/s), e.g.: --rate-limit 100"`
Force bool `long:"force" description:"Bool, skip error break"`
CheckOnly bool `long:"check-only" description:"Bool, check only"`
NoScope bool `long:"no-scope" description:"Bool, no scope"`
Scope []string `long:"scope" description:"String, custom scope, e.g.: --scope *.example.com"`
Recursive string `long:"recursive" default:"current.IsDir()" description:"String,custom recursive rule, e.g.: --recursive current.IsDir()"`
Depth int `long:"depth" default:"0" description:"Int, recursive depth"`
CheckPeriod int `long:"check-period" default:"200" description:"Int, check period when request"`
ErrPeriod int `long:"error-period" default:"10" description:"Int, check period when error"`
BreakThreshold int `long:"error-threshold" default:"20" description:"Int, break when the error exceeds the threshold "`
BlackStatus string `long:"black-status" default:"400,410" description:"Strings (comma split),custom black status, "`
WhiteStatus string `long:"white-status" default:"200" description:"Strings (comma split), custom white status"`
FuzzyStatus string `long:"fuzzy-status" default:"404,403,500,501,502,503" description:"Strings (comma split), custom fuzzy status"`
UniqueStatus string `long:"unique-status" default:"403" description:"Strings (comma split), custom unique status"`
Unique bool `long:"unique" description:"Bool, unique response"`
SimhashDistance int `long:"distance" default:"5"`
}
@ -156,6 +158,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
ErrPeriod: opt.ErrPeriod,
BreakThreshold: opt.BreakThreshold,
Crawl: opt.Crawl,
Scope: opt.Scope,
Active: opt.Active,
Bak: opt.Bak,
Common: opt.Common,
@ -185,11 +188,12 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
r.ErrPeriod = max
}
// 选择client
if opt.Client == "auto" {
r.ClientType = ihttp.Auto
} else if opt.Client == "fast" {
r.ClientType = ihttp.FAST
} else if opt.Client == "standard" {
} else if opt.Client == "standard" || opt.Client == "base" || opt.Client == "http" {
r.ClientType = ihttp.STANDARD
}
@ -234,6 +238,10 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
logs.Log.Important("Advance Mod: " + s.String())
}
if opt.NoScope {
r.Scope = []string{"*"}
}
BlackStatus = parseStatus(BlackStatus, opt.BlackStatus)
WhiteStatus = parseStatus(WhiteStatus, opt.WhiteStatus)
if opt.FuzzyStatus == "all" {
@ -352,7 +360,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
taskfrom = "resume " + opt.ResumeFrom
go func() {
for _, stat := range stats {
tasks <- &Task{baseUrl: stat.BaseUrl, origin: stat}
tasks <- &Task{baseUrl: stat.BaseUrl, origin: NewOrigin(stat)}
}
close(tasks)
}()
@ -509,17 +517,35 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
r.FilterExpr = exp
}
if opt.Recursive != "current.IsDir()" {
// 初始化递归
var express string
if opt.Recursive != "current.IsDir()" && opt.Depth != 0 {
// 默认不打开递归, 除非指定了非默认的递归表达式
MaxRecursion = 1
exp, err := expr.Compile(opt.Recursive)
express = opt.Recursive
}
if opt.Depth != 0 {
// 手动设置的depth优先级高于默认
MaxRecursion = opt.Depth
express = opt.Recursive
}
if express != "" {
exp, err := expr.Compile(express)
if err != nil {
return nil, err
}
r.RecursiveExpr = exp
}
if opt.Depth != 0 {
MaxRecursion = opt.Depth
}
// 自定义scope
//var scopeexpr string
//if opt.NoScope {
// scopeexpr = "glob(current, '*')"
//} else if opt.Scope != "" {
// scopeexpr = fmt.Sprintf("glob(current, '*%s*')", opt.Scope)
//}
// prepare header
for _, h := range opt.Headers {

View File

@ -50,6 +50,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
client: ihttp.NewClient(config.Thread, 2, config.ClientType),
baselines: make(map[int]*pkg.Baseline),
urls: make(map[string]struct{}),
scopeurls: make(map[string]struct{}),
uniques: make(map[uint16]struct{}),
tempCh: make(chan *pkg.Baseline, 100),
checkCh: make(chan int, 100),
@ -70,8 +71,8 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
pool.dir = Dir(pool.url.Path)
}
p, _ := ants.NewPoolWithFunc(config.Thread, pool.Invoke)
pool.reqPool = p
pool.reqPool, _ = ants.NewPoolWithFunc(config.Thread, pool.Invoke)
pool.scopePool, _ = ants.NewPoolWithFunc(config.Thread, pool.NoScopeInvoke)
// 挂起一个异步的处理结果线程, 不干扰主线程的请求并发
go pool.Handler()
@ -79,7 +80,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
}
type Pool struct {
*pkg.Config
*pkg.Config // read only
base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接
dir string
isDir bool
@ -87,12 +88,13 @@ type Pool struct {
Statistor *pkg.Statistor
client *ihttp.Client
reqPool *ants.PoolWithFunc
scopePool *ants.PoolWithFunc
bar *pkg.Bar
ctx context.Context
cancel context.CancelFunc
tempCh chan *pkg.Baseline // 待处理的baseline
checkCh chan int // 独立的check管道 防止与redirect/crawl冲突
additionCh chan *Unit
additionCh chan *Unit // 插件添加的任务, 待处理管道
closeCh chan struct{}
closed bool
wordOffset int
@ -103,15 +105,41 @@ type Pool struct {
index *pkg.Baseline
baselines map[int]*pkg.Baseline
urls map[string]struct{}
scopeurls map[string]struct{}
uniques map[uint16]struct{}
analyzeDone bool
worder *words.Worder
limiter *rate.Limiter
locker sync.Mutex
scopeLocker sync.Mutex
waiter sync.WaitGroup
initwg sync.WaitGroup // 初始化用, 之后改成锁
}
func (pool *Pool) checkRedirect(redirectURL string) bool {
if pool.random.RedirectURL == "" {
// 如果random的redirectURL为空, 此时该项
return true
}
if redirectURL == pool.random.RedirectURL {
// 相同的RedirectURL将被认为是无效数据
return false
} else {
// path为3xx, 且与baseline中的RedirectURL不同时, 为有效数据
return true
}
}
func (pool *Pool) genReq(mod pkg.SprayMod, s string) (*ihttp.Request, error) {
if mod == pkg.HostSpray {
return ihttp.BuildHostRequest(pool.ClientType, pool.BaseURL, s)
} else if mod == pkg.PathSpray {
return ihttp.BuildPathRequest(pool.ClientType, pool.base, s)
}
return nil, fmt.Errorf("unknown mod")
}
func (pool *Pool) Init() error {
// 分成两步是为了避免闭包的线程安全问题
pool.initwg.Add(2)
@ -149,7 +177,7 @@ func (pool *Pool) Init() error {
return nil
}
func (pool *Pool) Run(ctx context.Context, offset, limit int) {
func (pool *Pool) Run(offset, limit int) {
pool.worder.RunWithRules()
if pool.Active {
pool.waiter.Add(1)
@ -227,7 +255,7 @@ Loop:
}
case <-pool.closeCh:
break Loop
case <-ctx.Done():
case <-pool.ctx.Done():
break Loop
case <-pool.ctx.Done():
break Loop
@ -325,6 +353,8 @@ func (pool *Pool) Invoke(v interface{}) {
pool.locker.Unlock()
if bl.Status == 200 || (bl.Status/100) == 3 {
// 保留index输出结果
pool.waiter.Add(1)
pool.doCrawl(bl)
pool.OutputCh <- bl
}
pool.initwg.Done()
@ -364,6 +394,37 @@ func (pool *Pool) Invoke(v interface{}) {
}
}
func (pool *Pool) NoScopeInvoke(v interface{}) {
defer pool.waiter.Done()
unit := v.(*Unit)
req, err := ihttp.BuildPathRequest(pool.ClientType, unit.path, "")
if err != nil {
logs.Log.Error(err.Error())
return
}
req.SetHeaders(pool.Headers)
req.SetHeader("User-Agent", RandomUA())
resp, reqerr := pool.client.Do(pool.ctx, req)
if pool.ClientType == ihttp.FAST {
defer fasthttp.ReleaseResponse(resp.FastResponse)
defer fasthttp.ReleaseRequest(req.FastRequest)
}
if reqerr != nil {
logs.Log.Error(reqerr.Error())
return
}
if resp.StatusCode() == 200 {
bl := pkg.NewBaseline(req.URI(), req.Host(), resp)
bl.Source = unit.source
bl.ReqDepth = unit.depth
bl.Collect()
bl.CollectURL()
pool.waiter.Add(1)
pool.doScopeCrawl(bl)
pool.OutputCh <- bl
}
}
func (pool *Pool) Handler() {
for bl := range pool.tempCh {
if bl.IsValid {
@ -454,30 +515,6 @@ func (pool *Pool) Handler() {
pool.analyzeDone = true
}
func (pool *Pool) checkRedirect(redirectURL string) bool {
if pool.random.RedirectURL == "" {
// 如果random的redirectURL为空, 此时该项
return true
}
if redirectURL == pool.random.RedirectURL {
// 相同的RedirectURL将被认为是无效数据
return false
} else {
// path为3xx, 且与baseline中的RedirectURL不同时, 为有效数据
return true
}
}
func (pool *Pool) genReq(mod pkg.SprayMod, s string) (*ihttp.Request, error) {
if mod == pkg.HostSpray {
return ihttp.BuildHostRequest(pool.ClientType, pool.BaseURL, s)
} else if mod == pkg.PathSpray {
return ihttp.BuildPathRequest(pool.ClientType, pool.base, s)
}
return nil, fmt.Errorf("unknown mod")
}
func (pool *Pool) PreCompare(resp *ihttp.Response) error {
status := resp.StatusCode()
if iutils.IntsContains(WhiteStatus, status) {
@ -603,14 +640,15 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
return
}
pool.waiter.Add(1)
pool.doScopeCrawl(bl)
go func() {
defer pool.waiter.Done()
for _, u := range bl.URLs {
if u = FormatURL(bl.Url.Path, u); u == "" {
continue
}
// 通过map去重, 只有新的url才会进入到该逻辑
pool.waiter.Add(1)
pool.addAddition(&Unit{
path: u,
@ -622,6 +660,31 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
}
func (pool *Pool) doScopeCrawl(bl *pkg.Baseline) {
if bl.ReqDepth >= MaxCrawl {
pool.waiter.Done()
return
}
go func() {
defer pool.waiter.Done()
for _, u := range bl.URLs {
if strings.HasPrefix(u, "http") {
if v, _ := url.Parse(u); v == nil || !MatchWithGlobs(v.Host, pool.Scope) {
continue
}
pool.scopeLocker.Lock()
if _, ok := pool.scopeurls[u]; !ok {
pool.urls[u] = struct{}{}
pool.waiter.Add(1)
pool.scopePool.Invoke(&Unit{path: u, source: CrawlSource, depth: bl.ReqDepth + 1})
}
pool.scopeLocker.Unlock()
}
}
}()
}
func (pool *Pool) doRule(bl *pkg.Baseline) {
if pool.AppendRule == nil {
pool.waiter.Done()
@ -724,7 +787,7 @@ func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
if _, ok := pool.baselines[bl.Status]; !ok && (enableAllFuzzy || iutils.IntsContains(FuzzyStatus, bl.Status)) {
bl.Collect()
pool.waiter.Add(1)
pool.doCrawl(bl)
pool.doCrawl(bl) // 非有效页面也可能存在一些特殊的url可以用来爬取
pool.baselines[bl.Status] = bl
logs.Log.Infof("[baseline.%dinit] %s", bl.Status, bl.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))
}

View File

@ -76,6 +76,7 @@ type Runner struct {
Force bool
IgnoreWaf bool
Crawl bool
Scope []string
Active bool
Bak bool
Common bool
@ -101,6 +102,7 @@ func (r *Runner) PrepareConfig() *pkg.Config {
AppendRule: r.AppendRules,
IgnoreWaf: r.IgnoreWaf,
Crawl: r.Crawl,
Scope: r.Scope,
Active: r.Active,
Bak: r.Bak,
Common: r.Common,
@ -147,7 +149,7 @@ func (r *Runner) Prepare(ctx context.Context) error {
r.poolwg.Done()
})
} else {
// spray 完整探测模式
// 完整探测模式
go func() {
for t := range r.Tasks {
r.taskCh <- t
@ -183,27 +185,14 @@ func (r *Runner) Prepare(ctx context.Context) error {
}
if t.origin != nil && len(r.Wordlist) == 0 {
// 如果是从断点续传中恢复的任务, 则自动设置word,dict与rule, 不过优先级低于命令行参数
pool.Statistor = pkg.NewStatistorFromStat(t.origin)
wl, err := loadWordlist(t.origin.Word, t.origin.Dictionaries)
pool.Statistor = pkg.NewStatistorFromStat(t.origin.Statistor)
pool.worder, err = t.origin.InitWorder(r.Fns)
if err != nil {
logs.Log.Error(err.Error())
r.Done()
return
}
pool.worder = words.NewWorder(wl)
pool.worder.Fns = r.Fns
rules, err := loadRuleWithFiles(t.origin.RuleFiles, t.origin.RuleFilter)
if err != nil {
logs.Log.Error(err.Error())
r.Done()
return
}
pool.worder.Rules = rules
if len(rules) > 0 {
pool.Statistor.Total = len(rules) * len(wl)
} else {
pool.Statistor.Total = len(wl)
}
pool.Statistor.Total = t.origin.sum
} else {
pool.Statistor = pkg.NewStatistor(t.baseUrl)
pool.worder = words.NewWorder(r.Wordlist)
@ -230,7 +219,7 @@ func (r *Runner) Prepare(ctx context.Context) error {
}
}
pool.Run(ctx, pool.Statistor.Offset, limit)
pool.Run(pool.Statistor.Offset, limit)
if pool.isFailed && len(pool.failedBaselines) > 0 {
// 如果因为错误积累退出, end将指向第一个错误发生时, 防止resume时跳过大量目标
@ -248,7 +237,19 @@ func (r *Runner) Prepare(ctx context.Context) error {
return nil
}
func (r *Runner) AddRecursive(bl *pkg.Baseline) {
// 递归新任务
task := &Task{
baseUrl: bl.UrlString,
depth: bl.RecuDepth + 1,
origin: NewOrigin(pkg.NewStatistor(bl.UrlString)),
}
r.AddPool(task)
}
func (r *Runner) AddPool(task *Task) {
// 递归新任务
if _, ok := r.PoolName[task.baseUrl]; ok {
logs.Log.Importantf("already added pool, skip %s", task.baseUrl)
return
@ -407,7 +408,7 @@ func (r *Runner) Output() {
if bl.IsValid {
saveFunc(bl)
if bl.Recu {
r.AddPool(&Task{baseUrl: bl.UrlString, depth: bl.RecuDepth + 1})
r.AddRecursive(bl)
}
} else {
debugPrint(bl)

View File

@ -2,6 +2,7 @@ package internal
import (
"github.com/chainreactors/spray/pkg"
"github.com/chainreactors/words"
"github.com/chainreactors/words/rule"
)
@ -76,5 +77,36 @@ type Task struct {
baseUrl string
depth int
rule []rule.Expression
origin *pkg.Statistor
origin *Origin
}
func NewOrigin(stat *pkg.Statistor) *Origin {
return &Origin{Statistor: stat}
}
type Origin struct {
*pkg.Statistor
sum int
}
func (o *Origin) InitWorder(fns []func(string) string) (*words.Worder, error) {
var worder *words.Worder
wl, err := loadWordlist(o.Word, o.Dictionaries)
if err != nil {
return nil, err
}
worder = words.NewWorder(wl)
worder.Fns = fns
rules, err := loadRuleWithFiles(o.RuleFiles, o.RuleFilter)
if err != nil {
return nil, err
}
worder.Rules = rules
if len(rules) > 0 {
o.sum = len(rules) * len(wl)
} else {
o.sum = len(wl)
}
return worder, nil
}

View File

@ -12,6 +12,7 @@ import (
"math/rand"
"net/url"
"path"
"path/filepath"
"strconv"
"strings"
)
@ -265,3 +266,13 @@ func CompareWithExpr(exp *vm.Program, params map[string]interface{}) bool {
return false
}
}
func MatchWithGlobs(u string, globs []string) bool {
for _, glob := range globs {
ok, err := filepath.Match(glob, u)
if err == nil && ok {
return true
}
}
return false
}

View File

@ -41,6 +41,7 @@ type Config struct {
Fuzzy bool
IgnoreWaf bool
Crawl bool
Scope []string
Active bool
Bak bool
Common bool