spray/internal/pool.go

630 lines
16 KiB
Go
Raw Normal View History

2022-09-08 15:57:17 +08:00
package internal
import (
"context"
"fmt"
"github.com/antonmedv/expr"
"github.com/antonmedv/expr/vm"
"github.com/chainreactors/logs"
"github.com/chainreactors/spray/pkg"
"github.com/chainreactors/spray/pkg/ihttp"
2022-09-15 19:27:07 +08:00
"github.com/chainreactors/words"
"github.com/chainreactors/words/rule"
2022-09-08 15:57:17 +08:00
"github.com/panjf2000/ants/v2"
"github.com/valyala/fasthttp"
2022-11-29 20:50:00 +08:00
"net/url"
"path"
"strconv"
2022-11-29 20:50:00 +08:00
"strings"
2022-09-08 15:57:17 +08:00
"sync"
"sync/atomic"
2022-09-15 19:27:07 +08:00
"time"
2022-09-08 15:57:17 +08:00
)
// Package-level limits used by the pool.
var (
	// max is the largest signed 32-bit integer (2147483647).
	max = 1<<31 - 1
	// maxRedirect bounds the depth passed to doRedirect.
	maxRedirect = 3
	// maxCrawl bounds the request depth at which doCrawl still queues links.
	maxCrawl = 3
	// maxRecursion is compared against a baseline's RecuDepth before it is
	// flagged for recursion.
	maxRecursion = 0
)
func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
2022-09-19 14:42:29 +08:00
pctx, cancel := context.WithCancel(ctx)
2022-09-08 15:57:17 +08:00
pool := &Pool{
Config: config,
ctx: pctx,
2022-09-23 01:47:24 +08:00
cancel: cancel,
client: ihttp.NewClient(config.Thread, 2, config.ClientType),
baselines: make(map[int]*pkg.Baseline),
2023-01-03 17:09:32 +08:00
urls: make(map[string]int),
tempCh: make(chan *pkg.Baseline, config.Thread),
checkCh: make(chan int),
2023-01-03 17:09:32 +08:00
additionCh: make(chan *Unit, 100),
2023-01-06 03:31:28 +08:00
closeCh: make(chan struct{}),
2022-09-23 11:20:41 +08:00
wg: sync.WaitGroup{},
initwg: sync.WaitGroup{},
reqCount: 1,
failedCount: 1,
2022-09-08 15:57:17 +08:00
}
p, _ := ants.NewPoolWithFunc(config.Thread, func(i interface{}) {
atomic.AddInt32(&pool.Statistor.ReqTotal, 1)
2022-09-08 15:57:17 +08:00
unit := i.(*Unit)
req, err := pool.genReq(unit.path)
if err != nil {
logs.Log.Error(err.Error())
2022-12-02 15:21:17 +08:00
return
}
req.SetHeaders(pool.Headers)
start := time.Now()
2022-09-23 11:20:41 +08:00
resp, reqerr := pool.client.Do(pctx, req)
if pool.ClientType == ihttp.FAST {
defer fasthttp.ReleaseResponse(resp.FastResponse)
defer fasthttp.ReleaseRequest(req.FastRequest)
}
2022-12-11 00:24:28 +08:00
// compare与各种错误处理
2022-12-02 18:05:33 +08:00
var bl *pkg.Baseline
2022-09-23 11:20:41 +08:00
if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge {
2022-10-19 16:38:23 +08:00
pool.failedCount++
atomic.AddInt32(&pool.Statistor.FailedNumber, 1)
2022-11-29 20:50:00 +08:00
bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()}
pool.failedBaselines = append(pool.failedBaselines, bl)
2022-09-08 15:57:17 +08:00
} else {
2023-01-06 03:31:28 +08:00
if unit.source <= 3 || unit.source == CrawlSource {
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
2022-09-08 15:57:17 +08:00
} else {
2022-11-29 21:55:27 +08:00
if pool.MatchExpr != nil {
// 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
} else if err = pool.PreCompare(resp); err == nil {
// 通过预对比跳过一些无用数据, 减少性能消耗
bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
2022-11-29 21:55:27 +08:00
if err != ErrRedirect && bl.RedirectURL != "" {
if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") {
bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/")
bl.RedirectURL = pool.BaseURL + bl.RedirectURL
}
pool.wg.Add(1)
2023-01-03 17:09:32 +08:00
pool.doRedirect(bl, unit.depth)
2022-11-29 21:55:27 +08:00
}
pool.addFuzzyBaseline(bl)
} else {
bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
}
2022-09-08 15:57:17 +08:00
}
}
if bl.BodyLength > ihttp.DefaultMaxBodySize {
bl.ExceedLength = true
}
2023-01-03 18:27:06 +08:00
bl.Source = int(unit.source)
2023-01-03 17:09:32 +08:00
bl.ReqDepth = unit.depth
bl.Spended = time.Since(start).Milliseconds()
2022-09-08 15:57:17 +08:00
switch unit.source {
case InitRandomSource:
2023-01-06 03:31:28 +08:00
bl.Collect()
2022-12-11 00:24:28 +08:00
pool.random = bl
pool.addFuzzyBaseline(bl)
2022-10-27 23:40:15 +08:00
pool.initwg.Done()
case InitIndexSource:
2023-01-06 03:31:28 +08:00
bl.Collect()
pool.index = bl
pool.wg.Add(1)
2023-01-03 17:09:32 +08:00
pool.doCrawl(bl)
pool.initwg.Done()
2022-09-20 18:09:06 +08:00
case CheckSource:
if bl.ErrString != "" {
logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s", pool.BaseURL, pool.failedCount, pool.BreakThreshold, bl.ErrString)
2022-12-11 00:24:28 +08:00
} else if i := pool.random.Compare(bl); i < 1 {
if i == 0 {
if pool.Fuzzy {
logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String())
}
} else {
pool.failedCount += 2
logs.Log.Warn("[check.failed] maybe trigger risk control, " + bl.String())
pool.failedBaselines = append(pool.failedBaselines, bl)
}
} else {
pool.resetFailed() // 如果后续访问正常, 重置错误次数
logs.Log.Debug("[check.pass] " + bl.String())
}
2022-09-08 15:57:17 +08:00
case WordSource:
// 异步进行性能消耗较大的深度对比
pool.tempCh <- bl
2022-11-11 10:37:30 +08:00
pool.reqCount++
if pool.reqCount%pool.CheckPeriod == 0 {
pool.reqCount++
2023-01-03 17:09:32 +08:00
pool.doCheck()
} else if pool.failedCount%pool.ErrPeriod == 0 {
pool.failedCount++
2023-01-03 17:09:32 +08:00
pool.doCheck()
2022-09-23 11:20:41 +08:00
}
pool.bar.Done()
2022-11-29 21:55:27 +08:00
case RedirectSource:
bl.FrontURL = unit.frontUrl
pool.tempCh <- bl
case CrawlSource, ActiveSource, RuleSource, BakSource:
2023-01-03 17:09:32 +08:00
pool.tempCh <- bl
2022-09-08 15:57:17 +08:00
}
2022-10-19 16:38:23 +08:00
2022-09-08 15:57:17 +08:00
})
pool.reqPool = p
2022-12-11 00:24:28 +08:00
// 挂起一个异步的处理结果线程, 不干扰主线程的请求并发
go func() {
for bl := range pool.tempCh {
2022-11-23 10:59:15 +08:00
if _, ok := pool.Statistor.Counts[bl.Status]; ok {
pool.Statistor.Counts[bl.Status]++
} else {
pool.Statistor.Counts[bl.Status] = 1
}
var params map[string]interface{}
if pool.MatchExpr != nil || pool.FilterExpr != nil || pool.RecuExpr != nil {
params = map[string]interface{}{
"index": pool.index,
"random": pool.random,
"current": bl,
}
for _, status := range FuzzyStatus {
if bl, ok := pool.baselines[status]; ok {
params["bl"+strconv.Itoa(status)] = bl
} else {
params["bl"+strconv.Itoa(status)] = &pkg.Baseline{}
}
2022-12-11 00:24:28 +08:00
}
}
var status bool
if pool.MatchExpr != nil {
2022-12-11 00:24:28 +08:00
if CompareWithExpr(pool.MatchExpr, params) {
status = true
}
} else {
if pool.BaseCompare(bl) {
status = true
}
}
if status {
pool.Statistor.FoundNumber++
2022-12-11 00:24:28 +08:00
if pool.FilterExpr != nil && CompareWithExpr(pool.FilterExpr, params) {
pool.Statistor.FilteredNumber++
bl.Reason = ErrCustomFilter.Error()
bl.IsValid = false
}
} else {
bl.IsValid = false
}
2022-12-11 00:24:28 +08:00
// 如果要进行递归判断, 要满足 bl有效, mod为path-spray, 当前深度小于最大递归深度
2023-01-03 17:09:32 +08:00
if bl.IsValid {
pool.wg.Add(2)
2023-01-06 03:31:28 +08:00
pool.doCrawl(bl)
pool.doRule(bl)
2023-01-03 17:09:32 +08:00
if bl.RecuDepth < maxRecursion {
if CompareWithExpr(pool.RecuExpr, params) {
bl.Recu = true
}
2022-12-11 00:24:28 +08:00
}
}
pool.OutputCh <- bl
pool.wg.Done()
}
pool.analyzeDone = true
}()
2022-09-08 15:57:17 +08:00
return pool, nil
}
type Pool struct {
*pkg.Config
Statistor *pkg.Statistor
client *ihttp.Client
reqPool *ants.PoolWithFunc
bar *pkg.Bar
ctx context.Context
cancel context.CancelFunc
tempCh chan *pkg.Baseline // 待处理的baseline
checkCh chan int // 独立的check管道 防止与redirect/crawl冲突
2023-01-03 17:09:32 +08:00
additionCh chan *Unit
2023-01-06 03:31:28 +08:00
closeCh chan struct{}
reqCount int
failedCount int
isFailed bool
failedBaselines []*pkg.Baseline
2022-12-11 00:24:28 +08:00
random *pkg.Baseline
index *pkg.Baseline
baselines map[int]*pkg.Baseline
2023-01-03 17:09:32 +08:00
urls map[string]int
analyzeDone bool
worder *words.Worder
2022-12-11 00:24:28 +08:00
locker sync.Mutex
wg sync.WaitGroup
initwg sync.WaitGroup // 初始化用, 之后改成锁
2022-09-08 15:57:17 +08:00
}
func (pool *Pool) Init() error {
2022-12-09 19:30:12 +08:00
// 分成两步是为了避免闭包的线程安全问题
pool.initwg.Add(1)
pool.reqPool.Invoke(newUnit("/", InitIndexSource))
pool.initwg.Wait()
if pool.index.ErrString != "" {
return fmt.Errorf(pool.index.String())
2022-12-02 18:05:33 +08:00
}
2023-01-06 03:31:28 +08:00
logs.Log.Info("[baseline.index] " + pool.index.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))
if pool.index.Status == 200 || (pool.index.Status/100) == 3 {
pool.OutputCh <- pool.index
}
2022-12-09 19:30:12 +08:00
pool.initwg.Add(1)
pool.reqPool.Invoke(newUnit(pkg.RandPath(), InitRandomSource))
pool.initwg.Wait()
2022-09-08 15:57:17 +08:00
// 检测基本访问能力
if pool.random.ErrString != "" {
return fmt.Errorf(pool.random.String())
2022-09-08 15:57:17 +08:00
}
2023-01-06 03:31:28 +08:00
logs.Log.Info("[baseline.random] " + pool.random.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))
if pool.random.RedirectURL != "" {
2022-11-29 20:50:00 +08:00
// 自定协议升级
// 某些网站http会重定向到https, 如果发现随机目录出现这种情况, 则自定将baseurl升级为https
rurl, err := url.Parse(pool.random.RedirectURL)
if err == nil && rurl.Hostname() == pool.random.Url.Hostname() && pool.random.Url.Scheme == "http" && rurl.Scheme == "https" {
logs.Log.Infof("baseurl %s upgrade http to https", pool.BaseURL)
pool.BaseURL = strings.Replace(pool.BaseURL, "http", "https", 1)
2022-11-29 20:50:00 +08:00
}
}
2022-09-08 15:57:17 +08:00
return nil
}
2023-01-03 17:09:32 +08:00
// checkRedirect reports whether redirectURL should be treated as valid data.
//
// A response redirecting to the same location as the random baseline is
// considered invalid. When the random baseline has no redirect there is
// nothing to compare against, so the response is accepted; otherwise every
// response without a Location header would match the empty baseline value
// and be rejected.
func (pool *Pool) checkRedirect(redirectURL string) bool {
	if pool.random.RedirectURL == "" {
		return true
	}
	// A 3xx whose target differs from the baseline's RedirectURL is valid.
	return redirectURL != pool.random.RedirectURL
}
// genReq builds the request for s according to the pool's mode: s is handed
// to the host-request builder in host-spray mode and to the path-request
// builder in path-spray mode. Any other mode yields an error.
func (pool *Pool) genReq(s string) (*ihttp.Request, error) {
	switch pool.Mod {
	case pkg.HostSpray:
		return ihttp.BuildHostRequest(pool.ClientType, pool.BaseURL, s)
	case pkg.PathSpray:
		return ihttp.BuildPathRequest(pool.ClientType, pool.BaseURL, s)
	default:
		return nil, fmt.Errorf("unknown mod")
	}
}
2023-01-05 14:56:23 +08:00
func (pool *Pool) Run(ctx context.Context, offset, limit int) {
pool.worder.RunWithRules()
2023-01-03 17:16:55 +08:00
if pool.Active {
pool.wg.Add(1)
2023-01-03 17:16:55 +08:00
go pool.doActive()
}
if pool.Bak {
pool.wg.Add(1)
go pool.doBak()
}
2023-01-06 03:31:28 +08:00
go func() {
for {
pool.wg.Wait()
pool.closeCh <- struct{}{}
}
}()
2022-09-15 19:27:07 +08:00
Loop:
for {
select {
case u, ok := <-pool.worder.C:
2022-09-15 19:27:07 +08:00
if !ok {
2023-01-06 03:31:28 +08:00
continue
2022-09-15 19:27:07 +08:00
}
pool.Statistor.End++
if int(pool.reqCount) < offset {
pool.reqCount++
2022-11-10 15:48:38 +08:00
continue
}
if pool.Statistor.End > limit {
2023-01-06 03:31:28 +08:00
continue
2022-11-10 15:48:38 +08:00
}
2022-11-10 04:48:07 +08:00
if u == "" {
continue
}
pool.wg.Add(1)
2023-01-06 03:31:28 +08:00
pool.reqPool.Invoke(newUnit(u, WordSource))
case source := <-pool.checkCh:
pool.Statistor.CheckNumber++
if pool.Mod == pkg.HostSpray {
2023-01-06 03:31:28 +08:00
pool.reqPool.Invoke(newUnit(pkg.RandHost(), source))
} else if pool.Mod == pkg.PathSpray {
2023-01-06 03:31:28 +08:00
pool.reqPool.Invoke(newUnit(pkg.RandPath(), source))
}
case unit, ok := <-pool.additionCh:
if !ok {
continue
}
2023-01-06 03:31:28 +08:00
pool.reqPool.Invoke(unit)
case <-pool.closeCh:
break Loop
2022-09-15 19:27:07 +08:00
case <-ctx.Done():
break Loop
case <-pool.ctx.Done():
2022-09-19 14:42:29 +08:00
break Loop
2022-09-15 19:27:07 +08:00
}
2022-09-08 15:57:17 +08:00
}
2023-01-03 17:09:32 +08:00
for pool.analyzeDone {
time.Sleep(time.Duration(100) * time.Millisecond)
2023-01-03 17:09:32 +08:00
}
pool.Statistor.EndTime = time.Now().Unix()
pool.Close()
2022-09-08 15:57:17 +08:00
}
func (pool *Pool) PreCompare(resp *ihttp.Response) error {
status := resp.StatusCode()
if IntsContains(WhiteStatus, status) {
// 如果为白名单状态码则直接返回
return nil
}
if pool.random != nil && pool.random.Status != 200 && pool.random.Status == status {
return ErrSameStatus
}
if IntsContains(BlackStatus, status) {
2022-09-15 19:27:07 +08:00
return ErrBadStatus
2022-09-08 15:57:17 +08:00
}
if IntsContains(WAFStatus, status) {
return ErrWaf
2022-09-08 15:57:17 +08:00
}
2023-01-03 17:09:32 +08:00
if !pool.checkRedirect(resp.GetHeader("Location")) {
return ErrRedirect
2022-09-26 17:19:08 +08:00
}
2022-09-08 15:57:17 +08:00
2022-09-15 19:27:07 +08:00
return nil
2022-09-08 15:57:17 +08:00
}
func (pool *Pool) BaseCompare(bl *pkg.Baseline) bool {
if !bl.IsValid {
return false
}
2022-11-11 11:55:49 +08:00
var status = -1
base, ok := pool.baselines[bl.Status] // 挑选对应状态码的baseline进行compare
if !ok {
if pool.random.Status == bl.Status {
// 当other的状态码与base相同时, 会使用base
ok = true
base = pool.random
} else if pool.index.Status == bl.Status {
// 当other的状态码与index相同时, 会使用index
ok = true
base = pool.index
}
}
2022-11-11 11:55:49 +08:00
if ok {
if status = base.Compare(bl); status == 1 {
bl.Reason = ErrCompareFailed.Error()
return false
}
}
bl.Collect()
//if !pool.IgnoreWaf {
// // 部分情况下waf的特征可能是全局, 指定了--ignore-waf则不会进行waf的指纹检测
// for _, f := range bl.Frameworks {
// if f.HasTag("waf") {
// pool.Statistor.WafedNumber++
// bl.Reason = ErrWaf.Error()
// return false
// }
// }
//}
2022-09-23 11:20:41 +08:00
if ok && status == 0 && base.FuzzyCompare(bl) {
pool.Statistor.FuzzyNumber++
bl.Reason = ErrFuzzyCompareFailed.Error()
2023-01-03 17:09:32 +08:00
pool.putToFuzzy(bl)
return false
}
return true
}
2022-12-11 00:24:28 +08:00
// CompareWithExpr runs the compiled expression exp against params and
// reports whether the result is the boolean true. Evaluation errors are
// logged as warnings and yield false unless the result is still true.
func CompareWithExpr(exp *vm.Program, params map[string]interface{}) bool {
	result, err := expr.Run(exp, params)
	if err != nil {
		logs.Log.Warn(err.Error())
	}
	return result == true
}
2022-09-26 17:19:08 +08:00
2023-01-03 17:09:32 +08:00
func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) {
defer pool.wg.Done()
2023-01-03 17:09:32 +08:00
if depth >= maxRedirect {
return
}
if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() {
pool.wg.Add(1)
pool.additionCh <- &Unit{
path: uu.Path,
source: RedirectSource,
frontUrl: bl.UrlString,
depth: depth + 1,
}
}
}
func (pool *Pool) doCrawl(bl *pkg.Baseline) {
defer pool.wg.Done()
if !pool.Crawl {
return
}
2023-01-03 17:09:32 +08:00
bl.CollectURL()
for _, u := range bl.URLs {
if strings.HasPrefix(u, "//") {
u = bl.Url.Scheme + u
} else if strings.HasPrefix(u, "/") {
// 绝对目录拼接
u = pkg.URLJoin(pool.BaseURL, u)
} else if !strings.HasPrefix(u, "http") {
// 相对目录拼接
u = pkg.URLJoin(pool.BaseURL, u)
}
if _, ok := pool.urls[u]; ok {
pool.urls[u]++
} else {
// 通过map去重, 只有新的url才会进入到该逻辑
pool.locker.Lock()
2023-01-03 17:09:32 +08:00
pool.urls[u] = 1
pool.locker.Unlock()
2023-01-03 17:09:32 +08:00
if bl.ReqDepth < maxCrawl {
parsed, err := url.Parse(u)
if err != nil {
continue
}
if parsed.Host != bl.Url.Host {
// 自动限定scoop, 防止爬到其他网站
continue
}
pool.wg.Add(1)
2023-01-06 03:31:28 +08:00
go pool.addAddition(&Unit{
2023-01-03 17:16:55 +08:00
path: parsed.Path,
source: CrawlSource,
depth: bl.ReqDepth + 1,
2023-01-06 03:31:28 +08:00
})
2023-01-03 17:09:32 +08:00
}
}
}
}
func (pool *Pool) doRule(bl *pkg.Baseline) {
defer pool.wg.Done()
if pool.AppendRule == nil {
return
}
if bl.Source == int(RuleSource) || bl.Dir {
return
}
for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) {
pool.wg.Add(1)
2023-01-06 03:31:28 +08:00
go pool.addAddition(&Unit{
path: path.Join(path.Dir(bl.Path), u),
source: RuleSource,
2023-01-06 03:31:28 +08:00
})
}
}
2023-01-03 17:16:55 +08:00
func (pool *Pool) doActive() {
defer pool.wg.Done()
2023-01-03 17:16:55 +08:00
for _, u := range pkg.ActivePath {
pool.wg.Add(1)
2023-01-06 03:31:28 +08:00
pool.addAddition(&Unit{
path: safePath(pool.BaseURL, u),
2023-01-03 17:16:55 +08:00
source: ActiveSource,
2023-01-06 03:31:28 +08:00
})
2023-01-03 17:16:55 +08:00
}
}
func (pool *Pool) doBak() {
defer pool.wg.Done()
u, err := url.Parse(pool.BaseURL)
if err != nil {
return
}
worder, err := words.NewWorderWithDsl("{?0}.{@bak_ext}", [][]string{pkg.BakGenerator(u.Host)}, nil)
if err != nil {
return
}
worder.Run()
for w := range worder.C {
pool.wg.Add(1)
2023-01-06 03:31:28 +08:00
pool.addAddition(&Unit{
path: safePath(pool.BaseURL, w),
source: BakSource,
2023-01-06 03:31:28 +08:00
})
}
}
2023-01-03 17:09:32 +08:00
// doCheck either aborts the task or requests a check probe. Once failedCount
// exceeds BreakThreshold it logs the failures, cancels the pool context and
// marks the pool as failed. Otherwise, in host- or path-spray mode, it sends
// CheckSource on checkCh.
func (pool *Pool) doCheck() {
	if pool.failedCount > pool.BreakThreshold {
		// Too many errors: stop the task.
		pool.recover()
		pool.cancel()
		pool.isFailed = true
		return
	}

	if pool.Mod == pkg.HostSpray || pool.Mod == pkg.PathSpray {
		pool.checkCh <- CheckSource
	}
}
2023-01-06 03:31:28 +08:00
// addAddition sends u on additionCh, blocking until the channel accepts it.
func (pool *Pool) addAddition(u *Unit) {
pool.additionCh <- u
}
func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
if _, ok := pool.baselines[bl.Status]; !ok && IntsContains(FuzzyStatus, bl.Status) {
bl.Collect()
pool.locker.Lock()
pool.wg.Add(1)
pool.doCrawl(bl)
pool.baselines[bl.Status] = bl
pool.locker.Unlock()
2023-01-06 03:31:28 +08:00
logs.Log.Infof("[baseline.%dinit] %s", bl.Status, bl.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))
}
}
2023-01-03 17:09:32 +08:00
func (pool *Pool) putToInvalid(bl *pkg.Baseline, reason string) {
2022-11-10 21:18:26 +08:00
bl.IsValid = false
pool.OutputCh <- bl
2022-11-10 21:18:26 +08:00
}
2023-01-03 17:09:32 +08:00
func (pool *Pool) putToFuzzy(bl *pkg.Baseline) {
2022-11-10 21:18:26 +08:00
bl.IsFuzzy = true
pool.FuzzyCh <- bl
2022-11-10 21:18:26 +08:00
}
// resetFailed restores failedCount to its initial value of 1 and drops the
// recorded failed baselines.
func (pool *Pool) resetFailed() {
pool.failedCount = 1
pool.failedBaselines = nil
}
// recover logs that the failure threshold was exceeded, including the
// current request count as the breakpoint, followed by every recorded failed
// baseline.
func (pool *Pool) recover() {
	logs.Log.Errorf("%s ,failed request exceeds the threshold , task will exit. Breakpoint %d", pool.BaseURL, pool.reqCount)
	for idx, failed := range pool.failedBaselines {
		logs.Log.Errorf("[failed.%d] %s", idx, failed.String())
	}
}
func (pool *Pool) Close() {
for pool.analyzeDone {
2022-09-23 11:20:41 +08:00
time.Sleep(time.Duration(100) * time.Millisecond)
}
close(pool.tempCh)
2023-01-03 17:09:32 +08:00
close(pool.additionCh)
pool.bar.Close()
2022-09-23 11:20:41 +08:00
}