package internal

import (
	"context"
	"fmt"
	"net/url"
	"path"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/antonmedv/expr"
	"github.com/antonmedv/expr/vm"
	"github.com/chainreactors/logs"
	"github.com/chainreactors/parsers"
	"github.com/chainreactors/parsers/iutils"
	"github.com/chainreactors/spray/pkg"
	"github.com/chainreactors/spray/pkg/ihttp"
	"github.com/chainreactors/words"
	"github.com/chainreactors/words/mask"
	"github.com/chainreactors/words/rule"
	"github.com/panjf2000/ants/v2"
	"github.com/valyala/fasthttp"
	"golang.org/x/time/rate"
)

var (
	max          = 2147483647
	MaxRedirect  = 3
	MaxCrawl     = 3
	MaxRecursion = 0
	nilBaseline  = &pkg.Baseline{}
)

// NewPool builds a Pool for config.BaseURL: it parses the URL, wires up
// every channel, the rate limiter and the ants request-worker pool, and
// starts the asynchronous result handler. Callers are expected to call
// Init and then Run.
func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
	var u *url.URL
	var err error
	if u, err = url.Parse(config.BaseURL); err != nil {
		return nil, err
	}
	pctx, cancel := context.WithCancel(ctx)
	pool := &Pool{
		Config:      config,
		base:        u.Scheme + "://" + u.Host,
		isDir:       strings.HasSuffix(u.Path, "/"),
		url:         u,
		ctx:         pctx,
		cancel:      cancel,
		client:      ihttp.NewClient(config.Thread, 2, config.ClientType),
		baselines:   make(map[int]*pkg.Baseline),
		urls:        make(map[string]struct{}),
		tempCh:      make(chan *pkg.Baseline, 100),
		checkCh:     make(chan int, 100),
		additionCh:  make(chan *Unit, 100),
		closeCh:     make(chan struct{}),
		waiter:      sync.WaitGroup{},
		initwg:      sync.WaitGroup{},
		limiter:     rate.NewLimiter(rate.Limit(config.RateLimit), 1),
		failedCount: 1,
	}

	// Normalize dir so that it always carries at least one "/".
	if strings.HasSuffix(config.BaseURL, "/") {
		pool.dir = pool.url.Path
	} else if pool.url.Path == "" {
		pool.dir = "/"
	} else {
		pool.dir = Dir(pool.url.Path)
	}

	p, _ := ants.NewPoolWithFunc(config.Thread, pool.Invoke)
	pool.reqPool = p

	// Run the result handler on its own goroutine so the expensive
	// deep-compare work never blocks request concurrency.
	go pool.Handler()
	return pool, nil
}

// Pool drives the spraying of a single base URL. It owns the request
// worker pool, the rate limiter, and all baseline state used to judge
// whether a response is interesting.
type Pool struct {
	*pkg.Config
	base            string // scheme://host root; used when joining crawl/redirect paths
	dir             string
	isDir           bool
	url             *url.URL
	Statistor       *pkg.Statistor
	client          *ihttp.Client
	reqPool         *ants.PoolWithFunc
	bar             *pkg.Bar
	ctx             context.Context
	cancel          context.CancelFunc
	tempCh          chan *pkg.Baseline // baselines queued for deep comparison in Handler
	checkCh         chan int           // dedicated check channel, kept apart from redirect/crawl traffic
	additionCh      chan *Unit
	closeCh         chan struct{}
	closed          bool
	wordOffset      int
	failedCount     int32 // updated with sync/atomic; see doCheck/resetFailed
	isFailed        bool
	failedBaselines []*pkg.Baseline
	random          *pkg.Baseline
	index           *pkg.Baseline
	baselines       map[int]*pkg.Baseline
	urls            map[string]struct{}
	analyzeDone     bool
	worder          *words.Worder
	limiter         *rate.Limiter
	locker          sync.Mutex
	waiter          sync.WaitGroup
	initwg          sync.WaitGroup // initialization only; locker takes over afterwards
}

// Init requests the index page and a random path to establish the two
// primary baselines, then transparently upgrades http to https when the
// target redirects there. It must be called before Run.
func (pool *Pool) Init() error {
	// Two separate invokes to avoid closure thread-safety problems.
	pool.initwg.Add(2)
	pool.reqPool.Invoke(newUnit(pool.url.Path, InitIndexSource))
	pool.reqPool.Invoke(newUnit(pool.safePath(pkg.RandPath()), InitRandomSource))
	pool.initwg.Wait()

	if pool.index.ErrString != "" {
		logs.Log.Error(pool.index.String())
		// "%s" keeps vet happy and protects against '%' in the message.
		return fmt.Errorf("%s", pool.index.ErrString)
	}
	if pool.index.Chunked && pool.ClientType == ihttp.FAST {
		logs.Log.Warn("chunk encoding! buf current client FASTHTTP not support chunk decode")
	}
	logs.Log.Info("[baseline.index] " + pool.index.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))

	// Verify basic reachability via the random baseline.
	// BUG FIX: this branch previously logged and returned pool.index's
	// error instead of pool.random's.
	if pool.random.ErrString != "" {
		logs.Log.Error(pool.random.String())
		return fmt.Errorf("%s", pool.random.ErrString)
	}
	logs.Log.Info("[baseline.random] " + pool.random.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))

	// Some sites redirect http to https. If either baseline shows such a
	// redirect, upgrade the base URL to https automatically.
	if pool.url.Scheme == "http" {
		if pool.index.RedirectURL != "" {
			if err := pool.Upgrade(pool.index); err != nil {
				return err
			}
		} else if pool.random.RedirectURL != "" {
			if err := pool.Upgrade(pool.random); err != nil {
				return err
			}
		}
	}

	return nil
}

// Run is the main dispatch loop: it consumes the wordlist, the check
// channel and the addition channel, handing units to the worker pool
// until the wordlist is exhausted, the limit is reached, or a context
// is cancelled.
func (pool *Pool) Run(ctx context.Context, offset, limit int) {
	pool.worder.RunWithRules()

	if pool.Active {
		pool.waiter.Add(1)
		go pool.doActive()
	}

	if pool.Bak {
		pool.waiter.Add(1)
		go pool.doBak()
	}

	if pool.Common {
		pool.waiter.Add(1)
		go pool.doCommonFile()
	}

	// NOTE(review): done is shared between this goroutine and the select
	// loop below without synchronization; it works in practice because the
	// watcher only polls, but it is formally a data race.
	var done bool
	go func() {
		for {
			if done {
				pool.waiter.Wait()
				close(pool.closeCh)
				return
			}
			time.Sleep(100 * time.Millisecond)
		}
	}()

Loop:
	for {
		select {
		case w, ok := <-pool.worder.C:
			if !ok {
				done = true
				continue
			}
			pool.Statistor.End++
			pool.wordOffset++
			if pool.wordOffset < offset {
				continue
			}
			if pool.Statistor.End > limit {
				done = true
				continue
			}

			pool.waiter.Add(1)
			// Paths are joined verbatim: exactly as many "/" as the input
			// contains, to match Java-style path parsing.
			pool.reqPool.Invoke(newUnitWithNumber(pool.safePath(w), WordSource, pool.wordOffset))
		case source := <-pool.checkCh:
			pool.Statistor.CheckNumber++
			if pool.Mod == pkg.HostSpray {
				pool.reqPool.Invoke(newUnitWithNumber(pkg.RandHost(), source, pool.wordOffset))
			} else if pool.Mod == pkg.PathSpray {
				pool.reqPool.Invoke(newUnitWithNumber(pool.safePath(pkg.RandPath()), source, pool.wordOffset))
			}
		case unit, ok := <-pool.additionCh:
			if !ok || pool.closed {
				continue
			}
			if _, ok := pool.urls[unit.path]; ok {
				// Deduplicate via the urls map; the waiter slot reserved by
				// the producer is released here.
				logs.Log.Debugf("[%s] duplicate path: %s, skipped", parsers.GetSpraySourceName(unit.source), pool.base+unit.path)
				pool.waiter.Done()
			} else {
				pool.urls[unit.path] = struct{}{}
				unit.number = pool.wordOffset
				pool.reqPool.Invoke(unit)
			}
		case <-pool.closeCh:
			break Loop
		case <-ctx.Done():
			break Loop
		case <-pool.ctx.Done():
			break Loop
		}
	}
	pool.closed = true
	pool.Close()
}

// Invoke is the worker-pool callback: it issues one request, builds a
// Baseline from the response (or the error), handles redirects, and
// routes the result according to the unit's source.
func (pool *Pool) Invoke(v interface{}) {
	if pool.RateLimit != 0 {
		pool.limiter.Wait(pool.ctx)
	}

	atomic.AddInt32(&pool.Statistor.ReqTotal, 1)
	unit := v.(*Unit)

	req, err := pool.genReq(unit.path)
	if err != nil {
		logs.Log.Error(err.Error())
		return
	}
	req.SetHeaders(pool.Headers)

	start := time.Now()
	resp, reqerr := pool.client.Do(pool.ctx, req)
	if pool.ClientType == ihttp.FAST {
		// NOTE(review): assumes resp is non-nil even when reqerr != nil —
		// confirm ihttp.Client.Do's contract, otherwise this can nil-deref.
		defer fasthttp.ReleaseResponse(resp.FastResponse)
		defer fasthttp.ReleaseRequest(req.FastRequest)
	}

	// Comparison and error handling.
	var bl *pkg.Baseline
	if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge {
		atomic.AddInt32(&pool.failedCount, 1)
		atomic.AddInt32(&pool.Statistor.FailedNumber, 1)
		bl = &pkg.Baseline{
			SprayResult: &parsers.SprayResult{
				UrlString: pool.base + unit.path,
				IsValid:   false,
				ErrString: reqerr.Error(),
				Reason:    ErrRequestFailed.Error(),
			},
		}
		pool.failedBaselines = append(pool.failedBaselines, bl)
	} else {
		if unit.source <= 3 || unit.source == CrawlSource || unit.source == CommonFileSource {
			// High-priority sources skip PreCompare entirely.
			bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
		} else if pool.MatchExpr != nil {
			// With a custom match expression every response goes to tempCh.
			bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
		} else if err = pool.PreCompare(resp); err == nil {
			// PreCompare filters out obviously useless responses cheaply.
			bl = pkg.NewBaseline(req.URI(), req.Host(), resp)
		} else {
			bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error())
		}
	}

	// Handle redirects manually (the client does not follow them).
	if bl.IsValid && unit.source != CheckSource && bl.RedirectURL != "" {
		pool.waiter.Add(1)
		pool.doRedirect(bl, unit.depth)
	}

	if ihttp.DefaultMaxBodySize != 0 && bl.BodyLength > ihttp.DefaultMaxBodySize {
		bl.ExceedLength = true
	}
	bl.Source = unit.source
	bl.ReqDepth = unit.depth
	bl.Number = unit.number
	bl.Spended = time.Since(start).Milliseconds()

	switch unit.source {
	case InitRandomSource:
		bl.Collect()
		pool.locker.Lock()
		pool.random = bl
		pool.addFuzzyBaseline(bl)
		pool.locker.Unlock()
		pool.initwg.Done()
	case InitIndexSource:
		bl.Collect()
		pool.locker.Lock()
		pool.index = bl
		pool.locker.Unlock()
		if bl.Status == 200 || (bl.Status/100) == 3 {
			pool.waiter.Add(1)
			pool.tempCh <- bl
		}
		pool.initwg.Done()
	case CheckSource:
		if bl.ErrString != "" {
			logs.Log.Warnf("[check.error] %s maybe ip had banned, break (%d/%d), error: %s",
				pool.BaseURL, atomic.LoadInt32(&pool.failedCount), pool.BreakThreshold, bl.ErrString)
		} else if i := pool.random.Compare(bl); i < 1 {
			if i == 0 {
				if pool.Fuzzy {
					logs.Log.Warn("[check.fuzzy] maybe trigger risk control, " + bl.String())
				}
			} else {
				atomic.AddInt32(&pool.failedCount, 1)
				pool.failedBaselines = append(pool.failedBaselines, bl)
			}
		} else {
			// Subsequent accesses look normal again: reset the error count.
			pool.resetFailed()
			logs.Log.Debug("[check.pass] " + bl.String())
		}
	case WordSource:
		// The expensive deep comparison runs asynchronously in Handler.
		pool.tempCh <- bl

		if int(pool.Statistor.ReqTotal)%pool.CheckPeriod == 0 {
			pool.doCheck()
		} else if atomic.LoadInt32(&pool.failedCount)%pool.ErrPeriod == 0 {
			atomic.AddInt32(&pool.failedCount, 1)
			pool.doCheck()
		}
		pool.bar.Done()
	case RedirectSource:
		bl.FrontURL = unit.frontUrl
		pool.tempCh <- bl
	default:
		pool.tempCh <- bl
	}
}

// Handler consumes tempCh and performs the expensive per-result work:
// statistics, baseline/expression comparison, crawl/rule fan-out, and
// finally emission to OutputCh.
func (pool *Pool) Handler() {
	for bl := range pool.tempCh {
		if bl.IsValid {
			pool.addFuzzyBaseline(bl)
		}

		if _, ok := pool.Statistor.Counts[bl.Status]; ok {
			pool.Statistor.Counts[bl.Status]++
		} else {
			pool.Statistor.Counts[bl.Status] = 1
		}

		if _, ok := pool.Statistor.Sources[bl.Source]; ok {
			pool.Statistor.Sources[bl.Source]++
		} else {
			pool.Statistor.Sources[bl.Source] = 1
		}

		var params map[string]interface{}
		if pool.MatchExpr != nil || pool.FilterExpr != nil || pool.RecuExpr != nil {
			params = map[string]interface{}{
				"index":   pool.index,
				"random":  pool.random,
				"current": bl,
			}
			for _, status := range FuzzyStatus {
				// fuzzyBl deliberately does not shadow bl (readability fix).
				if fuzzyBl, ok := pool.baselines[status]; ok {
					params["bl"+strconv.Itoa(status)] = fuzzyBl
				} else {
					params["bl"+strconv.Itoa(status)] = nilBaseline
				}
			}
		}

		var status bool
		if pool.MatchExpr != nil {
			if CompareWithExpr(pool.MatchExpr, params) {
				status = true
			}
		} else {
			status = pool.BaseCompare(bl)
		}

		if status {
			pool.Statistor.FoundNumber++
			if pool.FilterExpr != nil && CompareWithExpr(pool.FilterExpr, params) {
				pool.Statistor.FilteredNumber++
				bl.Reason = ErrCustomFilter.Error()
				bl.IsValid = false
			}
		} else {
			bl.IsValid = false
		}

		if bl.IsValid || bl.IsFuzzy {
			pool.waiter.Add(2)
			pool.doCrawl(bl)
			pool.doRule(bl)
		}

		// Recursion requires: a valid baseline and the current depth below
		// the maximum recursion depth.
		if bl.IsValid {
			if bl.RecuDepth < MaxRecursion {
				if CompareWithExpr(pool.RecuExpr, params) {
					bl.Recu = true
				}
			}
		}
		if !pool.closed {
			// Once the task is cancelled, unprocessed results are discarded.
			pool.OutputCh <- bl
		}
		pool.waiter.Done()
	}

	pool.analyzeDone = true
}

// checkRedirect reports whether a Location header is meaningful: a
// redirect identical to the random baseline's is considered noise.
func (pool *Pool) checkRedirect(redirectURL string) bool {
	if pool.random.RedirectURL == "" {
		// The random baseline does not redirect, so any redirect is signal.
		return true
	}

	if redirectURL == pool.random.RedirectURL {
		// The same RedirectURL as the baseline is treated as invalid data.
		return false
	}
	// 3xx with a RedirectURL different from the baseline's is valid data.
	return true
}

// genReq builds a request for s according to the spray mode.
func (pool *Pool) genReq(s string) (*ihttp.Request, error) {
	if pool.Mod == pkg.HostSpray {
		return ihttp.BuildHostRequest(pool.ClientType, pool.BaseURL, s)
	} else if pool.Mod == pkg.PathSpray {
		return ihttp.BuildPathRequest(pool.ClientType, pool.base, s)
	}
	return nil, fmt.Errorf("unknown mod")
}

// PreCompare cheaply rejects responses by status code and redirect
// target before the expensive Baseline construction.
func (pool *Pool) PreCompare(resp *ihttp.Response) error {
	status := resp.StatusCode()
	if iutils.IntsContains(WhiteStatus, status) {
		// Whitelisted status codes pass unconditionally.
		return nil
	}

	if pool.random != nil && pool.random.Status != 200 && pool.random.Status == status {
		return ErrSameStatus
	}

	if iutils.IntsContains(BlackStatus, status) {
		return ErrBadStatus
	}

	if iutils.IntsContains(WAFStatus, status) {
		return ErrWaf
	}

	if !pool.checkRedirect(resp.GetHeader("Location")) {
		return ErrRedirect
	}

	return nil
}

// BaseCompare compares bl against the most appropriate baseline and
// returns true when the result should be kept.
func (pool *Pool) BaseCompare(bl *pkg.Baseline) bool {
	var status = -1

	// Special handling for 30x: a redirect to the same path plus "/" is fuzzy.
	if strings.HasSuffix(bl.RedirectURL, bl.Url.Path+"/") {
		bl.Reason = ErrFuzzyRedirect.Error()
		pool.putToFuzzy(bl)
		return false
	}

	// Pick the baseline with the matching status code (FuzzyStatus entries
	// must be configured up front).
	base, ok := pool.baselines[bl.Status]
	if !ok {
		// BUG FIX: an empty `if pool.index != nil {}` branch previously made
		// this fallback dead code whenever the index baseline existed.
		if pool.random != nil && pool.random.Status == bl.Status {
			// Fall back to the random baseline when status codes match.
			ok = true
			base = pool.random
		} else if pool.index != nil && pool.index.Status == bl.Status {
			// Otherwise fall back to the index baseline.
			ok = true
			base = pool.index
		}
	}

	if ok {
		if status = base.Compare(bl); status == 1 {
			bl.Reason = ErrCompareFailed.Error()
			return false
		}
	}

	bl.Collect()

	//if !pool.IgnoreWaf {
	//	// In some cases WAF fingerprints are global; --ignore-waf skips
	//	// this detection.
	//	for _, f := range bl.Frameworks {
	//		if f.HasTag("waf") {
	//			pool.Statistor.WafedNumber++
	//			bl.Reason = ErrWaf.Error()
	//			return false
	//		}
	//	}
	//}

	if ok && status == 0 && base.FuzzyCompare(bl) {
		pool.Statistor.FuzzyNumber++
		bl.Reason = ErrFuzzyCompareFailed.Error()
		pool.putToFuzzy(bl)
		return false
	}

	return true
}

// CompareWithExpr evaluates a compiled expr program against params and
// returns whether it produced boolean true. Evaluation errors are
// logged and treated as false.
func CompareWithExpr(exp *vm.Program, params map[string]interface{}) bool {
	res, err := expr.Run(exp, params)
	if err != nil {
		logs.Log.Warn(err.Error())
	}
	return res == true
}

// Upgrade switches the pool from http to https when bl redirects to the
// same host over https, then re-runs Init against the new scheme.
func (pool *Pool) Upgrade(bl *pkg.Baseline) error {
	rurl, err := url.Parse(bl.RedirectURL)
	if err == nil && rurl.Hostname() == bl.Url.Hostname() && bl.Url.Scheme == "http" && rurl.Scheme == "https" {
		logs.Log.Infof("baseurl %s upgrade http to https, reinit", pool.BaseURL)
		pool.base = strings.Replace(pool.BaseURL, "http", "https", 1)
		pool.url.Scheme = "https"
		// Re-initialize against the upgraded scheme.
		err = pool.Init()
		if err != nil {
			return err
		}
	}

	return nil
}

// doRedirect follows bl's redirect up to MaxRedirect levels by queueing
// a RedirectSource unit. The caller has already done waiter.Add(1).
func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) {
	defer pool.waiter.Done()
	if depth >= MaxRedirect {
		return
	}
	reURL := FormatURL(bl.Url.Path, bl.RedirectURL)
	pool.waiter.Add(1)
	go pool.addAddition(&Unit{
		path:     reURL,
		source:   RedirectSource,
		frontUrl: bl.UrlString,
		depth:    depth + 1,
	})
}

// doCrawl extracts URLs from bl's body and queues them as CrawlSource
// units, bounded by MaxCrawl. The caller has already done waiter.Add(1).
func (pool *Pool) doCrawl(bl *pkg.Baseline) {
	if !pool.Crawl || bl.ReqDepth >= MaxCrawl {
		pool.waiter.Done()
		return
	}
	bl.CollectURL()
	if bl.URLs == nil {
		pool.waiter.Done()
		return
	}
	go func() {
		defer pool.waiter.Done()
		for _, u := range bl.URLs {
			if u = FormatURL(bl.Url.Path, u); u == "" {
				continue
			}
			// Deduplicated through the urls map; only new URLs get this far.
			pool.waiter.Add(1)
			pool.addAddition(&Unit{
				path:   u,
				source: CrawlSource,
				depth:  bl.ReqDepth + 1,
			})
		}
	}()
}

// doRule expands bl's filename through the configured append rules and
// queues the results as RuleSource units. The caller has already done
// waiter.Add(1).
func (pool *Pool) doRule(bl *pkg.Baseline) {
	if pool.AppendRule == nil {
		pool.waiter.Done()
		return
	}
	if bl.Source == RuleSource {
		// Never expand rule-generated results again (avoids recursion).
		pool.waiter.Done()
		return
	}

	go func() {
		defer pool.waiter.Done()
		for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) {
			pool.waiter.Add(1)
			pool.addAddition(&Unit{
				path:   Dir(bl.Url.Path) + u,
				source: RuleSource,
			})
		}
	}()
}

// doActive queues the built-in active-fingerprint paths.
func (pool *Pool) doActive() {
	defer pool.waiter.Done()
	for _, u := range pkg.ActivePath {
		pool.waiter.Add(1)
		pool.addAddition(&Unit{
			path:   pool.dir + u[1:],
			source: ActiveSource,
		})
	}
}

// doBak queues backup-file guesses: host-derived names with backup
// extensions, then the generic backup-name dictionary.
func (pool *Pool) doBak() {
	defer pool.waiter.Done()
	worder, err := words.NewWorderWithDsl("{?0}.{@bak_ext}", [][]string{pkg.BakGenerator(pool.url.Host)}, nil)
	if err != nil {
		return
	}
	worder.Run()
	for w := range worder.C {
		pool.waiter.Add(1)
		pool.addAddition(&Unit{
			path:   pool.dir + w,
			source: BakSource,
		})
	}

	worder, err = words.NewWorderWithDsl("{@bak_name}.{@bak_ext}", nil, nil)
	if err != nil {
		return
	}
	worder.Run()
	for w := range worder.C {
		pool.waiter.Add(1)
		pool.addAddition(&Unit{
			path:   pool.dir + w,
			source: BakSource,
		})
	}
}

// doCommonFile queues the "common_file" wordlist entries.
func (pool *Pool) doCommonFile() {
	defer pool.waiter.Done()
	for _, u := range mask.SpecialWords["common_file"] {
		pool.waiter.Add(1)
		pool.addAddition(&Unit{
			path:   pool.dir + u,
			source: CommonFileSource,
		})
	}
}

// doCheck aborts the task once the failure count exceeds the break
// threshold; otherwise it schedules a check probe.
func (pool *Pool) doCheck() {
	if atomic.LoadInt32(&pool.failedCount) > pool.BreakThreshold {
		// Too many errors: dump the failures and end the task.
		pool.recover()
		pool.cancel()
		pool.isFailed = true
		return
	}

	// Both spray modes use the same check source; the mode-specific path
	// is chosen by the consumer in Run.
	if pool.Mod == pkg.HostSpray || pool.Mod == pkg.PathSpray {
		pool.checkCh <- CheckSource
	}
}

// addAddition pushes a unit onto additionCh, swallowing the
// send-on-closed-channel panic that can occur during shutdown so
// producer goroutines never leak.
func (pool *Pool) addAddition(u *Unit) {
	defer func() {
		if err := recover(); err != nil {
		}
	}()
	pool.additionCh <- u
}

// addFuzzyBaseline records bl as the baseline for its status code when
// that status is fuzzy-listed and not yet seen, and crawls it once.
func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
	if _, ok := pool.baselines[bl.Status]; !ok && iutils.IntsContains(FuzzyStatus, bl.Status) {
		bl.Collect()
		pool.waiter.Add(1)
		pool.doCrawl(bl)
		pool.baselines[bl.Status] = bl
		logs.Log.Infof("[baseline.%dinit] %s", bl.Status, bl.Format([]string{"status", "length", "spend", "title", "frame", "redirect"}))
	}
}

// putToInvalid marks bl invalid and forwards it to OutputCh.
// NOTE(review): the reason parameter is never used — presumably
// bl.Reason = reason was intended; confirm before changing behavior.
func (pool *Pool) putToInvalid(bl *pkg.Baseline, reason string) {
	bl.IsValid = false
	pool.OutputCh <- bl
}

// putToFuzzy marks bl fuzzy and forwards it to FuzzyCh.
func (pool *Pool) putToFuzzy(bl *pkg.Baseline) {
	bl.IsFuzzy = true
	pool.FuzzyCh <- bl
}

// resetFailed resets the failure counter (atomically, since Invoke
// increments it from worker goroutines) and drops collected failures.
func (pool *Pool) resetFailed() {
	atomic.StoreInt32(&pool.failedCount, 1)
	pool.failedBaselines = nil
}

// recover logs the breakpoint and every failed baseline collected so
// far; called right before the task is cancelled.
func (pool *Pool) recover() {
	logs.Log.Errorf("%s ,failed request exceeds the threshold , task will exit. Breakpoint %d", pool.BaseURL, pool.wordOffset)
	for i, bl := range pool.failedBaselines {
		logs.Log.Errorf("[failed.%d] %s", i, bl.String())
	}
}

// Close finalizes the pool: it waits for the handler, closes the
// auxiliary channels, and stamps the end time.
func (pool *Pool) Close() {
	// NOTE(review): this condition looks inverted — analyzeDone is false
	// until Handler's range over tempCh ends, so the loop exits
	// immediately; and tempCh is never closed in this file, so inverting
	// the condition as-is would hang forever. Both need a coordinated fix
	// (close tempCh once all senders are done, then wait on !analyzeDone).
	for pool.analyzeDone {
		// Wait for buffered pending results to be processed.
		time.Sleep(time.Duration(100) * time.Millisecond)
	}
	close(pool.additionCh) // close the addition channel
	close(pool.checkCh)    // close the check channel
	pool.Statistor.EndTime = time.Now().Unix()
	pool.bar.Close()
}

// safePath joins auto-generated paths onto the base path without ever
// producing "//" — used by init, check, and common sources.
func (pool *Pool) safePath(u string) string {
	hasSlash := strings.HasPrefix(u, "/")
	if hasSlash {
		if pool.isDir {
			return pool.dir + u[1:]
		}
		return pool.url.Path + u
	}
	if pool.isDir {
		return pool.url.Path + u
	}
	return pool.url.Path + "/" + u
}