From 4a774718c95942ccad92745ca1626595972c9016 Mon Sep 17 00:00:00 2001 From: M09Ic Date: Wed, 11 Jan 2023 11:12:00 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B0=83=E6=95=B4url=E5=8E=BB=E9=87=8D?= =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91,=20=E7=8E=B0=E5=9C=A8=E5=B0=86?= =?UTF-8?q?=E7=BB=9F=E4=B8=80=E8=BF=9B=E8=A1=8C=E5=8E=BB=E9=87=8D=E5=88=A4?= =?UTF-8?q?=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- internal/pool.go | 54 ++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/internal/pool.go b/internal/pool.go index e14a43a..72e1856 100644 --- a/internal/pool.go +++ b/internal/pool.go @@ -40,13 +40,13 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { pool := &Pool{ Config: config, base: u.Scheme + "://" + u.Hostname(), - isDir: strings.HasSuffix(config.BaseURL, "/"), + isDir: strings.HasSuffix(u.Path, "/"), url: u, ctx: pctx, cancel: cancel, client: ihttp.NewClient(config.Thread, 2, config.ClientType), baselines: make(map[int]*pkg.Baseline), - urls: make(map[string]int), + urls: make(map[string]struct{}), tempCh: make(chan *pkg.Baseline, config.Thread), checkCh: make(chan int), additionCh: make(chan *Unit, 100), @@ -57,7 +57,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { } // 格式化dir, 保证至少有一个"/" - if pool.isDir { + if strings.HasSuffix(config.BaseURL, "/") { pool.dir = pool.url.Path } else if pool.url.Path == "" { pool.dir = "/" @@ -148,8 +148,8 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { type Pool struct { *pkg.Config base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接 - isDir bool // url是否以/结尾 dir string + isDir bool url *url.URL Statistor *pkg.Statistor client *ihttp.Client @@ -167,7 +167,7 @@ type Pool struct { random *pkg.Baseline index *pkg.Baseline baselines map[int]*pkg.Baseline - urls map[string]int + urls map[string]struct{} analyzeDone bool worder *words.Worder locker sync.Mutex @@ -278,6 +278,7 @@ Loop: } pool.wg.Add(1) + pool.urls[u] = struct{}{} pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // 原样的目录拼接, 输入了几个"/"就是几个, 适配java的目录解析 case source := <-pool.checkCh: pool.Statistor.CheckNumber++ @@ -290,7 +291,12 @@ Loop: if !ok { continue } - pool.reqPool.Invoke(unit) + if _, ok := pool.urls[unit.path]; ok { + pool.wg.Done() + } else { + pool.urls[unit.path] = struct{}{} + pool.reqPool.Invoke(unit) + } case <-closeCh: break Loop case <-ctx.Done(): @@ -543,24 +549,18 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) { go func() { defer pool.wg.Done() for _, u := range bl.URLs { - if u = FormatURL(bl.Url.Path, u); u == "" || u == pool.url.Path { + if u = FormatURL(bl.Url.Path, u); u == "" { continue } - pool.locker.Lock() - if _, ok := pool.urls[u]; ok { - pool.urls[u]++ - } else { - // 通过map去重, 只有新的url才会进入到该逻辑 - pool.urls[u] = 1 - pool.wg.Add(1) - pool.addAddition(&Unit{ - path: u, - source: CrawlSource, - depth: bl.ReqDepth + 1, - }) - } - pool.locker.Unlock() + // 通过map去重, 只有新的url才会进入到该逻辑 + pool.urls[u] = struct{}{} + pool.wg.Add(1) + pool.addAddition(&Unit{ + path: u, + source: CrawlSource, + depth: bl.ReqDepth + 1, + }) } }() @@ -571,7 +571,7 @@ func (pool *Pool) doRule(bl *pkg.Baseline) { pool.wg.Done() return } - if bl.Source == int(RuleSource) || bl.Dir { + if bl.Source == int(RuleSource) { pool.wg.Done() return } @@ -702,14 +702,10 @@ func (pool *Pool) Close() { func (pool *Pool) safePath(u string) string { // 自动生成的目录将采用safepath的方式拼接到相对目录中, 避免出现//的情况. 例如init, check, common - if u == "" { - return pool.url.Path - } - - if strings.HasPrefix(u, "/") { + if !pool.isDir && !strings.HasPrefix(u, "/") { // 如果path已经有"/", 则去掉 - return pool.dir + u[1:] - } else { return pool.dir + u + } else { + return pool.dir + u[1:] } }