调整url去重的逻辑, 现在将统一进行去重判断

This commit is contained in:
M09Ic 2023-01-11 11:12:00 +08:00
parent 75236c7708
commit 4a774718c9

View File

@ -40,13 +40,13 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
pool := &Pool{ pool := &Pool{
Config: config, Config: config,
base: u.Scheme + "://" + u.Hostname(), base: u.Scheme + "://" + u.Hostname(),
isDir: strings.HasSuffix(config.BaseURL, "/"), isDir: strings.HasSuffix(u.Path, "/"),
url: u, url: u,
ctx: pctx, ctx: pctx,
cancel: cancel, cancel: cancel,
client: ihttp.NewClient(config.Thread, 2, config.ClientType), client: ihttp.NewClient(config.Thread, 2, config.ClientType),
baselines: make(map[int]*pkg.Baseline), baselines: make(map[int]*pkg.Baseline),
urls: make(map[string]int), urls: make(map[string]struct{}),
tempCh: make(chan *pkg.Baseline, config.Thread), tempCh: make(chan *pkg.Baseline, config.Thread),
checkCh: make(chan int), checkCh: make(chan int),
additionCh: make(chan *Unit, 100), additionCh: make(chan *Unit, 100),
@ -57,7 +57,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
} }
// 格式化dir, 保证至少有一个"/" // 格式化dir, 保证至少有一个"/"
if pool.isDir { if strings.HasSuffix(config.BaseURL, "/") {
pool.dir = pool.url.Path pool.dir = pool.url.Path
} else if pool.url.Path == "" { } else if pool.url.Path == "" {
pool.dir = "/" pool.dir = "/"
@ -148,8 +148,8 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
type Pool struct { type Pool struct {
*pkg.Config *pkg.Config
base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接 base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接
isDir bool // url是否以/结尾
dir string dir string
isDir bool
url *url.URL url *url.URL
Statistor *pkg.Statistor Statistor *pkg.Statistor
client *ihttp.Client client *ihttp.Client
@ -167,7 +167,7 @@ type Pool struct {
random *pkg.Baseline random *pkg.Baseline
index *pkg.Baseline index *pkg.Baseline
baselines map[int]*pkg.Baseline baselines map[int]*pkg.Baseline
urls map[string]int urls map[string]struct{}
analyzeDone bool analyzeDone bool
worder *words.Worder worder *words.Worder
locker sync.Mutex locker sync.Mutex
@ -278,6 +278,7 @@ Loop:
} }
pool.wg.Add(1) pool.wg.Add(1)
pool.urls[u] = struct{}{}
pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // 原样的目录拼接, 输入了几个"/"就是几个, 适配java的目录解析 pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // 原样的目录拼接, 输入了几个"/"就是几个, 适配java的目录解析
case source := <-pool.checkCh: case source := <-pool.checkCh:
pool.Statistor.CheckNumber++ pool.Statistor.CheckNumber++
@ -290,7 +291,12 @@ Loop:
if !ok { if !ok {
continue continue
} }
pool.reqPool.Invoke(unit) if _, ok := pool.urls[unit.path]; ok {
pool.wg.Done()
} else {
pool.urls[unit.path] = struct{}{}
pool.reqPool.Invoke(unit)
}
case <-closeCh: case <-closeCh:
break Loop break Loop
case <-ctx.Done(): case <-ctx.Done():
@ -543,24 +549,18 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
go func() { go func() {
defer pool.wg.Done() defer pool.wg.Done()
for _, u := range bl.URLs { for _, u := range bl.URLs {
if u = FormatURL(bl.Url.Path, u); u == "" || u == pool.url.Path { if u = FormatURL(bl.Url.Path, u); u == "" {
continue continue
} }
pool.locker.Lock() // 通过map去重, 只有新的url才会进入到该逻辑
if _, ok := pool.urls[u]; ok { pool.urls[u] = struct{}{}
pool.urls[u]++ pool.wg.Add(1)
} else { pool.addAddition(&Unit{
// 通过map去重, 只有新的url才会进入到该逻辑 path: u,
pool.urls[u] = 1 source: CrawlSource,
pool.wg.Add(1) depth: bl.ReqDepth + 1,
pool.addAddition(&Unit{ })
path: u,
source: CrawlSource,
depth: bl.ReqDepth + 1,
})
}
pool.locker.Unlock()
} }
}() }()
@ -571,7 +571,7 @@ func (pool *Pool) doRule(bl *pkg.Baseline) {
pool.wg.Done() pool.wg.Done()
return return
} }
if bl.Source == int(RuleSource) || bl.Dir { if bl.Source == int(RuleSource) {
pool.wg.Done() pool.wg.Done()
return return
} }
@ -702,14 +702,10 @@ func (pool *Pool) Close() {
func (pool *Pool) safePath(u string) string { func (pool *Pool) safePath(u string) string {
// 自动生成的目录将采用safepath的方式拼接到相对目录中, 避免出现//的情况. 例如init, check, common // 自动生成的目录将采用safepath的方式拼接到相对目录中, 避免出现//的情况. 例如init, check, common
if u == "" { if !pool.isDir && !strings.HasPrefix(u, "/") {
return pool.url.Path
}
if strings.HasPrefix(u, "/") {
// 如果path已经有"/", 则去掉 // 如果path已经有"/", 则去掉
return pool.dir + u[1:]
} else {
return pool.dir + u return pool.dir + u
} else {
return pool.dir + u[1:]
} }
} }