Adjust the url deduplication logic; deduplication is now checked in one unified place

M09Ic 2023-01-11 11:12:00 +08:00
parent 75236c7708
commit 4a774718c9

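Conceptually, the commit funnels every candidate path through a single membership check before it is dispatched, instead of letting each caller (crawler, rule engine, wordlist) deduplicate on its own. A minimal sketch of that funnel in Go, using hypothetical names (dispatch, candidates, forward) rather than the pool's real fields:

```go
package main

import "fmt"

// dispatch drains candidate paths and performs the only dedup check:
// a path is forwarded exactly once, no matter how many sources emit it.
func dispatch(candidates <-chan string, forward func(string)) {
	seen := make(map[string]struct{})
	for path := range candidates {
		if _, ok := seen[path]; ok {
			continue // duplicate from a crawler, rule, or wordlist
		}
		seen[path] = struct{}{}
		forward(path)
	}
}

func main() {
	candidates := make(chan string, 4)
	for _, p := range []string{"/index", "/login", "/index", "/static/"} {
		candidates <- p
	}
	close(candidates)
	dispatch(candidates, func(p string) { fmt.Println("request:", p) })
}
```

The hunks below show how this maps onto the pool's urls set, its main select loop, and doCrawl.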

@@ -40,13 +40,13 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
pool := &Pool{
Config: config,
base: u.Scheme + "://" + u.Hostname(),
isDir: strings.HasSuffix(config.BaseURL, "/"),
isDir: strings.HasSuffix(u.Path, "/"),
url: u,
ctx: pctx,
cancel: cancel,
client: ihttp.NewClient(config.Thread, 2, config.ClientType),
baselines: make(map[int]*pkg.Baseline),
urls: make(map[string]int),
urls: make(map[string]struct{}),
tempCh: make(chan *pkg.Baseline, config.Thread),
checkCh: make(chan int),
additionCh: make(chan *Unit, 100),
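In the hunk above, urls changes from map[string]int to map[string]struct{}: the unified check only needs membership, not a hit counter, and the empty struct value occupies zero bytes. A standalone illustration of that set idiom (not the pool's code):

```go
package main

import (
	"fmt"
	"unsafe"
)

func main() {
	urls := make(map[string]struct{})
	urls["/admin"] = struct{}{}

	_, ok := urls["/admin"]
	fmt.Println("seen /admin:", ok) // true
	_, ok = urls["/backup"]
	fmt.Println("seen /backup:", ok) // false

	// The value type contributes nothing to each entry's size.
	fmt.Println("sizeof struct{}:", unsafe.Sizeof(struct{}{})) // 0
}
```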
@@ -57,7 +57,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
}
// normalize dir, make sure it contains at least one "/"
if pool.isDir {
if strings.HasSuffix(config.BaseURL, "/") {
pool.dir = pool.url.Path
} else if pool.url.Path == "" {
pool.dir = "/"
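These branches keep the guarantee that pool.dir always ends with at least one "/", so later concatenation never has to guess where the directory stops. A hedged standalone version of that normalization; the file-path branch (path.Dir) is an assumption, since it falls outside the shown hunk:

```go
package main

import (
	"fmt"
	"net/url"
	"path"
	"strings"
)

// normalizeDir returns the directory part of a URL, always ending in "/".
func normalizeDir(raw string) string {
	u, err := url.Parse(raw)
	if err != nil || u.Path == "" {
		return "/"
	}
	if strings.HasSuffix(u.Path, "/") {
		return u.Path
	}
	// Assumed behaviour for file-like paths: keep only the parent directory.
	dir := path.Dir(u.Path)
	if dir == "/" {
		return "/"
	}
	return dir + "/"
}

func main() {
	for _, raw := range []string{
		"http://example.com",
		"http://example.com/app/",
		"http://example.com/app/index.php",
	} {
		fmt.Printf("%-35s -> %s\n", raw, normalizeDir(raw))
	}
}
```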
@@ -148,8 +148,8 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
type Pool struct {
*pkg.Config
base string // root of the url; needed for joining paths when crawling or following redirects
isDir bool // whether the url ends with "/"
dir string
isDir bool
url *url.URL
Statistor *pkg.Statistor
client *ihttp.Client
@@ -167,7 +167,7 @@ type Pool struct {
random *pkg.Baseline
index *pkg.Baseline
baselines map[int]*pkg.Baseline
urls map[string]int
urls map[string]struct{}
analyzeDone bool
worder *words.Worder
locker sync.Mutex
@@ -278,6 +278,7 @@ Loop:
}
pool.wg.Add(1)
pool.urls[u] = struct{}{}
pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // join the directory as-is: keep exactly as many "/" as the input has, to fit java-style path parsing
case source := <-pool.checkCh:
pool.Statistor.CheckNumber++
@@ -290,7 +291,12 @@ Loop:
if !ok {
continue
}
if _, ok := pool.urls[unit.path]; ok {
pool.wg.Done()
} else {
pool.urls[unit.path] = struct{}{}
pool.reqPool.Invoke(unit)
}
case <-closeCh:
break Loop
case <-ctx.Done():
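In the addition handler above, a path already present in pool.urls is dropped, but pool.wg.Done() is still called because the producer performed wg.Add(1) before sending the unit; otherwise the WaitGroup would never drain. A reduced sketch of that bookkeeping with hypothetical names:

```go
package main

import (
	"fmt"
	"sync"
)

type unit struct{ path string }

func main() {
	var wg sync.WaitGroup
	seen := make(map[string]struct{})
	additions := make(chan unit, 8)

	// Producers call wg.Add(1) once per emitted unit.
	for _, p := range []string{"/a", "/b", "/a"} {
		wg.Add(1)
		additions <- unit{path: p}
	}
	close(additions)

	for u := range additions {
		if _, ok := seen[u.path]; ok {
			// Duplicate: drop it, but release the producer's Add.
			wg.Done()
			continue
		}
		seen[u.path] = struct{}{}
		go func(u unit) {
			defer wg.Done() // worker releases the Add for accepted units
			fmt.Println("handled", u.path)
		}(u)
	}
	wg.Wait()
}
```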
@@ -543,16 +549,12 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
go func() {
defer pool.wg.Done()
for _, u := range bl.URLs {
if u = FormatURL(bl.Url.Path, u); u == "" || u == pool.url.Path {
if u = FormatURL(bl.Url.Path, u); u == "" {
continue
}
pool.locker.Lock()
if _, ok := pool.urls[u]; ok {
pool.urls[u]++
} else {
// dedupe via the map; only new urls reach this branch
pool.urls[u] = 1
pool.urls[u] = struct{}{}
pool.wg.Add(1)
pool.addAddition(&Unit{
path: u,
@@ -560,8 +562,6 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
depth: bl.ReqDepth + 1,
})
}
pool.locker.Unlock()
}
}()
}
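With deduplication moved into the central loop, doCrawl no longer keeps its own counter map or takes pool.locker around every discovered URL; it only normalizes each URL and forwards it as a new unit. A simplified sketch of that producer side, where formatURL and addAddition are stand-ins for the real FormatURL and pool.addAddition:

```go
package main

import (
	"fmt"
	"strings"
)

// formatURL is a stand-in for the project's FormatURL: here it only
// filters out absolute URLs and empty strings.
func formatURL(u string) string {
	if u == "" || strings.Contains(u, "://") {
		return ""
	}
	return u
}

func main() {
	discovered := []string{"/js/app.js", "", "https://cdn.example.com/x.js", "/api/v1/"}
	addAddition := func(path string, depth int) {
		fmt.Printf("enqueue %s (depth %d)\n", path, depth)
	}

	depth := 1
	for _, u := range discovered {
		if u = formatURL(u); u == "" {
			continue
		}
		// No local dedup and no locking here; the central loop decides
		// whether the path has been seen before.
		addAddition(u, depth+1)
	}
}
```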
@@ -571,7 +571,7 @@ func (pool *Pool) doRule(bl *pkg.Baseline) {
pool.wg.Done()
return
}
if bl.Source == int(RuleSource) || bl.Dir {
if bl.Source == int(RuleSource) {
pool.wg.Done()
return
}
@@ -702,14 +702,10 @@ func (pool *Pool) Close() {
func (pool *Pool) safePath(u string) string {
// auto-generated paths (e.g. init, check, common) are joined onto the relative directory via safePath to avoid producing "//"
if u == "" {
return pool.url.Path
}
if strings.HasPrefix(u, "/") {
if !pool.isDir && !strings.HasPrefix(u, "/") {
// if the path already starts with "/", strip it
return pool.dir + u[1:]
} else {
return pool.dir + u
} else {
return pool.dir + u[1:]
}
}
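safePath's job is to join an auto-generated path onto pool.dir without producing a double slash: the directory already ends in "/", so a path that brings its own leading "/" loses it. A hedged standalone version of that joining rule, assuming dir always ends with "/":

```go
package main

import (
	"fmt"
	"strings"
)

// joinSafe concatenates dir (always ending in "/") with a relative or
// rooted path, avoiding an accidental "//" at the seam.
func joinSafe(dir, p string) string {
	if strings.HasPrefix(p, "/") {
		// The path brings its own leading slash; drop it so dir's
		// trailing slash is the only separator.
		return dir + p[1:]
	}
	return dir + p
}

func main() {
	fmt.Println(joinSafe("/app/", "/check")) // /app/check
	fmt.Println(joinSafe("/app/", "common")) // /app/common
	fmt.Println(joinSafe("/", "/init"))      // /init
}
```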