mirror of
https://github.com/chainreactors/spray.git
synced 2025-09-15 11:40:13 +00:00
调整url去重的逻辑, 现在将统一进行去重判断
This commit is contained in:
parent
75236c7708
commit
4a774718c9
@ -40,13 +40,13 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
|
|||||||
pool := &Pool{
|
pool := &Pool{
|
||||||
Config: config,
|
Config: config,
|
||||||
base: u.Scheme + "://" + u.Hostname(),
|
base: u.Scheme + "://" + u.Hostname(),
|
||||||
isDir: strings.HasSuffix(config.BaseURL, "/"),
|
isDir: strings.HasSuffix(u.Path, "/"),
|
||||||
url: u,
|
url: u,
|
||||||
ctx: pctx,
|
ctx: pctx,
|
||||||
cancel: cancel,
|
cancel: cancel,
|
||||||
client: ihttp.NewClient(config.Thread, 2, config.ClientType),
|
client: ihttp.NewClient(config.Thread, 2, config.ClientType),
|
||||||
baselines: make(map[int]*pkg.Baseline),
|
baselines: make(map[int]*pkg.Baseline),
|
||||||
urls: make(map[string]int),
|
urls: make(map[string]struct{}),
|
||||||
tempCh: make(chan *pkg.Baseline, config.Thread),
|
tempCh: make(chan *pkg.Baseline, config.Thread),
|
||||||
checkCh: make(chan int),
|
checkCh: make(chan int),
|
||||||
additionCh: make(chan *Unit, 100),
|
additionCh: make(chan *Unit, 100),
|
||||||
@ -57,7 +57,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 格式化dir, 保证至少有一个"/"
|
// 格式化dir, 保证至少有一个"/"
|
||||||
if pool.isDir {
|
if strings.HasSuffix(config.BaseURL, "/") {
|
||||||
pool.dir = pool.url.Path
|
pool.dir = pool.url.Path
|
||||||
} else if pool.url.Path == "" {
|
} else if pool.url.Path == "" {
|
||||||
pool.dir = "/"
|
pool.dir = "/"
|
||||||
@ -148,8 +148,8 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
|
|||||||
type Pool struct {
|
type Pool struct {
|
||||||
*pkg.Config
|
*pkg.Config
|
||||||
base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接
|
base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接
|
||||||
isDir bool // url是否以/结尾
|
|
||||||
dir string
|
dir string
|
||||||
|
isDir bool
|
||||||
url *url.URL
|
url *url.URL
|
||||||
Statistor *pkg.Statistor
|
Statistor *pkg.Statistor
|
||||||
client *ihttp.Client
|
client *ihttp.Client
|
||||||
@ -167,7 +167,7 @@ type Pool struct {
|
|||||||
random *pkg.Baseline
|
random *pkg.Baseline
|
||||||
index *pkg.Baseline
|
index *pkg.Baseline
|
||||||
baselines map[int]*pkg.Baseline
|
baselines map[int]*pkg.Baseline
|
||||||
urls map[string]int
|
urls map[string]struct{}
|
||||||
analyzeDone bool
|
analyzeDone bool
|
||||||
worder *words.Worder
|
worder *words.Worder
|
||||||
locker sync.Mutex
|
locker sync.Mutex
|
||||||
@ -278,6 +278,7 @@ Loop:
|
|||||||
}
|
}
|
||||||
|
|
||||||
pool.wg.Add(1)
|
pool.wg.Add(1)
|
||||||
|
pool.urls[u] = struct{}{}
|
||||||
pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // 原样的目录拼接, 输入了几个"/"就是几个, 适配java的目录解析
|
pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // 原样的目录拼接, 输入了几个"/"就是几个, 适配java的目录解析
|
||||||
case source := <-pool.checkCh:
|
case source := <-pool.checkCh:
|
||||||
pool.Statistor.CheckNumber++
|
pool.Statistor.CheckNumber++
|
||||||
@ -290,7 +291,12 @@ Loop:
|
|||||||
if !ok {
|
if !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
pool.reqPool.Invoke(unit)
|
if _, ok := pool.urls[unit.path]; ok {
|
||||||
|
pool.wg.Done()
|
||||||
|
} else {
|
||||||
|
pool.urls[unit.path] = struct{}{}
|
||||||
|
pool.reqPool.Invoke(unit)
|
||||||
|
}
|
||||||
case <-closeCh:
|
case <-closeCh:
|
||||||
break Loop
|
break Loop
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
@ -543,24 +549,18 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
|
|||||||
go func() {
|
go func() {
|
||||||
defer pool.wg.Done()
|
defer pool.wg.Done()
|
||||||
for _, u := range bl.URLs {
|
for _, u := range bl.URLs {
|
||||||
if u = FormatURL(bl.Url.Path, u); u == "" || u == pool.url.Path {
|
if u = FormatURL(bl.Url.Path, u); u == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
pool.locker.Lock()
|
// 通过map去重, 只有新的url才会进入到该逻辑
|
||||||
if _, ok := pool.urls[u]; ok {
|
pool.urls[u] = struct{}{}
|
||||||
pool.urls[u]++
|
pool.wg.Add(1)
|
||||||
} else {
|
pool.addAddition(&Unit{
|
||||||
// 通过map去重, 只有新的url才会进入到该逻辑
|
path: u,
|
||||||
pool.urls[u] = 1
|
source: CrawlSource,
|
||||||
pool.wg.Add(1)
|
depth: bl.ReqDepth + 1,
|
||||||
pool.addAddition(&Unit{
|
})
|
||||||
path: u,
|
|
||||||
source: CrawlSource,
|
|
||||||
depth: bl.ReqDepth + 1,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
pool.locker.Unlock()
|
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@ -571,7 +571,7 @@ func (pool *Pool) doRule(bl *pkg.Baseline) {
|
|||||||
pool.wg.Done()
|
pool.wg.Done()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if bl.Source == int(RuleSource) || bl.Dir {
|
if bl.Source == int(RuleSource) {
|
||||||
pool.wg.Done()
|
pool.wg.Done()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -702,14 +702,10 @@ func (pool *Pool) Close() {
|
|||||||
|
|
||||||
func (pool *Pool) safePath(u string) string {
|
func (pool *Pool) safePath(u string) string {
|
||||||
// 自动生成的目录将采用safepath的方式拼接到相对目录中, 避免出现//的情况. 例如init, check, common
|
// 自动生成的目录将采用safepath的方式拼接到相对目录中, 避免出现//的情况. 例如init, check, common
|
||||||
if u == "" {
|
if !pool.isDir && !strings.HasPrefix(u, "/") {
|
||||||
return pool.url.Path
|
|
||||||
}
|
|
||||||
|
|
||||||
if strings.HasPrefix(u, "/") {
|
|
||||||
// 如果path已经有"/", 则去掉
|
// 如果path已经有"/", 则去掉
|
||||||
return pool.dir + u[1:]
|
|
||||||
} else {
|
|
||||||
return pool.dir + u
|
return pool.dir + u
|
||||||
|
} else {
|
||||||
|
return pool.dir + u[1:]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user