mirror of
https://github.com/chainreactors/spray.git
synced 2025-05-07 03:01:25 +00:00
调整爬虫的逻辑, 优化输出颜色
This commit is contained in:
parent
900dca32cb
commit
835d4663dd
@ -8,7 +8,6 @@ import (
|
||||
"github.com/chainreactors/parsers"
|
||||
"github.com/chainreactors/spray/pkg/ihttp"
|
||||
"net/url"
|
||||
"path"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@ -116,53 +115,18 @@ func (bl *Baseline) CollectURL() {
|
||||
for _, reg := range JSRegexps {
|
||||
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
||||
for _, u := range urls {
|
||||
var filter bool
|
||||
parsed, err := url.Parse(u[1])
|
||||
if err != nil {
|
||||
filter = true
|
||||
} else {
|
||||
for _, scoop := range BadScoop {
|
||||
if scoop == parsed.Host {
|
||||
filter = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !filterJs(u[1]) {
|
||||
bl.URLs = append(bl.URLs, u[1])
|
||||
}
|
||||
|
||||
if filter {
|
||||
continue
|
||||
}
|
||||
bl.URLs = append(bl.URLs, u[1])
|
||||
}
|
||||
}
|
||||
|
||||
for _, reg := range URLRegexps {
|
||||
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
||||
for _, u := range urls {
|
||||
var filter bool
|
||||
parsed, err := url.Parse(u[1])
|
||||
if err != nil {
|
||||
filter = true
|
||||
} else {
|
||||
ext := path.Ext(parsed.Path)
|
||||
for _, e := range BadExt {
|
||||
if e == ext {
|
||||
filter = true
|
||||
break
|
||||
}
|
||||
}
|
||||
for _, scoop := range BadScoop {
|
||||
if scoop == parsed.Host {
|
||||
filter = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !filterUrl(u[1]) {
|
||||
bl.URLs = append(bl.URLs, u[1])
|
||||
}
|
||||
|
||||
if filter {
|
||||
continue
|
||||
}
|
||||
bl.URLs = append(bl.URLs, u[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -334,7 +298,7 @@ func (bl *Baseline) ColorString() string {
|
||||
line.WriteString("\n")
|
||||
}
|
||||
for _, u := range bl.URLs {
|
||||
line.WriteString("\t" + u + "\n")
|
||||
line.WriteString("\t" + logs.PurpleLine(u) + "\n")
|
||||
}
|
||||
return line.String()
|
||||
}
|
||||
|
30
pkg/utils.go
30
pkg/utils.go
@ -5,7 +5,9 @@ import (
|
||||
"github.com/chainreactors/gogo/v2/pkg/utils"
|
||||
"github.com/chainreactors/ipcs"
|
||||
"math/rand"
|
||||
"net/url"
|
||||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
@ -144,6 +146,34 @@ var (
|
||||
BadScoop = []string{"www.w3.org", "example.com"}
|
||||
)
|
||||
|
||||
func filterJs(u string) bool {
|
||||
for _, scoop := range BadScoop {
|
||||
if strings.Contains(u, scoop) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func filterUrl(u string) bool {
|
||||
parsed, err := url.Parse(u)
|
||||
if err != nil {
|
||||
return true
|
||||
} else {
|
||||
ext := path.Ext(parsed.Path)
|
||||
for _, e := range BadExt {
|
||||
if e == ext {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, scoop := range BadScoop {
|
||||
if strings.Contains(u, scoop) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
func URLJoin(base, uri string) string {
|
||||
baseSlash := strings.HasSuffix(base, "/")
|
||||
uriSlash := strings.HasPrefix(uri, "/")
|
||||
|
Loading…
x
Reference in New Issue
Block a user