调整爬虫的逻辑, 优化输出颜色

This commit is contained in:
M09Ic 2023-01-03 18:22:13 +08:00
parent 900dca32cb
commit 835d4663dd
2 changed files with 35 additions and 41 deletions

View File

@ -8,7 +8,6 @@ import (
"github.com/chainreactors/parsers" "github.com/chainreactors/parsers"
"github.com/chainreactors/spray/pkg/ihttp" "github.com/chainreactors/spray/pkg/ihttp"
"net/url" "net/url"
"path"
"strconv" "strconv"
"strings" "strings"
) )
@ -116,53 +115,18 @@ func (bl *Baseline) CollectURL() {
for _, reg := range JSRegexps { for _, reg := range JSRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
var filter bool if !filterJs(u[1]) {
parsed, err := url.Parse(u[1]) bl.URLs = append(bl.URLs, u[1])
if err != nil {
filter = true
} else {
for _, scoop := range BadScoop {
if scoop == parsed.Host {
filter = true
break
}
}
} }
if filter {
continue
}
bl.URLs = append(bl.URLs, u[1])
} }
} }
for _, reg := range URLRegexps { for _, reg := range URLRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
var filter bool if !filterUrl(u[1]) {
parsed, err := url.Parse(u[1]) bl.URLs = append(bl.URLs, u[1])
if err != nil {
filter = true
} else {
ext := path.Ext(parsed.Path)
for _, e := range BadExt {
if e == ext {
filter = true
break
}
}
for _, scoop := range BadScoop {
if scoop == parsed.Host {
filter = true
break
}
}
} }
if filter {
continue
}
bl.URLs = append(bl.URLs, u[1])
} }
} }
} }
@ -334,7 +298,7 @@ func (bl *Baseline) ColorString() string {
line.WriteString("\n") line.WriteString("\n")
} }
for _, u := range bl.URLs { for _, u := range bl.URLs {
line.WriteString("\t" + u + "\n") line.WriteString("\t" + logs.PurpleLine(u) + "\n")
} }
return line.String() return line.String()
} }

View File

@ -5,7 +5,9 @@ import (
"github.com/chainreactors/gogo/v2/pkg/utils" "github.com/chainreactors/gogo/v2/pkg/utils"
"github.com/chainreactors/ipcs" "github.com/chainreactors/ipcs"
"math/rand" "math/rand"
"net/url"
"os" "os"
"path"
"regexp" "regexp"
"strings" "strings"
"time" "time"
@ -144,6 +146,34 @@ var (
BadScoop = []string{"www.w3.org", "example.com"} BadScoop = []string{"www.w3.org", "example.com"}
) )
// filterJs reports whether a URL candidate extracted from JavaScript
// should be discarded because it mentions a known-noise host from BadScoop.
func filterJs(u string) bool {
	found := false
	for _, host := range BadScoop {
		if strings.Contains(u, host) {
			found = true
			break
		}
	}
	return found
}
// filterUrl reports whether the extracted URL u should be excluded from
// the collected results. A URL is filtered when it cannot be parsed, when
// its path carries an extension listed in BadExt, or when it contains one
// of the known-noise hosts in BadScoop.
func filterUrl(u string) bool {
	parsed, err := url.Parse(u)
	if err != nil {
		// Unparseable candidates are regex noise; drop them.
		return true
	}
	// Skip static-asset style paths (images, fonts, ...) by extension.
	ext := path.Ext(parsed.Path)
	for _, e := range BadExt {
		if e == ext {
			return true
		}
	}
	// Skip URLs that reference well-known placeholder/spec hosts.
	for _, scoop := range BadScoop {
		if strings.Contains(u, scoop) {
			return true
		}
	}
	return false
}
func URLJoin(base, uri string) string { func URLJoin(base, uri string) string {
baseSlash := strings.HasSuffix(base, "/") baseSlash := strings.HasSuffix(base, "/")
uriSlash := strings.HasPrefix(uri, "/") uriSlash := strings.HasPrefix(uri, "/")