mirror of
https://github.com/chainreactors/spray.git
synced 2025-09-15 11:40:13 +00:00
Heavily optimized the crawler: better performance, deduplication, improved directory joining, and support for "./" relative paths.
Fuzzy output is now enabled automatically when debug is on.
This commit is contained in:
parent f8b84c733b
commit f24c7b3bc6
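The headline change is the new "./" handling in doCrawl (see the pool.go hunks below). As a rough illustration of the joining rule, here is a minimal, self-contained sketch; urlJoin and resolveRelative are hypothetical stand-ins for the repo's pkg.URLJoin and the inlined crawl logic, not functions added by this commit:

package main

import (
	"fmt"
	"path"
	"strings"
)

// urlJoin is a stand-in for pkg.URLJoin: join two segments with exactly one "/".
func urlJoin(base, sub string) string {
	return strings.TrimSuffix(base, "/") + "/" + strings.TrimPrefix(sub, "/")
}

// resolveRelative mirrors the doCrawl logic below: a "./" link is resolved
// against the baseline path itself when it is a directory, otherwise against
// its parent directory.
func resolveRelative(basePath string, isDir bool, u string) string {
	if !strings.HasPrefix(u, "./") {
		return u
	}
	if isDir {
		return urlJoin(basePath, u[2:])
	}
	return urlJoin(path.Dir(basePath), u[2:])
}

func main() {
	fmt.Println(resolveRelative("/app/", true, "./admin"))       // /app/admin
	fmt.Println(resolveRelative("/app/index.php", false, "./x")) // /app/x
}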
@@ -232,14 +232,15 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
 		logs.Log.Importantf("Loaded %d word from %s", len(dicts[i]), f)
 	}

-	if len(opt.Dictionaries) > 0 && opt.Word == "" {
+	if len(opt.Dictionaries) == 0 && opt.Word == "" {
+		// only advanced features are in use; prevents an error when no dictionary is loaded
+		opt.Word = "/"
+	} else {
 		opt.Word = "{?"
 		for i, _ := range dicts {
 			opt.Word += strconv.Itoa(i)
 		}
 		opt.Word += "}"
-	} else {
-		opt.Word = "/"
 	}

 	if opt.Suffixes != nil {
@@ -376,7 +377,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
 	if opt.RemoveExtensions != "" {
 		rexts := strings.Split(opt.ExcludeExtensions, ",")
 		r.Fns = append(r.Fns, func(s string) string {
-			if ext := parseExtension(s); StringsContains(rexts, ext) {
+			if ext := parseExtension(s); pkg.StringsContains(rexts, ext) {
 				return strings.TrimSuffix(s, "."+ext)
 			}
 			return s
@@ -386,7 +387,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
 	if opt.ExcludeExtensions != "" {
 		exexts := strings.Split(opt.ExcludeExtensions, ",")
 		r.Fns = append(r.Fns, func(s string) string {
-			if ext := parseExtension(s); StringsContains(exexts, ext) {
+			if ext := parseExtension(s); pkg.StringsContains(exexts, ext) {
 				return ""
 			}
 			return s
@@ -221,12 +221,6 @@ func (pool *Pool) Run(ctx context.Context, offset, limit int) {
 	}

-	closeCh := make(chan struct{})
-	//go func() {
-	//	select {
-	//	case <-worderDone:
-	//		closeCh <- struct{}{}
-	//	}
-	//}()
 	var worderDone bool
 	wait := func() {
 		if !worderDone {
@@ -279,9 +273,6 @@ Loop:
 	}

 	pool.wg.Wait()
-	for pool.analyzeDone {
-		time.Sleep(time.Duration(100) * time.Millisecond)
-	}
 	pool.Statistor.EndTime = time.Now().Unix()
 	pool.Close()
 }
@@ -345,8 +336,8 @@ func (pool *Pool) Invoke(v interface{}) {
 		bl.Collect()
 		pool.locker.Lock()
 		pool.random = bl
-		pool.locker.Unlock()
 		pool.addFuzzyBaseline(bl)
+		pool.locker.Unlock()
 		pool.initwg.Done()
 	case InitIndexSource:
 		bl.Collect()
@@ -354,10 +345,10 @@ func (pool *Pool) Invoke(v interface{}) {
 		pool.index = bl
 		pool.locker.Unlock()
 		pool.wg.Add(1)
-		pool.doCrawl(bl)
 		if bl.Status == 200 || (bl.Status/100) == 3 {
 			pool.OutputCh <- bl
 		}
+		pool.doCrawl(bl)
 		pool.initwg.Done()
 	case CheckSource:
 		if bl.ErrString != "" {
@@ -399,7 +390,7 @@ func (pool *Pool) Invoke(v interface{}) {

 func (pool *Pool) PreCompare(resp *ihttp.Response) error {
 	status := resp.StatusCode()
-	if IntsContains(WhiteStatus, status) {
+	if pkg.IntsContains(WhiteStatus, status) {
 		// return immediately for whitelisted status codes
 		return nil
 	}
@@ -407,11 +398,11 @@ func (pool *Pool) PreCompare(resp *ihttp.Response) error {
 		return ErrSameStatus
 	}

-	if IntsContains(BlackStatus, status) {
+	if pkg.IntsContains(BlackStatus, status) {
 		return ErrBadStatus
 	}

-	if IntsContains(WAFStatus, status) {
+	if pkg.IntsContains(WAFStatus, status) {
 		return ErrWaf
 	}

@@ -505,17 +496,47 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
 		return
 	}
 	bl.CollectURL()
 	if bl.URLs == nil {
+		pool.wg.Done()
 		return
 	}
 	go func() {
 		defer pool.wg.Done()
 		for _, u := range bl.URLs {
 			if strings.HasPrefix(u, "//") {
-				u = bl.Url.Scheme + u
+				parsed, err := url.Parse(u)
+				if err != nil {
+					continue
+				}
+				if parsed.Host != bl.Url.Host {
+					continue
+				}
+				u = parsed.Path
 			} else if strings.HasPrefix(u, "/") {
 				// absolute path join
-				u = pkg.URLJoin(pool.BaseURL, u)
+				// nothing to do here; this branch just skips the checks below
+			} else if strings.HasPrefix(u, "./") {
+				// "./" relative path join
+				if bl.Dir {
+					u = pkg.URLJoin(bl.Url.Path, u[2:])
+				} else {
+					u = pkg.URLJoin(path.Dir(bl.Url.Path), u[2:])
+				}
 			} else if !strings.HasPrefix(u, "http") {
 				// relative path join
-				u = pkg.URLJoin(pool.BaseURL, u)
+				if bl.Dir {
+					u = pkg.URLJoin(bl.Url.Path, u)
+				} else {
+					u = pkg.URLJoin(path.Dir(bl.Url.Path), u)
+				}
+			} else {
+				parsed, err := url.Parse(u)
+				if err != nil {
+					continue
+				}
+				if parsed.Host != bl.Url.Host {
+					continue
+				}
 			}

 			if _, ok := pool.urls[u]; ok {
@@ -526,17 +547,9 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) {
 			pool.urls[u] = 1
 			pool.locker.Unlock()
 			if bl.ReqDepth < maxCrawl {
-				parsed, err := url.Parse(u)
-				if err != nil {
-					continue
-				}
-				if parsed.Host != bl.Url.Host {
-					// automatically restrict scope so other sites are not crawled
-					continue
-				}
 				pool.wg.Add(1)
 				pool.addAddition(&Unit{
-					path:   parsed.Path,
+					path:   u[1:],
 					source: CrawlSource,
 					depth:  bl.ReqDepth + 1,
 				})
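The dedup above (pool.urls guarded by pool.locker) is a standard mutex-protected visited set. A minimal sketch of the pattern, with hypothetical names (visited, markSeen) rather than the pool's actual fields:

package main

import (
	"fmt"
	"sync"
)

// visited is a stand-in for pool.urls plus pool.locker: it reports whether a
// URL was already queued, recording it on first sight.
type visited struct {
	mu   sync.Mutex
	urls map[string]int
}

func (v *visited) markSeen(u string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()
	if _, ok := v.urls[u]; ok {
		return true // already crawled or queued; skip
	}
	v.urls[u] = 1
	return false
}

func main() {
	v := &visited{urls: make(map[string]int)}
	for _, u := range []string{"/a", "/b", "/a"} {
		if !v.markSeen(u) {
			fmt.Println("queue", u) // queues /a and /b; the second /a is dropped
		}
	}
}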
@@ -645,7 +658,7 @@ func (pool *Pool) addAddition(u *Unit) {
 }

 func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
-	if _, ok := pool.baselines[bl.Status]; !ok && IntsContains(FuzzyStatus, bl.Status) {
+	if _, ok := pool.baselines[bl.Status]; !ok && pkg.IntsContains(FuzzyStatus, bl.Status) {
 		bl.Collect()
 		pool.wg.Add(1)
 		pool.doCrawl(bl)
@@ -326,6 +326,13 @@ func (r *Runner) Done() {
 }

 func (r *Runner) Outputting() {
+	debugPrint := func(bl *pkg.Baseline) {
+		if r.Color {
+			logs.Log.Debug(bl.ColorString())
+		} else {
+			logs.Log.Debug(bl.String())
+		}
+	}
 	go func() {
 		var saveFunc func(*pkg.Baseline)

@@ -355,7 +362,6 @@ func (r *Runner) Outputting() {
 					logs.Log.Console("[+] " + bl.String() + "\n")
 				}
 			}
-
 		}
 	}
@@ -375,11 +381,7 @@ func (r *Runner) Outputting() {
 				r.AddPool(&Task{baseUrl: bl.UrlString, depth: bl.RecuDepth + 1})
 			}
 		} else {
-			if r.Color {
-				logs.Log.Debug(bl.ColorString())
-			} else {
-				logs.Log.Debug(bl.String())
-			}
+			debugPrint(bl)
 		}
 	}
 }
@@ -411,6 +413,8 @@ func (r *Runner) Outputting() {
 			}
 			if r.Fuzzy {
 				fuzzySaveFunc(bl)
+			} else {
+				debugPrint(bl)
 			}
 		}
 	}
@@ -16,24 +16,6 @@ func parseExtension(s string) string {
 	return ""
 }

-func StringsContains(s []string, e string) bool {
-	for _, v := range s {
-		if v == e {
-			return true
-		}
-	}
-	return false
-}
-
-func IntsContains(s []int, e int) bool {
-	for _, v := range s {
-		if v == e {
-			return true
-		}
-	}
-	return false
-}
-
 func loadFileToSlice(filename string) ([]string, error) {
 	var ss []string
 	content, err := ioutil.ReadFile(filename)
@@ -18,9 +18,9 @@ func GetSourceName(s int) string {
 	case 1:
 		return "check"
 	case 2:
-		return "index"
-	case 3:
 		return "random"
+	case 3:
+		return "index"
 	case 4:
 		return "redirect"
 	case 5:
@@ -177,7 +177,7 @@ func (bl *Baseline) CollectURL() {
 		urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
 		for _, u := range urls {
 			if !filterJs(u[1]) {
-				bl.URLs = append(bl.URLs, u[1])
+				bl.URLs = append(bl.URLs, formatURL(u[1]))
 			}
 		}
 	}
@@ -186,7 +186,7 @@ func (bl *Baseline) CollectURL() {
 		urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
 		for _, u := range urls {
 			if !filterUrl(u[1]) {
-				bl.URLs = append(bl.URLs, u[1])
+				bl.URLs = append(bl.URLs, formatURL(u[1]))
 			}
 		}
 	}
@@ -194,7 +194,7 @@ func (bl *Baseline) CollectURL() {
 	if bl.URLs != nil {
 		bl.Extracteds = append(bl.Extracteds, &fingers.Extracted{
 			Name:          "crawl",
-			ExtractResult: bl.URLs,
+			ExtractResult: RemoveDuplication(bl.URLs),
 		})
 	}
 }
pkg/utils.go
@@ -77,6 +77,22 @@ func IntsContains(s []int, e int) bool {
 	return false
 }

+func RemoveDuplication(arr []string) []string {
+	set := make(map[string]struct{}, len(arr))
+	j := 0
+	for _, v := range arr {
+		_, ok := set[v]
+		if ok {
+			continue
+		}
+		set[v] = struct{}{}
+		arr[j] = v
+		j++
+	}
+
+	return arr[:j]
+}
+
 func HasStdin() bool {
 	stat, err := os.Stdin.Stat()
 	if err != nil {
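RemoveDuplication dedupes in place: it keeps the first occurrence of each string, compacts the survivors toward the front of the same backing array, and returns the shortened slice, so no second allocation is needed. A usage sketch (assuming this repo's pkg package imports as github.com/chainreactors/spray/pkg):

package main

import (
	"fmt"

	"github.com/chainreactors/spray/pkg" // assumed import path for this repo
)

func main() {
	urls := []string{"/a", "/b", "/a", "/c", "/b"}
	urls = pkg.RemoveDuplication(urls)
	fmt.Println(urls) // [/a /b /c]: first occurrences, original order preserved
}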
@@ -210,20 +226,22 @@ func FingerDetect(content string) Frameworks {
 var (
 	BadExt = []string{".js", ".css", ".scss", ",", ".jpeg", ".jpg", ".png", ".gif", ".ico", ".svg", ".vue", ".ts"}
 	//BadURL = []string{".js?", ".css?", ".jpeg?", ".jpg?", ".png?", ".gif?", "github.com", "www.w3.org", "example.com", "<", ">", "{", "}", "[", "]", "|", "^", ";", "/js/", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*", "\\n"}
-	BadScoop = []string{"www.w3.org", "example.com"}
+	BadURL = []string{"www.w3.org", "example.com", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"}
 )

 func filterJs(u string) bool {
-	for _, scoop := range BadScoop {
-		if strings.Contains(u, scoop) {
-			return true
-		}
+	if commonFilter(u) {
+		return true
 	}

 	return false
 }

 func filterUrl(u string) bool {
+	if commonFilter(u) {
+		return true
+	}
+
 	parsed, err := url.Parse(u)
 	if err != nil {
 		return true
@@ -235,7 +253,26 @@ func filterUrl(u string) bool {
 			}
 		}
 	}
-	for _, scoop := range BadScoop {
 	return false
 }

+func formatURL(u string) string {
+	// strip fragment and params: saves url.Parse work and avoids unexpected effects from parameterized URLs
+	if i := strings.Index(u, "?"); i != -1 {
+		return u[:i]
+	}
+	if i := strings.Index(u, "#"); i != -1 {
+		return u[:i]
+	}
+	return u
+}
+
+func commonFilter(u string) bool {
+	if strings.HasPrefix(u, "http") && len(u) < 9 {
+		return true
+	}
+
+	for _, scoop := range BadURL {
+		if strings.Contains(u, scoop) {
+			return true
+		}
+	}
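formatURL cuts the URL at the first "?" or "#", so query strings and fragments never reach url.Parse or the crawl queue. Since the helper is unexported, this sketch inlines the same logic to show the expected behavior:

package main

import (
	"fmt"
	"strings"
)

// formatURL mirrors the unexported helper above: drop everything from the
// first "?" (or, failing that, the first "#") onward.
func formatURL(u string) string {
	if i := strings.Index(u, "?"); i != -1 {
		return u[:i]
	}
	if i := strings.Index(u, "#"); i != -1 {
		return u[:i]
	}
	return u
}

func main() {
	fmt.Println(formatURL("/login?next=/home")) // /login
	fmt.Println(formatURL("/docs#install"))     // /docs
	fmt.Println(formatURL("/plain/path"))       // /plain/path
}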