Adjust extract format

This commit is contained in:
M09Ic 2023-02-08 15:30:40 +08:00
parent b3589db853
commit 5ace37824a
5 changed files with 25 additions and 29 deletions

View File

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"github.com/chainreactors/logs"
"github.com/chainreactors/parsers"
"github.com/chainreactors/parsers/iutils"
"github.com/chainreactors/spray/internal"
"github.com/chainreactors/spray/pkg"
@@ -63,7 +64,12 @@ func Spray() {
if reg, ok := pkg.ExtractRegexps[e]; ok {
pkg.Extractors[e] = reg
} else {
pkg.Extractors[e] = []*regexp.Regexp{regexp.MustCompile(e)}
pkg.Extractors[e] = []*parsers.Extractor{
&parsers.Extractor{
Name: e,
CompiledRegexps: []*regexp.Regexp{regexp.MustCompile(e)},
},
}
}
}
}
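For context: the hunk above changes the value stored in pkg.Extractors for a user-supplied pattern from a bare []*regexp.Regexp to a []*parsers.Extractor. A minimal standalone sketch of that wrapping (hypothetical helper, not part of this commit, assuming only the parsers.Extractor fields that appear in this diff):

package extractdemo

import (
	"regexp"

	"github.com/chainreactors/parsers"
)

// wrapPattern mirrors the change above: a raw pattern string is wrapped into
// a single *parsers.Extractor whose Name is the pattern itself and whose
// CompiledRegexps holds the compiled expression.
func wrapPattern(expr string) []*parsers.Extractor {
	return []*parsers.Extractor{
		{
			Name:            expr,
			CompiledRegexps: []*regexp.Regexp{regexp.MustCompile(expr)},
		},
	}
}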

2
go.mod
View File

@@ -8,7 +8,7 @@ require (
github.com/chainreactors/gogo/v2 v2.10.4
github.com/chainreactors/ipcs v0.0.13
github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580
github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599
github.com/chainreactors/parsers v0.3.1-0.20230208070438-6903b0d366c9
github.com/chainreactors/words v0.4.1-0.20230203115443-ca934844e361
)

2
go.sum
View File

@@ -28,6 +28,8 @@ github.com/chainreactors/parsers v0.3.1-0.20230201103008-e20167926b49 h1:snsLbWc
github.com/chainreactors/parsers v0.3.1-0.20230201103008-e20167926b49/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599 h1:9PwMZzN+RZDv2BUDvOG8e0N6W3XJQLVaP2AW6RD5mjM=
github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/parsers v0.3.1-0.20230208070438-6903b0d366c9 h1:JCm8SmLb1jMFp5T6bBXKn3GmqPTjLxqWiz5yQKlo5Bs=
github.com/chainreactors/parsers v0.3.1-0.20230208070438-6903b0d366c9/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a h1:vRAMDJ6UQV73uyiRBQnuE/+S7Q7JTpfubSpyRlooZ2U=
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.4.1-0.20230203114605-f305deb098a2 h1:51GoU85MLp/s8IvXcKLeedSxypkvZBFJWIBUlGV+MiI=

View File

@@ -147,7 +147,7 @@ func (bl *Baseline) CollectURL() {
if len(bl.Body) == 0 {
return
}
for _, reg := range ExtractRegexps["js"] {
for _, reg := range ExtractRegexps["js"][0].CompiledRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls {
u[1] = formatURL(u[1])
@@ -157,7 +157,7 @@ func (bl *Baseline) CollectURL() {
}
}
for _, reg := range ExtractRegexps["url"] {
for _, reg := range ExtractRegexps["url"][0].CompiledRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls {
u[1] = formatURL(u[1])
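Because each ExtractRegexps key now maps to []*parsers.Extractor rather than []*regexp.Regexp, the two hunks above drill into [0].CompiledRegexps. A hedged sketch of the more general form, running every compiled regexp of every extractor registered under a key (hypothetical helper, not part of this commit):

package extractdemo

import "github.com/chainreactors/parsers"

// collectMatches runs all compiled regexps of all extractors against body and
// returns the first capture group of every match.
func collectMatches(exts []*parsers.Extractor, body []byte) []string {
	var out []string
	for _, ext := range exts {
		for _, reg := range ext.CompiledRegexps {
			for _, m := range reg.FindAllStringSubmatch(string(body), -1) {
				if len(m) > 1 {
					out = append(out, m[1])
				}
			}
		}
	}
	return out
}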

View File

@@ -11,7 +11,6 @@ import (
"net/url"
"os"
"path"
"regexp"
"strconv"
"strings"
"time"
@@ -24,22 +23,10 @@ var (
Rules map[string]string = make(map[string]string)
ActivePath []string
Fingers fingers.Fingers
//JSRegexps []*regexp.Regexp = []*regexp.Regexp{
// regexp.MustCompile(`.(https{0,1}:[^\s'"”><()|*\[]{2,250}?[^=*\s'><:;|()[]{3}\[]\.js)`),
// regexp.MustCompile(`["']([^\s',"”><;()|*:\[]{2,250}?[^=*\s'|"”><^:;()\[]{3}\.js)`),
// regexp.MustCompile(`=\s{0,6}["']{0,1}\s{0,6}([^\s^',+><;()|*\[]{2,250}?[^=,\s'"”>|<:;*()\[]{3}\.js)`),
//}
//URLRegexps []*regexp.Regexp = []*regexp.Regexp{
// regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s'"><()|*\[]{2,250})`),
// regexp.MustCompile(`["']([^\s',"”><.@$;:()|*\[]{2,250}\.[a-zA-Z]\w{1,4})["']`),
// regexp.MustCompile(`["'](https?:[^\s'"><()@|*\[]{2,250}?\.[^\s',"”><;()|*\[]{2,250}?)["']`),
// regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s'",><;@$()|*\[]{2,250}?)\s{0,6}["']`),
// regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',"”><$@;()|*\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'"“><)(]{2,250})`),
//}
ExtractRegexps map[string][]*regexp.Regexp = map[string][]*regexp.Regexp{}
ExtractRegexps = map[string][]*parsers.Extractor{}
Extractors = make(parsers.Extractors)
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4"}
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4", ".zip", ".rar"}
BadURL = []string{";", "}", "\\n", "webpack://", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path"}
ContentTypeMap = map[string]string{
@@ -210,12 +197,12 @@ func LoadTemplates() error {
for _, extract := range extracts {
extract.Compile()
ExtractRegexps[extract.Name] = extract.CompiledRegexps
ExtractRegexps[extract.Name] = []*parsers.Extractor{extract}
for _, tag := range extract.Tags {
if _, ok := ExtractRegexps[tag]; !ok {
ExtractRegexps[tag] = extract.CompiledRegexps
ExtractRegexps[tag] = []*parsers.Extractor{extract}
} else {
ExtractRegexps[tag] = append(ExtractRegexps[tag], extract.CompiledRegexps...)
ExtractRegexps[tag] = append(ExtractRegexps[tag], extract)
}
}
}
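The LoadTemplates hunk above registers each compiled extractor under its own name (always a single-element slice) and appends it under every tag, so one tag key can fan out to several extractors. A compact sketch of that registration step (hypothetical helper, assuming the Name/Tags fields shown in this diff):

package extractdemo

import "github.com/chainreactors/parsers"

// register stores an extractor under its name and appends it to every tag
// bucket; appending to a nil slice also covers the "tag not seen yet" case.
func register(registry map[string][]*parsers.Extractor, ext *parsers.Extractor) {
	registry[ext.Name] = []*parsers.Extractor{ext}
	for _, tag := range ext.Tags {
		registry[tag] = append(registry[tag], ext)
	}
}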
@@ -354,6 +341,7 @@ func CRC16Hash(data []byte) uint16 {
}
func UniqueHash(bl *Baseline) uint16 {
// hash composed of host + status code + redirect URL + content-type + title + length with the ones and tens digits dropped; no body length, since it may contain random values
// hash composed of host + status code + redirect URL + content-type + title + length with the ones and tens digits dropped
// body length may cause some false positives; there is no better solution for now
return CRC16Hash([]byte(bl.Host + strconv.Itoa(bl.Status) + bl.RedirectURL + bl.ContentType + bl.Title + strconv.Itoa(bl.BodyLength/100*100)))
}
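The updated comment reflects that the hash now folds in a bucketed body length: bl.BodyLength/100*100 drops the ones and tens digits, so small random differences in length do not change the hash input. A tiny self-contained demo of that bucketing (illustration only, not code from this commit):

package main

import "fmt"

// bucket mirrors the BodyLength/100*100 term above: integer division followed
// by multiplication zeroes the last two decimal digits.
func bucket(length int) int {
	return length / 100 * 100
}

func main() {
	fmt.Println(bucket(4273), bucket(4299), bucket(4301)) // 4200 4200 4300
}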