调整extract格式

This commit is contained in:
M09Ic 2023-02-08 15:30:40 +08:00
parent b3589db853
commit 5ace37824a
5 changed files with 25 additions and 29 deletions

View File

@@ -4,6 +4,7 @@ import (
"context" "context"
"fmt" "fmt"
"github.com/chainreactors/logs" "github.com/chainreactors/logs"
"github.com/chainreactors/parsers"
"github.com/chainreactors/parsers/iutils" "github.com/chainreactors/parsers/iutils"
"github.com/chainreactors/spray/internal" "github.com/chainreactors/spray/internal"
"github.com/chainreactors/spray/pkg" "github.com/chainreactors/spray/pkg"
@@ -63,7 +64,12 @@ func Spray() {
if reg, ok := pkg.ExtractRegexps[e]; ok { if reg, ok := pkg.ExtractRegexps[e]; ok {
pkg.Extractors[e] = reg pkg.Extractors[e] = reg
} else { } else {
pkg.Extractors[e] = []*regexp.Regexp{regexp.MustCompile(e)} pkg.Extractors[e] = []*parsers.Extractor{
&parsers.Extractor{
Name: e,
CompiledRegexps: []*regexp.Regexp{regexp.MustCompile(e)},
},
}
} }
} }
} }

2
go.mod
View File

@@ -8,7 +8,7 @@ require (
github.com/chainreactors/gogo/v2 v2.10.4 github.com/chainreactors/gogo/v2 v2.10.4
github.com/chainreactors/ipcs v0.0.13 github.com/chainreactors/ipcs v0.0.13
github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580 github.com/chainreactors/logs v0.7.1-0.20221214153111-85f123ff6580
github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599 github.com/chainreactors/parsers v0.3.1-0.20230208070438-6903b0d366c9
github.com/chainreactors/words v0.4.1-0.20230203115443-ca934844e361 github.com/chainreactors/words v0.4.1-0.20230203115443-ca934844e361
) )

2
go.sum
View File

@@ -28,6 +28,8 @@ github.com/chainreactors/parsers v0.3.1-0.20230201103008-e20167926b49 h1:snsLbWc
github.com/chainreactors/parsers v0.3.1-0.20230201103008-e20167926b49/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA= github.com/chainreactors/parsers v0.3.1-0.20230201103008-e20167926b49/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599 h1:9PwMZzN+RZDv2BUDvOG8e0N6W3XJQLVaP2AW6RD5mjM= github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599 h1:9PwMZzN+RZDv2BUDvOG8e0N6W3XJQLVaP2AW6RD5mjM=
github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA= github.com/chainreactors/parsers v0.3.1-0.20230204104401-6e150669e599/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/parsers v0.3.1-0.20230208070438-6903b0d366c9 h1:JCm8SmLb1jMFp5T6bBXKn3GmqPTjLxqWiz5yQKlo5Bs=
github.com/chainreactors/parsers v0.3.1-0.20230208070438-6903b0d366c9/go.mod h1:tA33N6UbYFnIT3k5tufOMfETxmEP20RZFyTSEnVXNUA=
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a h1:vRAMDJ6UQV73uyiRBQnuE/+S7Q7JTpfubSpyRlooZ2U= github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a h1:vRAMDJ6UQV73uyiRBQnuE/+S7Q7JTpfubSpyRlooZ2U=
github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w= github.com/chainreactors/words v0.3.2-0.20230105161651-7c1fc4c9605a/go.mod h1:QIWX1vMT5j/Mp9zx3/wgZh3FqskhjCbo/3Ffy/Hxj9w=
github.com/chainreactors/words v0.4.1-0.20230203114605-f305deb098a2 h1:51GoU85MLp/s8IvXcKLeedSxypkvZBFJWIBUlGV+MiI= github.com/chainreactors/words v0.4.1-0.20230203114605-f305deb098a2 h1:51GoU85MLp/s8IvXcKLeedSxypkvZBFJWIBUlGV+MiI=

View File

@@ -147,7 +147,7 @@ func (bl *Baseline) CollectURL() {
if len(bl.Body) == 0 { if len(bl.Body) == 0 {
return return
} }
for _, reg := range ExtractRegexps["js"] { for _, reg := range ExtractRegexps["js"][0].CompiledRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
u[1] = formatURL(u[1]) u[1] = formatURL(u[1])
@@ -157,7 +157,7 @@ func (bl *Baseline) CollectURL() {
} }
} }
for _, reg := range ExtractRegexps["url"] { for _, reg := range ExtractRegexps["url"][0].CompiledRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1) urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls { for _, u := range urls {
u[1] = formatURL(u[1]) u[1] = formatURL(u[1])

View File

@@ -11,7 +11,6 @@ import (
"net/url" "net/url"
"os" "os"
"path" "path"
"regexp"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@@ -19,27 +18,15 @@ import (
) )
var ( var (
Md5Fingers map[string]string = make(map[string]string) Md5Fingers map[string]string = make(map[string]string)
Mmh3Fingers map[string]string = make(map[string]string) Mmh3Fingers map[string]string = make(map[string]string)
Rules map[string]string = make(map[string]string) Rules map[string]string = make(map[string]string)
ActivePath []string ActivePath []string
Fingers fingers.Fingers Fingers fingers.Fingers
//JSRegexps []*regexp.Regexp = []*regexp.Regexp{ ExtractRegexps = map[string][]*parsers.Extractor{}
// regexp.MustCompile(`.(https{0,1}:[^\s'"”><()|*\[]{2,250}?[^=*\s'><:;|()[]{3}\[]\.js)`), Extractors = make(parsers.Extractors)
// regexp.MustCompile(`["']([^\s',"”><;()|*:\[]{2,250}?[^=*\s'|"”><^:;()\[]{3}\.js)`),
// regexp.MustCompile(`=\s{0,6}["']{0,1}\s{0,6}([^\s^',+><;()|*\[]{2,250}?[^=,\s'"”>|<:;*()\[]{3}\.js)`),
//}
//URLRegexps []*regexp.Regexp = []*regexp.Regexp{
// regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s'"><()|*\[]{2,250})`),
// regexp.MustCompile(`["']([^\s',"”><.@$;:()|*\[]{2,250}\.[a-zA-Z]\w{1,4})["']`),
// regexp.MustCompile(`["'](https?:[^\s'"><()@|*\[]{2,250}?\.[^\s',"”><;()|*\[]{2,250}?)["']`),
// regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s'",><;@$()|*\[]{2,250}?)\s{0,6}["']`),
// regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',"”><$@;()|*\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'"“><)(]{2,250})`),
//}
ExtractRegexps map[string][]*regexp.Regexp = map[string][]*regexp.Regexp{}
Extractors = make(parsers.Extractors)
BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4"} BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4", ".zip", ".rar"}
BadURL = []string{";", "}", "\\n", "webpack://", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path"} BadURL = []string{";", "}", "\\n", "webpack://", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path"}
ContentTypeMap = map[string]string{ ContentTypeMap = map[string]string{
@@ -210,12 +197,12 @@ func LoadTemplates() error {
for _, extract := range extracts { for _, extract := range extracts {
extract.Compile() extract.Compile()
ExtractRegexps[extract.Name] = extract.CompiledRegexps ExtractRegexps[extract.Name] = []*parsers.Extractor{extract}
for _, tag := range extract.Tags { for _, tag := range extract.Tags {
if _, ok := ExtractRegexps[tag]; !ok { if _, ok := ExtractRegexps[tag]; !ok {
ExtractRegexps[tag] = extract.CompiledRegexps ExtractRegexps[tag] = []*parsers.Extractor{extract}
} else { } else {
ExtractRegexps[tag] = append(ExtractRegexps[tag], extract.CompiledRegexps...) ExtractRegexps[tag] = append(ExtractRegexps[tag], extract)
} }
} }
} }
@@ -354,6 +341,7 @@ func CRC16Hash(data []byte) uint16 {
} }
func UniqueHash(bl *Baseline) uint16 { func UniqueHash(bl *Baseline) uint16 {
// 由host+状态码+重定向url+content-type+title+length舍去个位与十位组成的hash, 没有body length, 因为可能存在随机值 // 由host+状态码+重定向url+content-type+title+length舍去个位与十位组成的hash
// body length可能会导致一些误报, 目前没有更好的解决办法
return CRC16Hash([]byte(bl.Host + strconv.Itoa(bl.Status) + bl.RedirectURL + bl.ContentType + bl.Title + strconv.Itoa(bl.BodyLength/100*100))) return CRC16Hash([]byte(bl.Host + strconv.Itoa(bl.Status) + bl.RedirectURL + bl.ContentType + bl.Title + strconv.Itoa(bl.BodyLength/100*100)))
} }