spray/pkg/baseline.go
2023-12-28 14:34:19 +08:00

234 lines
5.7 KiB
Go

package pkg
import (
"bytes"
"github.com/chainreactors/parsers"
"github.com/chainreactors/spray/pkg/ihttp"
"github.com/chainreactors/utils/encode"
"github.com/chainreactors/utils/iutils"
"net/url"
"strings"
)
// NewBaseline builds a Baseline from a completed HTTP exchange.
// It snapshots the response header and (for non-binary responses) the body,
// normalizes the content type, records any redirect target and parses the
// request URL. On a URL parse failure the baseline is marked invalid with
// ErrUrlError.
func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
	bl := &Baseline{
		SprayResult: &parsers.SprayResult{
			UrlString:  u,
			Status:     resp.StatusCode(),
			IsValid:    true,
			Frameworks: make(parsers.Frameworks),
		},
	}

	// Normalize the content type; anything unknown becomes "other".
	if t, ok := ContentTypeMap[resp.ContentType()]; ok {
		bl.ContentType = t
		bl.Title = t + " data"
	} else {
		bl.ContentType = "other"
	}

	// Copy the header so the Baseline owns the bytes independently of the
	// (possibly reused) response buffer.
	header := resp.Header()
	bl.Header = make([]byte, len(header))
	copy(bl.Header, header)
	bl.HeaderLength = len(bl.Header)

	// Skip the body for binary content. ContentLength() == -1 indicates a
	// chunked response whose real length is only known after reading.
	if i := resp.ContentLength(); i != 0 && bl.ContentType != "bin" {
		body := resp.Body()
		bl.Body = make([]byte, len(body))
		copy(bl.Body, body)
		if i == -1 {
			bl.Chunked = true
			bl.BodyLength = len(bl.Body)
		} else {
			bl.BodyLength = i
		}
	}
	// bl.Header has cap == len, so this append always allocates a fresh
	// backing array for Raw instead of aliasing the header slice.
	bl.Raw = append(bl.Header, bl.Body...)

	// Some servers emit a lowercase "location" header; check both forms.
	if r := resp.GetHeader("Location"); r != "" {
		bl.RedirectURL = r
	} else {
		bl.RedirectURL = resp.GetHeader("location")
	}

	uu, err := url.Parse(u)
	if err == nil {
		bl.Path = uu.Path
		bl.Url = uu
		if uu.Host != host {
			bl.Host = host
		}
	} else {
		bl.IsValid = false
		bl.Reason = ErrUrlError.Error()
		bl.ErrString = err.Error()
	}
	// BUG FIX: Dir must be computed after bl.Path has been populated;
	// previously it was evaluated against the still-empty path and was
	// therefore always false.
	bl.Dir = bl.IsDir()
	bl.Unique = UniqueHash(bl)
	return bl
}
// NewInvalidBaseline builds a Baseline for a response that has already been
// rejected; reason explains why. The body is drained but not retained.
func NewInvalidBaseline(u, host string, resp *ihttp.Response, reason string) *Baseline {
	bl := &Baseline{
		SprayResult: &parsers.SprayResult{
			UrlString: u,
			Status:    resp.StatusCode(),
			IsValid:   false,
			Reason:    reason,
		},
	}

	// Drain the body even for invalid results, otherwise keep-alive
	// connections cannot be reused.
	resp.Body()
	bl.BodyLength = resp.ContentLength()

	// Consistent with NewBaseline: fall back to the lowercase header name.
	// (GetHeader already returns a string; the old string(...) wrapper was
	// redundant.)
	if r := resp.GetHeader("Location"); r != "" {
		bl.RedirectURL = r
	} else {
		bl.RedirectURL = resp.GetHeader("location")
	}

	uu, err := url.Parse(u)
	if err != nil {
		return bl
	}
	bl.Path = uu.Path
	bl.Url = uu
	// BUG FIX: compute Dir after Path is set; previously it was evaluated
	// against the still-empty path and was therefore always false.
	bl.Dir = bl.IsDir()
	if uu.Host != host {
		bl.Host = host
	}
	return bl
}
// Baseline holds everything collected about a single probed URL: the
// embedded SprayResult plus runtime-only raw response data used for page
// comparison and URL crawling. All fields below are excluded from JSON.
type Baseline struct {
	*parsers.SprayResult
	Unique    uint16   `json:"-"` // page hash from UniqueHash, used for deduplication
	Url       *url.URL `json:"-"` // parsed form of the request URL
	Dir       bool     `json:"-"` // result of IsDir (path ends with "/")
	Chunked   bool     `json:"-"` // true when the response had no Content-Length (chunked)
	Body      BS       `json:"-"` // response body snapshot
	Header    BS       `json:"-"` // response header snapshot
	Raw       BS       `json:"-"` // header + body concatenated
	Recu      bool     `json:"-"` // NOTE(review): presumably marks recursive scan targets — confirm with caller
	RecuDepth int      `json:"-"` // NOTE(review): presumably recursion depth — confirm with caller
	URLs      []string `json:"-"` // URLs extracted by CollectURL
	Collected bool     `json:"-"` // guards CollectURL against running twice
}
// IsDir reports whether the baseline's path looks like a directory,
// i.e. it ends with a trailing slash.
func (bl *Baseline) IsDir() bool {
	return strings.HasSuffix(bl.Path, "/")
}
// Collect performs deep information collection on the response: framework
// fingerprinting, title extraction, favicon hashing, content hashes and
// extractor rules, then refreshes the dedup hash.
func (bl *Baseline) Collect() {
	if bl.ContentType == "html" || bl.ContentType == "json" || bl.ContentType == "txt" {
		// The fingerprint library was not designed with js/css files in
		// mind; skipping non-essential types reduces false positives and
		// improves performance.
		bl.Frameworks = FingerDetect(bl.Raw)
	}
	if len(bl.Body) > 0 {
		if bl.ContentType == "html" {
			// Extract the <title>; AsciiEncode presumably normalizes
			// non-ASCII characters for display — TODO confirm.
			bl.Title = iutils.AsciiEncode(parsers.MatchTitle(bl.Body))
		} else if bl.ContentType == "ico" {
			// Favicon fingerprinting: look up the MD5 hash first, then
			// fall back to the mmh3 hash.
			if name, ok := Md5Fingers[encode.Md5Hash(bl.Body)]; ok {
				bl.Frameworks[name] = &parsers.Framework{Name: name}
			} else if name, ok := Mmh3Fingers[encode.Mmh3Hash32(bl.Body)]; ok {
				bl.Frameworks[name] = &parsers.Framework{Name: name}
			}
		}
	}
	bl.Hashes = parsers.NewHashes(bl.Raw)
	bl.Extracteds = Extractors.Extract(string(bl.Raw))
	// Re-derive the dedup hash now that more fields are populated.
	bl.Unique = UniqueHash(bl)
}
// CollectURL extracts JS and URL references from the response body using
// the "js" and "url" extractor regexps, deduplicates them into bl.URLs and
// records them as a "crawl" extraction. Safe to call repeatedly: only the
// first call does any work.
func (bl *Baseline) CollectURL() {
	if bl.Collected {
		// Guard against collecting twice.
		return
	}
	bl.Collected = true
	if len(bl.Body) == 0 {
		return
	}

	// Convert the body to a string once; every regexp below scans the
	// same text (the original converted inside each loop iteration set).
	body := string(bl.Body)
	collect := func(tag string, filtered func(string) bool) {
		for _, reg := range ExtractRegexps[tag][0].CompiledRegexps {
			for _, match := range reg.FindAllStringSubmatch(body, -1) {
				if u := formatURL(match[1]); u != "" && !filtered(u) {
					bl.URLs = append(bl.URLs, u)
				}
			}
		}
	}
	collect("js", filterJs)
	collect("url", filterUrl)

	bl.URLs = RemoveDuplication(bl.URLs)
	if bl.URLs != nil {
		bl.Extracteds = append(bl.Extracteds, &parsers.Extracted{
			Name:          "crawl",
			ExtractResult: bl.URLs,
		})
	}
}
// Compare reports how similar bl is to other:
//
//	 1 — definitely the same page
//	 0 — possibly the same page (fuzzy comparison needed)
//	-1 — definitely a different page
func (bl *Baseline) Compare(other *Baseline) int {
	// Identical non-empty redirect targets mean the same page.
	if other.RedirectURL != "" && bl.RedirectURL == other.RedirectURL {
		return 1
	}

	if bl.BodyLength == other.BodyLength {
		if bytes.Equal(bl.Body, other.Body) {
			// Equal length and equal content: fully identical.
			return 1
		}
		// Equal length but different bytes: likely a random token such
		// as a CSRF value; needs fuzzy judgement.
		return 0
	}

	// Lengths differ here, so diff is never zero; a difference under 16
	// bytes still smells like an embedded random value.
	if diff := bl.BodyLength - other.BodyLength; -16 < diff && diff < 16 {
		return 0
	}

	if strings.Contains(string(other.Body), other.Path) {
		// The body echoes its own path, which can inflate the length
		// difference; defer to fuzzy comparison.
		return 0
	}
	// Large difference and no path echo: different pages.
	// (An unreachable trailing `return -1` after the original if/else
	// chain has been removed.)
	return -1
}
// Distance is the simhash distance threshold for FuzzyCompare: smaller
// values mean more similar pages, 0 means identical.
var Distance uint8 = 5

// FuzzyCompare reports whether other is similar to bl by simhash distance,
// storing the computed distance on other as a side effect.
func (bl *Baseline) FuzzyCompare(other *Baseline) bool {
	// The raw (header+body) simhash is used deliberately: very short
	// bodies would skew a body-only simhash too much.
	other.Distance = encode.SimhashCompare(other.RawSimhash, bl.RawSimhash)
	return other.Distance < Distance
}