2022-11-10 21:03:07 +08:00
|
|
|
package pkg
|
2022-09-08 15:57:17 +08:00
|
|
|
|
|
|
|
import (
|
2022-12-11 01:21:05 +08:00
|
|
|
"bytes"
|
2022-09-19 14:42:29 +08:00
|
|
|
"github.com/chainreactors/parsers"
|
2022-10-26 18:28:40 +08:00
|
|
|
"github.com/chainreactors/spray/pkg/ihttp"
|
2023-12-28 14:34:19 +08:00
|
|
|
"github.com/chainreactors/utils/encode"
|
|
|
|
"github.com/chainreactors/utils/iutils"
|
2022-11-09 16:05:17 +08:00
|
|
|
"net/url"
|
2022-09-08 15:57:17 +08:00
|
|
|
"strings"
|
|
|
|
)
|
|
|
|
|
2022-11-10 21:03:07 +08:00
|
|
|
func NewBaseline(u, host string, resp *ihttp.Response) *Baseline {
|
|
|
|
bl := &Baseline{
|
2023-02-01 18:31:50 +08:00
|
|
|
SprayResult: &parsers.SprayResult{
|
|
|
|
UrlString: u,
|
|
|
|
Status: resp.StatusCode(),
|
|
|
|
IsValid: true,
|
|
|
|
Frameworks: make(parsers.Frameworks),
|
|
|
|
},
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
2023-02-08 19:18:33 +08:00
|
|
|
|
|
|
|
if t, ok := ContentTypeMap[resp.ContentType()]; ok {
|
|
|
|
bl.ContentType = t
|
|
|
|
bl.Title = t + " data"
|
|
|
|
} else {
|
|
|
|
bl.ContentType = "other"
|
|
|
|
}
|
|
|
|
|
2023-01-06 15:06:40 +08:00
|
|
|
header := resp.Header()
|
|
|
|
bl.Header = make([]byte, len(header))
|
|
|
|
copy(bl.Header, header)
|
|
|
|
bl.HeaderLength = len(bl.Header)
|
|
|
|
|
2023-02-08 19:18:33 +08:00
|
|
|
if i := resp.ContentLength(); i != 0 && bl.ContentType != "bin" {
|
2023-01-10 11:59:43 +08:00
|
|
|
body := resp.Body()
|
|
|
|
bl.Body = make([]byte, len(body))
|
|
|
|
copy(bl.Body, body)
|
|
|
|
|
|
|
|
if i == -1 {
|
2023-01-12 16:35:34 +08:00
|
|
|
bl.Chunked = true
|
2023-01-10 11:59:43 +08:00
|
|
|
bl.BodyLength = len(bl.Body)
|
|
|
|
} else {
|
|
|
|
bl.BodyLength = i
|
|
|
|
}
|
2023-01-06 15:06:40 +08:00
|
|
|
}
|
|
|
|
|
2022-09-23 01:20:01 +08:00
|
|
|
bl.Raw = append(bl.Header, bl.Body...)
|
2023-02-07 18:37:19 +08:00
|
|
|
if r := resp.GetHeader("Location"); r != "" {
|
|
|
|
bl.RedirectURL = r
|
|
|
|
} else {
|
|
|
|
bl.RedirectURL = resp.GetHeader("location")
|
|
|
|
}
|
2023-01-10 23:44:03 +08:00
|
|
|
|
2023-01-12 16:35:34 +08:00
|
|
|
bl.Dir = bl.IsDir()
|
2023-01-10 23:44:03 +08:00
|
|
|
uu, err := url.Parse(u)
|
|
|
|
if err == nil {
|
|
|
|
bl.Path = uu.Path
|
|
|
|
bl.Url = uu
|
2023-04-25 17:28:08 +08:00
|
|
|
if uu.Host != host {
|
2023-02-21 17:58:16 +08:00
|
|
|
bl.Host = host
|
|
|
|
}
|
2023-04-25 17:28:08 +08:00
|
|
|
} else {
|
|
|
|
bl.IsValid = false
|
2023-04-25 17:33:07 +08:00
|
|
|
bl.Reason = ErrUrlError.Error()
|
|
|
|
bl.ErrString = err.Error()
|
2023-01-10 23:44:03 +08:00
|
|
|
}
|
2023-02-08 12:58:56 +08:00
|
|
|
bl.Unique = UniqueHash(bl)
|
2022-09-23 01:47:24 +08:00
|
|
|
return bl
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
|
|
|
|
2022-11-10 21:03:07 +08:00
|
|
|
func NewInvalidBaseline(u, host string, resp *ihttp.Response, reason string) *Baseline {
|
|
|
|
bl := &Baseline{
|
2023-02-01 18:31:50 +08:00
|
|
|
SprayResult: &parsers.SprayResult{
|
|
|
|
UrlString: u,
|
|
|
|
Status: resp.StatusCode(),
|
|
|
|
IsValid: false,
|
|
|
|
Reason: reason,
|
|
|
|
},
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
|
|
|
|
2023-01-10 23:44:03 +08:00
|
|
|
// 无效数据也要读取body, 否则keep-alive不生效
|
|
|
|
resp.Body()
|
|
|
|
bl.BodyLength = resp.ContentLength()
|
|
|
|
bl.RedirectURL = string(resp.GetHeader("Location"))
|
|
|
|
|
2023-01-12 16:35:34 +08:00
|
|
|
bl.Dir = bl.IsDir()
|
2022-11-09 16:05:17 +08:00
|
|
|
uu, err := url.Parse(u)
|
|
|
|
if err == nil {
|
|
|
|
bl.Path = uu.Path
|
2022-11-29 20:50:00 +08:00
|
|
|
bl.Url = uu
|
2023-01-12 16:35:34 +08:00
|
|
|
} else {
|
2023-01-10 23:44:03 +08:00
|
|
|
return bl
|
2022-11-09 16:05:17 +08:00
|
|
|
}
|
|
|
|
|
2023-01-10 01:08:42 +08:00
|
|
|
if bl.Url.Host != host {
|
2022-10-27 23:40:15 +08:00
|
|
|
bl.Host = host
|
|
|
|
}
|
|
|
|
|
2022-09-08 15:57:17 +08:00
|
|
|
return bl
|
|
|
|
}
|
|
|
|
|
2022-11-10 21:03:07 +08:00
|
|
|
type Baseline struct {
|
2023-02-01 18:31:50 +08:00
|
|
|
*parsers.SprayResult
|
2023-02-08 12:58:56 +08:00
|
|
|
Unique uint16 `json:"-"`
|
2023-02-01 18:31:50 +08:00
|
|
|
Url *url.URL `json:"-"`
|
|
|
|
Dir bool `json:"-"`
|
|
|
|
Chunked bool `json:"-"`
|
2023-06-03 21:09:01 +08:00
|
|
|
Body BS `json:"-"`
|
|
|
|
Header BS `json:"-"`
|
|
|
|
Raw BS `json:"-"`
|
2023-02-01 18:31:50 +08:00
|
|
|
Recu bool `json:"-"`
|
|
|
|
RecuDepth int `json:"-"`
|
|
|
|
URLs []string `json:"-"`
|
2023-02-04 19:44:37 +08:00
|
|
|
Collected bool `json:"-"`
|
2023-05-04 12:04:59 +08:00
|
|
|
Retry int `json:"-"`
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
|
|
|
|
2022-12-11 00:24:28 +08:00
|
|
|
func (bl *Baseline) IsDir() bool {
|
|
|
|
if strings.HasSuffix(bl.Path, "/") {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2022-09-23 01:20:01 +08:00
|
|
|
// Collect 深度收集信息
|
2022-11-10 21:03:07 +08:00
|
|
|
func (bl *Baseline) Collect() {
|
2023-02-19 22:27:16 +08:00
|
|
|
if bl.ContentType == "html" || bl.ContentType == "json" || bl.ContentType == "txt" {
|
|
|
|
// 指纹库设计的时候没考虑js,css文件的指纹, 跳过非必要的指纹收集减少误报提高性能
|
2023-03-27 15:20:57 +08:00
|
|
|
bl.Frameworks = FingerDetect(bl.Raw)
|
2023-02-19 22:27:16 +08:00
|
|
|
}
|
|
|
|
|
2022-11-10 17:36:09 +08:00
|
|
|
if len(bl.Body) > 0 {
|
2023-01-09 13:41:59 +08:00
|
|
|
if bl.ContentType == "html" {
|
2023-03-27 15:20:57 +08:00
|
|
|
bl.Title = iutils.AsciiEncode(parsers.MatchTitle(bl.Body))
|
2023-01-09 13:41:59 +08:00
|
|
|
} else if bl.ContentType == "ico" {
|
2023-12-28 14:34:19 +08:00
|
|
|
if name, ok := Md5Fingers[encode.Md5Hash(bl.Body)]; ok {
|
2023-02-01 18:31:50 +08:00
|
|
|
bl.Frameworks[name] = &parsers.Framework{Name: name}
|
2023-12-28 14:34:19 +08:00
|
|
|
} else if name, ok := Mmh3Fingers[encode.Mmh3Hash32(bl.Body)]; ok {
|
2023-02-01 18:31:50 +08:00
|
|
|
bl.Frameworks[name] = &parsers.Framework{Name: name}
|
2023-01-09 13:41:59 +08:00
|
|
|
}
|
|
|
|
}
|
2022-11-10 17:36:09 +08:00
|
|
|
}
|
2023-01-09 13:41:59 +08:00
|
|
|
|
2022-12-11 01:21:05 +08:00
|
|
|
bl.Hashes = parsers.NewHashes(bl.Raw)
|
2022-11-10 21:03:07 +08:00
|
|
|
bl.Extracteds = Extractors.Extract(string(bl.Raw))
|
2023-02-08 12:58:56 +08:00
|
|
|
bl.Unique = UniqueHash(bl)
|
2022-09-23 01:20:01 +08:00
|
|
|
}
|
|
|
|
|
2023-01-03 17:09:32 +08:00
|
|
|
func (bl *Baseline) CollectURL() {
|
2023-02-04 19:44:37 +08:00
|
|
|
if bl.Collected {
|
|
|
|
// 防止重复收集
|
|
|
|
return
|
|
|
|
} else {
|
|
|
|
bl.Collected = true
|
|
|
|
}
|
|
|
|
|
2023-01-03 17:09:32 +08:00
|
|
|
if len(bl.Body) == 0 {
|
|
|
|
return
|
|
|
|
}
|
2023-02-08 15:30:40 +08:00
|
|
|
for _, reg := range ExtractRegexps["js"][0].CompiledRegexps {
|
2023-01-03 17:09:32 +08:00
|
|
|
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
|
|
|
for _, u := range urls {
|
2023-01-10 00:58:16 +08:00
|
|
|
u[1] = formatURL(u[1])
|
2023-01-10 23:44:03 +08:00
|
|
|
if u[1] != "" && !filterJs(u[1]) {
|
2023-01-10 00:58:16 +08:00
|
|
|
bl.URLs = append(bl.URLs, u[1])
|
2023-01-03 17:09:32 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-08 15:30:40 +08:00
|
|
|
for _, reg := range ExtractRegexps["url"][0].CompiledRegexps {
|
2023-01-03 17:09:32 +08:00
|
|
|
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
|
|
|
|
for _, u := range urls {
|
2023-01-10 00:58:16 +08:00
|
|
|
u[1] = formatURL(u[1])
|
2023-01-10 23:44:03 +08:00
|
|
|
if u[1] != "" && !filterUrl(u[1]) {
|
2023-01-10 00:58:16 +08:00
|
|
|
bl.URLs = append(bl.URLs, u[1])
|
2023-01-03 17:09:32 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-01-06 03:31:28 +08:00
|
|
|
|
2023-01-11 11:26:03 +08:00
|
|
|
bl.URLs = RemoveDuplication(bl.URLs)
|
2023-01-04 11:18:03 +08:00
|
|
|
if bl.URLs != nil {
|
2023-01-28 13:15:49 +08:00
|
|
|
bl.Extracteds = append(bl.Extracteds, &parsers.Extracted{
|
2023-01-04 11:18:03 +08:00
|
|
|
Name: "crawl",
|
2023-01-11 11:26:03 +08:00
|
|
|
ExtractResult: bl.URLs,
|
2023-01-04 11:18:03 +08:00
|
|
|
})
|
|
|
|
}
|
2023-01-03 17:09:32 +08:00
|
|
|
}
|
|
|
|
|
2022-11-11 01:12:35 +08:00
|
|
|
// Compare
|
|
|
|
// if totally equal return 1
|
|
|
|
// if maybe equal return 0
|
|
|
|
// not equal return -1
|
2022-11-10 21:03:07 +08:00
|
|
|
func (bl *Baseline) Compare(other *Baseline) int {
|
2022-09-23 01:20:01 +08:00
|
|
|
if other.RedirectURL != "" && bl.RedirectURL == other.RedirectURL {
|
2022-11-09 16:05:17 +08:00
|
|
|
// 如果重定向url不为空, 且与base不相同, 则说明不是同一个页面
|
|
|
|
return 1
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
|
|
|
|
2022-12-11 03:52:06 +08:00
|
|
|
if bl.BodyLength == other.BodyLength {
|
2022-09-23 01:20:01 +08:00
|
|
|
// 如果body length相等且md5相等, 则说明是同一个页面
|
2022-12-11 01:21:05 +08:00
|
|
|
if bytes.Equal(bl.Body, other.Body) {
|
2022-11-09 16:05:17 +08:00
|
|
|
// 如果length相等, md5也相等, 则判断为全同
|
|
|
|
return 1
|
|
|
|
} else {
|
|
|
|
// 如果长度相等, 但是md5不相等, 可能是存在csrftoken之类的随机值
|
|
|
|
return 0
|
|
|
|
}
|
2022-12-11 03:52:06 +08:00
|
|
|
} else if i := bl.BodyLength - other.BodyLength; (i < 16 && i > 0) || (i > -16 && i < 0) {
|
|
|
|
// 如果body length绝对值小于16, 则可能是存在csrftoken之类的随机值, 需要模糊判断
|
|
|
|
return 0
|
2022-11-09 16:05:17 +08:00
|
|
|
} else {
|
2022-12-11 03:52:06 +08:00
|
|
|
// 如果body length绝对值大于16, 则认为大概率存在较大差异
|
2022-11-09 16:05:17 +08:00
|
|
|
if strings.Contains(string(other.Body), other.Path) {
|
2022-11-11 01:12:35 +08:00
|
|
|
// 如果包含路径本身, 可能是路径自身的随机值影响结果
|
2022-11-09 16:05:17 +08:00
|
|
|
return 0
|
|
|
|
} else {
|
2022-12-11 03:52:06 +08:00
|
|
|
// 如果不包含路径本身, 则认为是不同页面
|
2022-11-09 16:05:17 +08:00
|
|
|
return -1
|
2022-09-23 01:20:01 +08:00
|
|
|
}
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
2022-11-09 16:05:17 +08:00
|
|
|
return -1
|
2022-09-08 15:57:17 +08:00
|
|
|
}
|
|
|
|
|
2023-01-09 14:47:58 +08:00
|
|
|
// Distance is the simhash distance threshold used by FuzzyCompare. The smaller
// the number, the more similar; 0 means exactly identical.
var Distance uint8 = 5
|
2022-11-10 21:26:07 +08:00
|
|
|
|
2022-11-10 21:03:07 +08:00
|
|
|
func (bl *Baseline) FuzzyCompare(other *Baseline) bool {
|
2023-01-09 14:47:58 +08:00
|
|
|
// 这里使用rawsimhash, 是为了保证一定数量的字符串, 否则超短的body会导致simhash偏差指较大
|
2023-12-28 14:34:19 +08:00
|
|
|
if other.Distance = encode.SimhashCompare(other.RawSimhash, bl.RawSimhash); other.Distance < Distance {
|
2022-11-10 21:03:07 +08:00
|
|
|
return true
|
|
|
|
}
|
2022-09-08 15:57:17 +08:00
|
|
|
return false
|
|
|
|
}
|