Initial implementation of a simple crawler

This commit is contained in:
M09Ic 2023-01-03 17:09:32 +08:00
parent 494ce9414a
commit 9e9b0de039
7 changed files with 243 additions and 77 deletions

View File

@ -32,6 +32,7 @@ type InputOptions struct {
ResumeFrom string `long:"resume-from"`
URL string `short:"u" long:"url" description:"String, input baseurl (separated by commas), e.g.: http://google.com, http://baidu.com"`
URLFile string `short:"l" long:"list" description:"File, input filename"`
Raw string `long:"raw" description:"File, input raw request filename"`
Offset int `long:"offset" description:"Int, wordlist offset"`
Limit int `long:"limit" description:"Int, wordlist limit, start with offset. e.g.: --offset 1000 --limit 100"`
Dictionaries []string `short:"d" long:"dict" description:"Files, dict files, e.g.: -d 1.txt -d 2.txt"`
@ -77,6 +78,8 @@ type ModeOptions struct {
CheckOnly bool `long:"check-only" description:"Bool, check only"`
Recursive string `long:"recursive" default:"current.IsDir()" description:"String, custom recursive rule, e.g.: --recursive current.IsDir()"`
Depth int `long:"depth" default:"0" description:"Int, recursive depth"`
Crawl bool `long:"crawl" description:"Bool, enable crawl"`
CrawlDepth int `long:"spider-depth" default:"3" description:"Int, crawl depth"`
CheckPeriod int `long:"check-period" default:"200" description:"Int, check period during requests"`
ErrPeriod int `long:"error-period" default:"10" description:"Int, check period on errors"`
BreakThreshold int `long:"error-threshold" default:"20" description:"Int, break when the error count exceeds the threshold"`
@ -123,6 +126,7 @@ func (opt *Option) PrepareRunner() (*Runner, error) {
CheckPeriod: opt.CheckPeriod,
ErrPeriod: opt.ErrPeriod,
BreakThreshold: opt.BreakThreshold,
Crawl: opt.Crawl,
}
err = pkg.LoadTemplates()
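For context, the new flags would be used roughly like this (an illustrative invocation; the target URL and dict file are placeholders):

    spray -u http://example.com -d dict.txt --crawl --spider-depth 2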

View File

@ -20,13 +20,12 @@ import (
)
var (
CheckRedirect func(string) bool
max = 2147483647
maxRedirect = 3
maxCrawl = 3
maxRecursion = 0
)
var max = 2147483647
var maxRedirect = 3
var maxRecuDepth = 0
func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
pctx, cancel := context.WithCancel(ctx)
pool := &Pool{
@ -35,8 +34,10 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
cancel: cancel,
client: ihttp.NewClient(config.Thread, 2, config.ClientType),
baselines: make(map[int]*pkg.Baseline),
urls: make(map[string]int),
tempCh: make(chan *pkg.Baseline, config.Thread),
checkCh: make(chan sourceType),
additionCh: make(chan *Unit, 100),
wg: sync.WaitGroup{},
initwg: sync.WaitGroup{},
reqCount: 1,
@ -80,7 +81,7 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/")
bl.RedirectURL = pool.BaseURL + bl.RedirectURL
}
pool.addRedirect(bl, unit.reCount)
pool.doRedirect(bl, unit.depth)
}
pool.addFuzzyBaseline(bl)
} else {
@ -89,14 +90,17 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
}
}
bl.ReqDepth = unit.depth
bl.Spended = time.Since(start).Milliseconds()
switch unit.source {
case InitRandomSource:
pool.random = bl
pool.addFuzzyBaseline(bl)
pool.doCrawl(bl)
pool.initwg.Done()
case InitIndexSource:
pool.index = bl
pool.doCrawl(bl)
pool.initwg.Done()
case CheckSource:
if bl.ErrString != "" {
@ -122,15 +126,17 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
pool.reqCount++
if pool.reqCount%pool.CheckPeriod == 0 {
pool.reqCount++
pool.check()
pool.doCheck()
} else if pool.failedCount%pool.ErrPeriod == 0 {
pool.failedCount++
pool.check()
pool.doCheck()
}
pool.bar.Done()
case RedirectSource:
bl.FrontURL = unit.frontUrl
pool.tempCh <- bl
case CrawlSource:
pool.tempCh <- bl
}
})
@ -184,11 +190,14 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) {
}
// to recurse: bl must be valid, mod must be path-spray, and the current depth must be below the max recursion depth
if bl.IsValid && pool.Mod == pkg.PathSpray && bl.RecuDepth < maxRecuDepth {
if bl.IsValid {
pool.doCrawl(bl)
if bl.RecuDepth < maxRecursion {
if CompareWithExpr(pool.RecuExpr, params) {
bl.Recu = true
}
}
}
pool.OutputCh <- bl
pool.wg.Done()
}
@ -207,7 +216,8 @@ type Pool struct {
ctx context.Context
cancel context.CancelFunc
tempCh chan *pkg.Baseline // baselines waiting to be processed
checkCh chan sourceType
checkCh chan sourceType // dedicated check channel, avoids conflicts with redirect/crawl
additionCh chan *Unit
reqCount int
failedCount int
isFailed bool
@ -215,6 +225,7 @@ type Pool struct {
random *pkg.Baseline
index *pkg.Baseline
baselines map[int]*pkg.Baseline
urls map[string]int
analyzeDone bool
worder *words.Worder
locker sync.Mutex
@ -253,8 +264,10 @@ func (pool *Pool) Init() error {
}
}
if pool.random.RedirectURL != "" {
CheckRedirect = func(redirectURL string) bool {
return nil
}
func (pool *Pool) checkRedirect(redirectURL string) bool {
if redirectURL == pool.random.RedirectURL {
// an identical RedirectURL is treated as invalid data
return false
@ -263,43 +276,6 @@ func (pool *Pool) Init() error {
return true
}
}
}
return nil
}
func (pool *Pool) addRedirect(bl *pkg.Baseline, reCount int) {
if reCount >= maxRedirect {
return
}
if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() {
pool.wg.Add(1)
_ = pool.reqPool.Invoke(&Unit{
number: bl.Number,
path: uu.Path,
source: RedirectSource,
frontUrl: bl.UrlString,
reCount: reCount + 1,
})
}
}
func (pool *Pool) check() {
if pool.failedCount > pool.BreakThreshold {
// end the task once the error count exceeds the threshold
pool.recover()
pool.cancel()
pool.isFailed = true
return
}
if pool.Mod == pkg.HostSpray {
pool.checkCh <- CheckSource
} else if pool.Mod == pkg.PathSpray {
pool.checkCh <- CheckSource
}
}
func (pool *Pool) genReq(s string) (*ihttp.Request, error) {
if pool.Mod == pkg.HostSpray {
@ -311,6 +287,11 @@ func (pool *Pool) genReq(s string) (*ihttp.Request, error) {
}
func (pool *Pool) Run(ctx context.Context, offset, limit int) {
pool.worder.RunWithRules()
go func() {
for unit := range pool.additionCh {
pool.reqPool.Invoke(unit)
}
}()
Loop:
for {
select {
@ -340,13 +321,16 @@ Loop:
} else if pool.Mod == pkg.PathSpray {
pool.reqPool.Invoke(newUnitWithNumber(pkg.RandPath(), source, pool.Statistor.End))
}
case <-ctx.Done():
break Loop
case <-pool.ctx.Done():
break Loop
}
}
for len(pool.additionCh) > 0 {
time.Sleep(time.Second)
}
pool.wg.Wait()
pool.Statistor.EndTime = time.Now().Unix()
pool.Close()
@ -370,7 +354,7 @@ func (pool *Pool) PreCompare(resp *ihttp.Response) error {
return ErrWaf
}
if CheckRedirect != nil && !CheckRedirect(resp.GetHeader("Location")) {
if !pool.checkRedirect(resp.GetHeader("Location")) {
return ErrRedirect
}
@ -417,7 +401,7 @@ func (pool *Pool) BaseCompare(bl *pkg.Baseline) bool {
if ok && status == 0 && base.FuzzyCompare(bl) {
pool.Statistor.FuzzyNumber++
bl.Reason = ErrFuzzyCompareFailed.Error()
pool.PutToFuzzy(bl)
pool.putToFuzzy(bl)
return false
}
@ -437,6 +421,77 @@ func CompareWithExpr(exp *vm.Program, params map[string]interface{}) bool {
}
}
func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) {
if depth >= maxRedirect {
return
}
if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() {
pool.wg.Add(1)
pool.additionCh <- &Unit{
path: uu.Path,
source: RedirectSource,
frontUrl: bl.UrlString,
depth: depth + 1,
}
}
}
func (pool *Pool) doCrawl(bl *pkg.Baseline) {
bl.CollectURL()
for _, u := range bl.URLs {
if strings.HasPrefix(u, "//") {
u = bl.Url.Scheme + u
} else if strings.HasPrefix(u, "/") {
// join absolute path
u = pkg.URLJoin(pool.BaseURL, u)
} else if !strings.HasPrefix(u, "http") {
// join relative path
u = pkg.URLJoin(pool.BaseURL, u)
}
if _, ok := pool.urls[u]; ok {
pool.urls[u]++
} else {
// dedupe via the map; only new URLs fall through to this branch
pool.urls[u] = 1
if bl.ReqDepth < maxCrawl {
parsed, err := url.Parse(u)
if err != nil {
continue
}
if parsed.Host != bl.Url.Host {
// automatically restrict the scope to avoid crawling other sites
continue
}
pool.wg.Add(1)
pool.additionCh <- &Unit{
path: parsed.Path,
source: CrawlSource,
frontUrl: bl.UrlString,
depth: bl.ReqDepth + 1,
}
}
}
}
}
func (pool *Pool) doCheck() {
if pool.failedCount > pool.BreakThreshold {
// end the task once the error count exceeds the threshold
pool.recover()
pool.cancel()
pool.isFailed = true
return
}
if pool.Mod == pkg.HostSpray {
pool.checkCh <- CheckSource
} else if pool.Mod == pkg.PathSpray {
pool.checkCh <- CheckSource
}
}
func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
if _, ok := pool.baselines[bl.Status]; !ok && IntsContains(FuzzyStatus, bl.Status) {
bl.Collect()
@ -447,12 +502,12 @@ func (pool *Pool) addFuzzyBaseline(bl *pkg.Baseline) {
}
}
func (pool *Pool) PutToInvalid(bl *pkg.Baseline, reason string) {
func (pool *Pool) putToInvalid(bl *pkg.Baseline, reason string) {
bl.IsValid = false
pool.OutputCh <- bl
}
func (pool *Pool) PutToFuzzy(bl *pkg.Baseline) {
func (pool *Pool) putToFuzzy(bl *pkg.Baseline) {
bl.IsFuzzy = true
pool.FuzzyCh <- bl
}
@ -474,5 +529,6 @@ func (pool *Pool) Close() {
time.Sleep(time.Duration(100) * time.Millisecond)
}
close(pool.tempCh)
close(pool.additionCh)
pool.bar.Close()
}
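A minimal standalone sketch of the addition-channel pattern used above (names here are hypothetical, not the spray API): doCrawl/doRedirect call wg.Add(1) before sending a Unit into the buffered additionCh, a goroutine in Run feeds the channel into the worker pool, and Run spins until the channel drains before wg.Wait and Close.

package main

import (
	"fmt"
	"sync"
	"time"
)

type unit struct {
	path  string
	depth int
}

const maxDepth = 3 // plays the role of maxCrawl/maxRedirect

func main() {
	var wg sync.WaitGroup
	additionCh := make(chan *unit, 100) // buffered, like pool.additionCh

	// a worker may enqueue follow-up units, mirroring doCrawl/doRedirect:
	// wg.Add(1) happens before the send, wg.Done() when the unit is finished
	handle := func(u *unit) {
		defer wg.Done()
		fmt.Println("handled", u.path, "depth", u.depth)
		if u.depth < maxDepth {
			wg.Add(1)
			additionCh <- &unit{path: u.path + "/sub", depth: u.depth + 1}
		}
	}

	// drain goroutine, like the one started at the top of Pool.Run
	go func() {
		for u := range additionCh {
			go handle(u)
		}
	}()

	wg.Add(1)
	additionCh <- &unit{path: "/index", depth: 0}

	// like the end of Pool.Run: wait for the queue to empty, then for in-flight work
	for len(additionCh) > 0 {
		time.Sleep(10 * time.Millisecond)
	}
	wg.Wait()
	close(additionCh) // like Pool.Close
}

Because a parent unit calls wg.Add(1) for its child before its own deferred wg.Done() runs, the counter never reaches zero while follow-up work is still queued, which is what makes the close after wg.Wait safe.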

View File

@ -72,6 +72,7 @@ type Runner struct {
CheckOnly bool
Force bool
IgnoreWaf bool
Crawl bool
}
func (r *Runner) PrepareConfig() *pkg.Config {
@ -90,6 +91,7 @@ func (r *Runner) PrepareConfig() *pkg.Config {
FilterExpr: r.FilterExpr,
RecuExpr: r.RecursiveExpr,
IgnoreWaf: r.IgnoreWaf,
Crawl: r.Crawl,
}
if config.Mod == pkg.PathSpray {
config.ClientType = ihttp.FAST

View File

@ -51,6 +51,7 @@ const (
InitRandomSource
InitIndexSource
RedirectSource
CrawlSource
WordSource
WafSource
)
@ -60,15 +61,14 @@ func newUnit(path string, source sourceType) *Unit {
}
func newUnitWithNumber(path string, source sourceType, number int) *Unit {
return &Unit{number: number, path: path, source: source}
return &Unit{path: path, source: source}
}
type Unit struct {
number int
path string
source sourceType
frontUrl string
reCount int // redirect number
depth int // redirect depth
}
type Task struct {

View File

@ -8,6 +8,7 @@ import (
"github.com/chainreactors/parsers"
"github.com/chainreactors/spray/pkg/ihttp"
"net/url"
"path"
"strconv"
"strings"
)
@ -84,7 +85,9 @@ type Baseline struct {
Reason string `json:"reason"`
IsValid bool `json:"valid"`
IsFuzzy bool `json:"fuzzy"`
URLs []string `json:"urls"`
RecuDepth int `json:"-"`
ReqDepth int `json:"depth"`
Recu bool `json:"-"`
*parsers.Hashes
}
@ -106,6 +109,64 @@ func (bl *Baseline) Collect() {
bl.Frameworks = FingerDetect(string(bl.Raw))
}
func (bl *Baseline) CollectURL() {
if len(bl.Body) == 0 {
return
}
for _, reg := range JSRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls {
var filter bool
parsed, err := url.Parse(u[1])
if err != nil {
filter = true
} else {
for _, scoop := range BadScoop {
if scoop == parsed.Host {
filter = true
break
}
}
}
if filter {
continue
}
bl.URLs = append(bl.URLs, u[1])
}
}
for _, reg := range URLRegexps {
urls := reg.FindAllStringSubmatch(string(bl.Body), -1)
for _, u := range urls {
var filter bool
parsed, err := url.Parse(u[1])
if err != nil {
filter = true
} else {
ext := path.Ext(parsed.Path)
for _, e := range BadExt {
if e == ext {
filter = true
break
}
}
for _, scoop := range BadScoop {
if scoop == parsed.Host {
filter = true
break
}
}
}
if filter {
continue
}
bl.URLs = append(bl.URLs, u[1])
}
}
}
// Compare
// if totally equal return 1
// if maybe equal return 0
@ -186,6 +247,8 @@ func (bl *Baseline) Get(key string) string {
return bl.Extracteds.String()
case "frame", "framework":
return bl.Frameworks.String()
case "full":
return bl.String()
default:
return ""
}
@ -256,9 +319,9 @@ func (bl *Baseline) ColorString() string {
line.WriteString(" - ")
line.WriteString(logs.GreenBold(strconv.Itoa(bl.Status)))
line.WriteString(" - ")
line.WriteString(logs.Blue(strconv.Itoa(bl.BodyLength)))
line.WriteString(logs.YellowBold(strconv.Itoa(bl.BodyLength)))
line.WriteString(" - ")
line.WriteString(logs.Blue(strconv.Itoa(int(bl.Spended)) + "ms"))
line.WriteString(logs.YellowBold(strconv.Itoa(int(bl.Spended)) + "ms"))
line.WriteString(logs.GreenLine(bl.Additional("title")))
line.WriteString(logs.Blue(bl.Frameworks.String()))
line.WriteString(logs.Blue(bl.Extracteds.String()))
@ -267,6 +330,12 @@ func (bl *Baseline) ColorString() string {
line.WriteString(logs.CyanLine(bl.RedirectURL))
line.WriteString(" ")
}
if len(bl.URLs) > 0 {
line.WriteString("\n")
}
for _, u := range bl.URLs {
line.WriteString("\t" + u + "\n")
}
return line.String()
}
@ -308,6 +377,12 @@ func (bl *Baseline) String() string {
line.WriteString(bl.RedirectURL)
line.WriteString(" ")
}
if len(bl.URLs) > 0 {
line.WriteString("\n")
}
for _, u := range bl.URLs {
line.WriteString("\t" + u + "\n")
}
return line.String()
}
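CollectURL reduces to: run each regexp over the body, url.Parse every capture, and drop candidates whose extension is in BadExt or whose host is in BadScoop. A condensed self-contained sketch of that filtering, with a deliberately simplified regexp (not one of the pkg patterns):

package main

import (
	"fmt"
	"net/url"
	"path"
	"regexp"
)

// hrefRe is a simplified stand-in for the JSRegexps/URLRegexps in pkg
var hrefRe = regexp.MustCompile(`href\s*=\s*"([^"]+)"`)

var badExt = []string{".css", ".png", ".ico"}       // abbreviated BadExt
var badHost = []string{"www.w3.org", "example.com"} // BadScoop-style denylist

func collect(body string) (out []string) {
	for _, m := range hrefRe.FindAllStringSubmatch(body, -1) {
		u := m[1]
		parsed, err := url.Parse(u)
		if err != nil {
			continue // unparseable candidates are dropped, as in CollectURL
		}
		skip := false
		for _, e := range badExt {
			if path.Ext(parsed.Path) == e {
				skip = true
			}
		}
		for _, h := range badHost {
			if parsed.Host == h {
				skip = true
			}
		}
		if !skip {
			out = append(out, u)
		}
	}
	return
}

func main() {
	body := `<a href="/admin/login">a</a> <a href="/static/app.css">b</a> <a href="http://www.w3.org/1999/xhtml">c</a>`
	fmt.Println(collect(body)) // [/admin/login]
}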

View File

@ -38,4 +38,5 @@ type Config struct {
FuzzyCh chan *Baseline
Fuzzy bool
IgnoreWaf bool
Crawl bool
}

View File

@ -1,17 +1,35 @@
package pkg
import (
"fmt"
"github.com/chainreactors/gogo/v2/pkg/fingers"
"github.com/chainreactors/gogo/v2/pkg/utils"
"github.com/chainreactors/ipcs"
"github.com/go-dedup/simhash"
"math/rand"
"os"
"regexp"
"strings"
"time"
"unsafe"
)
var (
Md5Fingers map[string]string = make(map[string]string)
Mmh3Fingers map[string]string = make(map[string]string)
ActivePath []string
Fingers fingers.Fingers
JSRegexps []*regexp.Regexp = []*regexp.Regexp{
regexp.MustCompile(".(https{0,1}:[^\\s,^',^,^\",^”,^>,^<,^;,^(,^),^|,^*,^\\[]{2,250}?[^=,^*,^\\s,^',^,^\",^”,^>,^<,^:,^;,^*,^|,^(,^),^\\[]{3}[.]js)"),
regexp.MustCompile("[\",',,“]\\s{0,6}(/{0,1}[^\\s,^',^,^\",^”,^|,^>,^<,^:,^;,^*,^(,^\\),^\\[]{2,250}?[^=,^*,^\\s,^',^,^|,^\",^”,^>,^<,^:,^;,^*,^(,^),^\\[]{3}[.]js)"),
regexp.MustCompile("=\\s{0,6}[\",',,”]{0,1}\\s{0,6}(/{0,1}[^\\s,^',^,^\",^”,^|,^>,^<,^;,^*,^(,^),^\\[]{2,250}?[^=,^*,^\\s,^',^,^\",^”,^>,^|,^<,^:,^;,^*,^(,^),^\\[]{3}[.]js)"),
}
URLRegexps []*regexp.Regexp = []*regexp.Regexp{
regexp.MustCompile("[\",',,“]\\s{0,6}(https{0,1}:[^\\s,^',^,^\",^”,^>,^<,^),^(]{2,250}?)\\s{0,6}[\",',,“]"),
regexp.MustCompile("=\\s{0,6}(https{0,1}:[^\\s,^',^,^\",^”,^>,^<,^),^(]{2,250})"),
regexp.MustCompile("[\",',,“]\\s{0,6}([#,.]{0,2}/[^\\s,^',^,^\",^”,^>,^<,^:,^),^(]{2,250}?)\\s{0,6}[\",',,“]"),
regexp.MustCompile("href\\s{0,6}=\\s{0,6}[\",',,“]{0,1}\\s{0,6}([^\\s,^',^,^\",^“,^>,^<,^,^+),^(]{2,250})|action\\s{0,6}=\\s{0,6}[\",',,“]{0,1}\\s{0,6}([^\\s,^',^,^\",^“,^>,^<,^,^+),^(]{2,250})"),
}
)
func HasStdin() bool {
stat, err := os.Stdin.Stat()
if err != nil {
@ -24,11 +42,6 @@ func HasStdin() bool {
return isPipedFromChrDev || isPipedFromFIFO
}
func Simhash(raw []byte) string {
sh := simhash.NewSimhash()
return fmt.Sprintf("%x", sh.GetSimhash(sh.NewWordFeatureSet(raw)))
}
const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
var src = rand.NewSource(time.Now().UnixNano())
@ -80,12 +93,6 @@ func RandHost() string {
return *(*string)(unsafe.Pointer(&b))
}
var (
Md5Fingers map[string]string = make(map[string]string)
Mmh3Fingers map[string]string = make(map[string]string)
Fingers fingers.Fingers
)
func LoadTemplates() error {
var err error
Fingers, err = fingers.LoadFingers(LoadConfig("http"))
@ -102,6 +109,9 @@ func LoadTemplates() error {
for _, f := range Fingers {
for _, rule := range f.Rules {
if rule.SendDataStr != "" {
ActivePath = append(ActivePath, rule.SendDataStr)
}
if rule.Favicon != nil {
for _, mmh3 := range rule.Favicon.Mmh3 {
Mmh3Fingers[mmh3] = f.Name
@ -127,3 +137,21 @@ func FingerDetect(content string) Frameworks {
}
return frames
}
var (
BadExt = []string{".js", ".css", ".scss", ",", ".jpeg", ".jpg", ".png", ".gif", ".ico", ".svg", ".vue", ".ts"}
//BadURL = []string{".js?", ".css?", ".jpeg?", ".jpg?", ".png?", ".gif?", "github.com", "www.w3.org", "example.com", "<", ">", "{", "}", "[", "]", "|", "^", ";", "/js/", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*", "\\n"}
BadScoop = []string{"www.w3.org", "example.com"}
)
func URLJoin(base, uri string) string {
baseSlash := strings.HasSuffix(base, "/")
uriSlash := strings.HasPrefix(uri, "/")
if (baseSlash && !uriSlash) || (!baseSlash && uriSlash) {
return base + uri
} else if baseSlash && uriSlash {
return base + uri[1:]
} else {
return base + "/" + uri
}
}
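URLJoin normalizes the joint so that every trailing/leading-slash combination yields exactly one separator; a quick check, assuming the package is imported as github.com/chainreactors/spray/pkg and with example.com as a placeholder host:

package main

import (
	"fmt"

	"github.com/chainreactors/spray/pkg"
)

func main() {
	// all four slash combinations join with a single "/"
	fmt.Println(pkg.URLJoin("http://example.com/", "path"))  // http://example.com/path
	fmt.Println(pkg.URLJoin("http://example.com", "/path"))  // http://example.com/path
	fmt.Println(pkg.URLJoin("http://example.com/", "/path")) // http://example.com/path
	fmt.Println(pkg.URLJoin("http://example.com", "path"))   // http://example.com/path
}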