diff --git a/internal/option.go b/internal/option.go index c9a7573..f7b3727 100644 --- a/internal/option.go +++ b/internal/option.go @@ -86,6 +86,7 @@ type ModeOptions struct { Recursive string `long:"recursive" default:"current.IsDir()" description:"String,custom recursive rule, e.g.: --recursive current.IsDir()"` Depth int `long:"depth" default:"0" description:"Int, recursive depth"` CrawlDepth int `long:"crawl-depth" default:"3" description:"Int, crawl depth"` + CrawlScope string `long:"crawl-scope" description:"Int, crawl scope (todo)"` CheckPeriod int `long:"check-period" default:"200" description:"Int, check period when request"` ErrPeriod int `long:"error-period" default:"10" description:"Int, check period when error"` BreakThreshold int `long:"error-threshold" default:"20" description:"Int, break when the error exceeds the threshold "` @@ -225,15 +226,16 @@ func (opt *Option) PrepareRunner() (*Runner, error) { logs.Log.Importantf("Loaded %d word from %s", len(dicts[i]), f) } - if len(opt.Dictionaries) == 0 && opt.Word == "" { - // 用来仅使用高级功能下, 防止无字典报错. - opt.Word = "/" - } else { - opt.Word = "{?" - for i, _ := range dicts { - opt.Word += strconv.Itoa(i) + if opt.Word == "" { + if len(opt.Dictionaries) == 0 { + opt.Word = "/" + } else { + opt.Word = "{?" 
+ for i, _ := range dicts { + opt.Word += strconv.Itoa(i) + } + opt.Word += "}" } - opt.Word += "}" } if opt.Suffixes != nil { diff --git a/internal/pool.go b/internal/pool.go index 16e27fc..0962e82 100644 --- a/internal/pool.go +++ b/internal/pool.go @@ -39,6 +39,8 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { pctx, cancel := context.WithCancel(ctx) pool := &Pool{ Config: config, + base: u.Scheme + "://" + u.Hostname(), + isDir: strings.HasSuffix(config.BaseURL, "/"), url: u, ctx: pctx, cancel: cancel, @@ -54,6 +56,15 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { failedCount: 1, } + // 格式化dir, 保证至少有一个"/" + if pool.isDir { + pool.dir = pool.url.Path + } else if pool.url.Path == "" { + pool.dir = "/" + } else { + pool.dir = Dir(pool.url.Path) + } + p, _ := ants.NewPoolWithFunc(config.Thread, pool.Invoke) pool.reqPool = p @@ -130,6 +141,9 @@ func NewPool(ctx context.Context, config *pkg.Config) (*Pool, error) { type Pool struct { *pkg.Config + base string // url的根目录, 在爬虫或者redirect时, 会需要用到根目录进行拼接 + isDir bool // url是否以/结尾 + dir string url *url.URL Statistor *pkg.Statistor client *ihttp.Client @@ -159,7 +173,7 @@ func (pool *Pool) Init() error { // 分成两步是为了避免闭包的线程安全问题 pool.initwg.Add(2) pool.reqPool.Invoke(newUnit("", InitIndexSource)) - pool.reqPool.Invoke(newUnit(pkg.RandPath(), InitRandomSource)) + pool.reqPool.Invoke(newUnit(pool.safePath(pkg.RandPath()), InitRandomSource)) pool.initwg.Wait() if pool.index.ErrString != "" { return fmt.Errorf(pool.index.String()) @@ -206,7 +220,7 @@ func (pool *Pool) genReq(s string) (*ihttp.Request, error) { if pool.Mod == pkg.HostSpray { return ihttp.BuildHostRequest(pool.ClientType, pool.BaseURL, s) } else if pool.Mod == pkg.PathSpray { - return ihttp.BuildPathRequest(pool.ClientType, pool.BaseURL, s) + return ihttp.BuildPathRequest(pool.ClientType, pool.base, s) } return nil, fmt.Errorf("unknown mod") } @@ -258,13 +272,13 @@ Loop: } pool.wg.Add(1) - 
pool.reqPool.Invoke(newUnit(u, WordSource)) + pool.reqPool.Invoke(newUnit(pool.safePath(u), WordSource)) // 原样的目录拼接, 输入了几个"/"就是几个, 适配java的目录解析 case source := <-pool.checkCh: pool.Statistor.CheckNumber++ if pool.Mod == pkg.HostSpray { pool.reqPool.Invoke(newUnit(pkg.RandHost(), source)) } else if pool.Mod == pkg.PathSpray { - pool.reqPool.Invoke(newUnit(safePath(pool.BaseURL, pkg.RandPath()), source)) + pool.reqPool.Invoke(newUnit(pool.safePath(pkg.RandPath()), source)) } case unit, ok := <-pool.additionCh: if !ok { @@ -306,32 +320,29 @@ func (pool *Pool) Invoke(v interface{}) { if reqerr != nil && reqerr != fasthttp.ErrBodyTooLarge { pool.failedCount++ atomic.AddInt32(&pool.Statistor.FailedNumber, 1) - bl = &pkg.Baseline{UrlString: pool.BaseURL + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()} + bl = &pkg.Baseline{UrlString: pool.base + unit.path, IsValid: false, ErrString: reqerr.Error(), Reason: ErrRequestFailed.Error()} pool.failedBaselines = append(pool.failedBaselines, bl) } else { if unit.source <= 3 || unit.source == CrawlSource || unit.source == CommonFileSource { + // 一些高优先级的source, 将跳过PreCompare + bl = pkg.NewBaseline(req.URI(), req.Host(), resp) + } else if pool.MatchExpr != nil { + // 如果自定义了match函数, 则所有数据送入tempch中 + bl = pkg.NewBaseline(req.URI(), req.Host(), resp) + } else if err = pool.PreCompare(resp); err == nil { + // 通过预对比跳过一些无用数据, 减少性能消耗 bl = pkg.NewBaseline(req.URI(), req.Host(), resp) } else { - if pool.MatchExpr != nil { - // 如果非wordsource, 或自定义了match函数, 则所有数据送入tempch中 - bl = pkg.NewBaseline(req.URI(), req.Host(), resp) - } else if err = pool.PreCompare(resp); err == nil { - // 通过预对比跳过一些无用数据, 减少性能消耗 - bl = pkg.NewBaseline(req.URI(), req.Host(), resp) - if err != ErrRedirect && bl.RedirectURL != "" { - if bl.RedirectURL != "" && !strings.HasPrefix(bl.RedirectURL, "http") { - bl.RedirectURL = "/" + strings.TrimLeft(bl.RedirectURL, "/") - bl.RedirectURL = pool.BaseURL + bl.RedirectURL - } - pool.wg.Add(1) - 
pool.doRedirect(bl, unit.depth) - } - } else { - bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error()) - } + bl = pkg.NewInvalidBaseline(req.URI(), req.Host(), resp, err.Error()) } } + // 手动处理重定向 + if bl.IsValid && unit.source != CheckSource && bl.RedirectURL != "" { + pool.wg.Add(1) + pool.doRedirect(bl, unit.depth) + } + if ihttp.DefaultMaxBodySize != 0 && bl.BodyLength > ihttp.DefaultMaxBodySize { bl.ExceedLength = true } @@ -484,7 +495,7 @@ func (pool *Pool) Upgrade(bl *pkg.Baseline) error { rurl, err := url.Parse(bl.RedirectURL) if err == nil && rurl.Hostname() == bl.Url.Hostname() && bl.Url.Scheme == "http" && rurl.Scheme == "https" { logs.Log.Infof("baseurl %s upgrade http to https, reinit", pool.BaseURL) - pool.BaseURL = strings.Replace(pool.BaseURL, "http", "https", 1) + pool.base = strings.Replace(pool.BaseURL, "http", "https", 1) pool.url.Scheme = "https" // 重新初始化 err = pool.Init() @@ -501,20 +512,19 @@ func (pool *Pool) doRedirect(bl *pkg.Baseline, depth int) { if depth >= MaxRedirect { return } + reURL := FormatURL(bl.Url.Path, bl.RedirectURL) - if uu, err := url.Parse(bl.RedirectURL); err == nil && uu.Hostname() == pool.index.Url.Hostname() { - pool.wg.Add(1) - go pool.addAddition(&Unit{ - path: uu.Path, - source: RedirectSource, - frontUrl: bl.UrlString, - depth: depth + 1, - }) - } + pool.wg.Add(1) + go pool.addAddition(&Unit{ + path: reURL, + source: RedirectSource, + frontUrl: bl.UrlString, + depth: depth + 1, + }) } func (pool *Pool) doCrawl(bl *pkg.Baseline) { - if !pool.Crawl { + if !pool.Crawl || bl.ReqDepth >= MaxCrawl { pool.wg.Done() return } @@ -523,46 +533,12 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) { pool.wg.Done() return } + go func() { defer pool.wg.Done() for _, u := range bl.URLs { - if strings.HasPrefix(u, "//") { - parsed, err := url.Parse(u) - if err != nil { - continue - } - if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 { - continue - } - u = parsed.Path - } else if strings.HasPrefix(u, "/") { - // 
绝对目录拼接 - // 不需要进行处理, 用来跳过下面的判断 - } else if strings.HasPrefix(u, "./") { - // "./"相对目录拼接 - if bl.Dir { - u = pkg.URLJoin(bl.Url.Path, u[2:]) - } else { - u = pkg.URLJoin(path.Dir(bl.Url.Path), u[2:]) - } - } else if strings.HasPrefix(u, "../") { - u = path.Join(path.Dir(bl.Url.Path), u) - } else if !strings.HasPrefix(u, "http") { - // 相对目录拼接 - if bl.Dir { - u = pkg.URLJoin(bl.Url.Path, u) - } else { - u = pkg.URLJoin(path.Dir(bl.Url.Path), u) - } - } else { - parsed, err := url.Parse(u) - if err != nil { - continue - } - if parsed.Host != bl.Url.Host || len(parsed.Path) <= 1 { - continue - } - u = parsed.Path + if u = FormatURL(bl.Url.Path, u); u == "" || u == pool.url.Path { + continue } pool.locker.Lock() @@ -571,14 +547,12 @@ func (pool *Pool) doCrawl(bl *pkg.Baseline) { } else { // 通过map去重, 只有新的url才会进入到该逻辑 pool.urls[u] = 1 - if bl.ReqDepth < MaxCrawl { - pool.wg.Add(1) - pool.addAddition(&Unit{ - path: u[1:], - source: CrawlSource, - depth: bl.ReqDepth + 1, - }) - } + pool.wg.Add(1) + pool.addAddition(&Unit{ + path: u, + source: CrawlSource, + depth: bl.ReqDepth + 1, + }) } pool.locker.Unlock() } @@ -601,7 +575,7 @@ func (pool *Pool) doRule(bl *pkg.Baseline) { for u := range rule.RunAsStream(pool.AppendRule.Expressions, path.Base(bl.Path)) { pool.wg.Add(1) pool.addAddition(&Unit{ - path: path.Join(path.Dir(bl.Path), u), + path: Dir(bl.Url.Path) + u, source: RuleSource, }) } @@ -613,7 +587,7 @@ func (pool *Pool) doActive() { for _, u := range pkg.ActivePath { pool.wg.Add(1) pool.addAddition(&Unit{ - path: safePath(pool.BaseURL, u), + path: pool.dir + u[1:], source: ActiveSource, }) } @@ -629,7 +603,7 @@ func (pool *Pool) doBak() { for w := range worder.C { pool.wg.Add(1) pool.addAddition(&Unit{ - path: safePath(pool.BaseURL, w), + path: pool.dir + w, source: BakSource, }) } @@ -642,7 +616,7 @@ func (pool *Pool) doBak() { for w := range worder.C { pool.wg.Add(1) pool.addAddition(&Unit{ - path: safePath(pool.BaseURL, w), + path: pool.dir + w, source: BakSource, }) 
// relaPath resolves the reference u against the request path base without
// normalising the result. path.Join is deliberately not used here: runs of
// slashes such as "////" can be a meaningful route on some servers and must
// not be collapsed.
//
//	base    u      result
//	""      "/a"   "/a"
//	""      "a"    "/a"
//	"/"     ""     "/"
//	"/a/"   "b"    "/a/b"
//	"/a/"   "/b"   "/a/b"
//	"/a"    "b"    "/b"
//	"/a"    "/b"   "/b"
func relaPath(base, u string) string {
	if u == "" {
		return base
	}
	// Dir maps "" to "/" and returns a base that already ends in "/" as-is,
	// so every base shape shares this one expression. Only a single leading
	// "/" of u is dropped; any further slashes are kept verbatim.
	return Dir(base) + strings.TrimPrefix(u, "/")
}

// Dir returns the directory part of u, including the trailing "/". It never
// collapses repeated slashes and it does not walk up to a parent directory:
// a path that already ends in "/" is returned unchanged.
//
//	"/a"   -> "/"
//	"/a/"  -> "/a/"
//	"a/"   -> "a/"
//	"aaa"  -> "/"
func Dir(u string) string {
	if strings.HasSuffix(u, "/") {
		return u
	}
	i := strings.LastIndex(u, "/")
	if i == -1 {
		// No slash at all (including the empty string): fall back to the root.
		return "/"
	}
	return u[:i+1]
}

// FormatURL converts a link u found while handling the request path base into
// a request path. It returns "" when u cannot be parsed or when it points at
// an empty or root path, which callers treat as "skip".
//
// Absolute ("http://host/p") and network-path ("//host/p") references are
// reduced to their path component; the host is not compared against anything
// here. Paths starting with "/" are returned unchanged. "./x" and plain "x"
// are resolved against the directory of base via relaPath, and "../x" is
// resolved with path.Join.
func FormatURL(base, u string) string {
	netPath := strings.HasPrefix(u, "//")
	if netPath || strings.HasPrefix(u, "http") {
		parsed, err := url.Parse(u)
		if err != nil {
			return ""
		}
		if !netPath && parsed.Scheme == "" {
			// Merely starts with the letters "http" (e.g. "httpd.conf"):
			// this is a relative file name, not an absolute URL.
			return relaPath(base, u)
		}
		if len(parsed.Path) <= 1 {
			// Skip "/" and empty paths.
			return ""
		}
		return parsed.Path
	}

	switch {
	case strings.HasPrefix(u, "/"):
		// Already an absolute path; nothing to resolve.
		return u
	case strings.HasPrefix(u, "./"):
		// "./" relative reference.
		return relaPath(base, u[2:])
	case strings.HasPrefix(u, "../"):
		// path.Join cleans the result, so ".." segments are resolved here
		// (repeated and trailing slashes are removed as a side effect).
		return path.Join(Dir(base), u)
	default:
		// Plain relative reference.
		return relaPath(base, u)
	}
}
*ihttp.Response) *Baseline { Status: resp.StatusCode(), IsValid: true, } - uu, err := url.Parse(u) - if err == nil { - bl.Path = uu.Path - bl.Url = uu - } - bl.Dir = bl.IsDir() - if bl.Url.Host != host { - bl.Host = host - } header := resp.Header() bl.Header = make([]byte, len(header)) copy(bl.Header, header) @@ -53,6 +44,16 @@ func NewBaseline(u, host string, resp *ihttp.Response) *Baseline { } bl.Raw = append(bl.Header, bl.Body...) bl.RedirectURL = resp.GetHeader("Location") + + uu, err := url.Parse(u) + if err == nil { + bl.Path = uu.Path + bl.Url = uu + } + bl.Dir = bl.IsDir() + if bl.Url.Host != host { + bl.Host = host + } return bl } @@ -64,10 +65,16 @@ func NewInvalidBaseline(u, host string, resp *ihttp.Response, reason string) *Ba Reason: reason, } + // 无效数据也要读取body, 否则keep-alive不生效 + resp.Body() + bl.BodyLength = resp.ContentLength() + bl.RedirectURL = string(resp.GetHeader("Location")) + uu, err := url.Parse(u) if err == nil { bl.Path = uu.Path bl.Url = uu + return bl } bl.Dir = bl.IsDir() @@ -75,11 +82,6 @@ func NewInvalidBaseline(u, host string, resp *ihttp.Response, reason string) *Ba bl.Host = host } - // 无效数据也要读取body, 否则keep-alive不生效 - resp.Body() - bl.BodyLength = resp.ContentLength() - bl.RedirectURL = string(resp.GetHeader("Location")) - return bl } @@ -152,7 +154,7 @@ func (bl *Baseline) CollectURL() { urls := reg.FindAllStringSubmatch(string(bl.Body), -1) for _, u := range urls { u[1] = formatURL(u[1]) - if !filterJs(u[1]) { + if u[1] != "" && !filterJs(u[1]) { bl.URLs = append(bl.URLs, u[1]) } } @@ -162,7 +164,7 @@ func (bl *Baseline) CollectURL() { urls := reg.FindAllStringSubmatch(string(bl.Body), -1) for _, u := range urls { u[1] = formatURL(u[1]) - if !filterUrl(u[1]) { + if u[1] != "" && !filterUrl(u[1]) { bl.URLs = append(bl.URLs, u[1]) } } @@ -314,12 +316,11 @@ func (bl *Baseline) Format(probes []string) string { func (bl *Baseline) ColorString() string { var line strings.Builder - line.WriteString(logs.GreenLine("[" + 
GetSourceName(bl.Source) + "]")) + line.WriteString(logs.GreenLine("[" + GetSourceName(bl.Source) + "] ")) if bl.FrontURL != "" { line.WriteString(logs.CyanLine(bl.FrontURL)) line.WriteString(" --> ") } - line.WriteString(" ") line.WriteString(logs.GreenLine(bl.UrlString)) if bl.Host != "" { line.WriteString(" (" + bl.Host + ")") @@ -368,12 +369,11 @@ func (bl *Baseline) ColorString() string { func (bl *Baseline) String() string { var line strings.Builder - line.WriteString(logs.GreenLine("[" + GetSourceName(bl.Source) + "]")) + line.WriteString(logs.GreenLine("[" + GetSourceName(bl.Source) + "] ")) if bl.FrontURL != "" { line.WriteString(bl.FrontURL) line.WriteString(" --> ") } - line.WriteString(" ") line.WriteString(bl.UrlString) if bl.Host != "" { line.WriteString(" (" + bl.Host + ")") diff --git a/pkg/ihttp/request.go b/pkg/ihttp/request.go index 20a530a..6fe89d2 100644 --- a/pkg/ihttp/request.go +++ b/pkg/ihttp/request.go @@ -3,16 +3,15 @@ package ihttp import ( "github.com/valyala/fasthttp" "net/http" - "strings" ) func BuildPathRequest(clientType int, base, path string) (*Request, error) { if clientType == FAST { req := fasthttp.AcquireRequest() - req.SetRequestURI(safeUrlJoin(base, path)) + req.SetRequestURI(base + path) return &Request{FastRequest: req, ClientType: FAST}, nil } else { - req, err := http.NewRequest("GET", safeUrlJoin(base, path), nil) + req, err := http.NewRequest("GET", base+path, nil) return &Request{StandardRequest: req, ClientType: STANDARD}, err } } @@ -75,15 +74,3 @@ func (r *Request) Host() string { return "" } } - -func safeUrlJoin(base, uri string) string { - if uri == "" { - // 如果url为空, 则直接对原样的url请求 - return base - } - if !strings.HasSuffix(base, "/") && !strings.HasPrefix(uri, "/") { - return base + "/" + uri - } else { - return base + uri - } -} diff --git a/pkg/utils.go b/pkg/utils.go index 5f89c96..bcfb0db 100644 --- a/pkg/utils.go +++ b/pkg/utils.go @@ -29,8 +29,8 @@ var ( } URLRegexps []*regexp.Regexp = []*regexp.Regexp{ 
regexp.MustCompile(`=\s{0,6}(https{0,1}:[^\s',’"”><;()|*\[]{2,250})`), - regexp.MustCompile(`["']([^\s',’"”><;()|*\[]{2,250}\.[a-zA-Z]\w{1,3})["']`), - regexp.MustCompile(`["'](https?:[^\s',’"”><;()|*\[]{2,250}?\.[^\s',’"”><;()|*\[]{2,250}?)["']`), + regexp.MustCompile(`["']([^\s',’"”><.@;()|*\[]{2,250}\.[a-zA-Z]\w{1,4})["']`), + regexp.MustCompile(`["'](https?:[^\s',’"”><;()@|*\[]{2,250}?\.[^\s',’"”><;()|*\[]{2,250}?)["']`), regexp.MustCompile(`["']\s{0,6}([#,.]{0,2}/[^\s',’"”><;()|*\[]{2,250}?)\s{0,6}["']`), regexp.MustCompile(`href\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s',’"”><;()|*\[]{2,250})|action\s{0,6}=\s{0,6}["'‘“]{0,1}\s{0,6}([^\s'’"“><)(]{2,250})`), } @@ -227,7 +227,7 @@ func FingerDetect(content string) Frameworks { var ( BadExt = []string{".js", ".css", ".scss", ".,", ".jpeg", ".jpg", ".png", ".gif", ".svg", ".vue", ".ts", ".swf", ".pdf", ".mp4"} - BadURL = []string{";", "}", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path", "*#__PURE__*"} + BadURL = []string{";", "}", "webpack://", "{", "www.w3.org", ".src", ".url", ".att", ".href", "location.href", "javascript:", "location:", ".createObject", ":location", ".path"} ) func filterJs(u string) bool { @@ -291,17 +291,29 @@ func commonFilter(u string) bool { return false } -func URLJoin(base, uri string) string { - baseSlash := strings.HasSuffix(base, "/") - uriSlash := strings.HasPrefix(uri, "/") - if (baseSlash && !uriSlash) || (!baseSlash && uriSlash) { - return base + uri - } else if baseSlash && uriSlash { - return base + uri[1:] - } else { - return base + "/" + uri - } -} +//func SafeJoin(base, uri string) string { +// baseSlash := strings.HasSuffix(base, "/") +// uriSlash := strings.HasPrefix(uri, "/") +// if (baseSlash && !uriSlash) || (!baseSlash && uriSlash) { +// return base + uri +// } else if baseSlash && uriSlash { +// return base + uri[1:] +// } else { +// return base + "/" + uri +// } +//} + +//func 
SafePath(url, path string) string { +// urlSlash := strings.HasSuffix(url, "/") +// pathSlash := strings.HasPrefix(path, "/") +// if !urlSlash && !pathSlash { +// return "/" + path +// } else if urlSlash && pathSlash { +// return path[1:] +// } else { +// return path +// } +//} func BakGenerator(domain string) []string { var possibilities []string