2020-10-30 10:47:23 +00:00
|
|
|
package utils
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"net/url"
|
|
|
|
"strings"
|
|
|
|
)
|
|
|
|
|
|
|
|
// htmlTag holds the bytes of the literal opening tag "<html>".
var htmlTag = []byte("<html>")
|
|
|
|
|
2020-10-31 12:38:17 +00:00
|
|
|
// IsHtml reports whether body contains an opening <html> tag.
//
// The tag name is matched case-insensitively and may be followed by
// attributes, so "<html>", "<HTML>" and `<html lang="en">` all count.
// A "<html" that is followed by another name character (e.g. "<htmlx>")
// or that sits at the very end of body is not treated as a tag.
//
// An earlier version looked for the exact bytes "<html>" and was reported
// to fail sporadically (original repro note: run goop against
// enigmaticboys dot com, an adult website). Tags carrying attributes or
// written in upper case are one way that exact match misses real pages.
func IsHtml(body []byte) bool {
	tag := []byte("html")
	for {
		i := bytes.IndexByte(body, '<')
		if i < 0 {
			return false
		}
		// Continue scanning after this '<' whether or not it matches.
		body = body[i+1:]
		// Need the tag name plus one terminating byte after it.
		if len(body) <= len(tag) {
			return false
		}
		if !bytes.EqualFold(body[:len(tag)], tag) {
			continue
		}
		// The byte after the name must end the tag name, otherwise this
		// is a longer, unrelated element name.
		switch body[len(tag)] {
		case '>', '/', ' ', '\t', '\n', '\r', '\f':
			return true
		}
	}
}
|
|
|
|
|
|
|
|
func GetIndexedFiles(body []byte) ([]string, error) {
|
|
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
var files []string
|
|
|
|
var exitErr error
|
|
|
|
doc.Find("a").EachWithBreak(func(_ int, link *goquery.Selection) bool {
|
|
|
|
lnk, err := url.Parse(link.AttrOr("href", ""))
|
|
|
|
if err != nil {
|
|
|
|
exitErr = err
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if lnk.Path != "" &&
|
|
|
|
lnk.Path != "." &&
|
|
|
|
lnk.Path != ".." &&
|
|
|
|
!strings.HasPrefix(lnk.Path, "/") &&
|
|
|
|
lnk.Scheme == "" &&
|
|
|
|
lnk.Host == "" {
|
|
|
|
files = append(files, lnk.Path)
|
|
|
|
}
|
|
|
|
return true
|
|
|
|
})
|
|
|
|
return files, err
|
|
|
|
}
|