// goop/internal/utils/html.go
//
// 41 lines
// 815 B
// Go
// Raw Normal View History

package utils
import (
"bytes"
"github.com/PuerkitoBio/goquery"
"net/url"
"strings"
)
// 2020-10-31 12:46:49 +00:00
// htmlTag is the opening of an <html> element. The closing '>' is left off
// so that tags carrying attributes (for example <html lang="en">) match too.
var htmlTag = []byte("<html")

// IsHtml reports whether body contains the byte sequence "<html" anywhere.
// The comparison is case-sensitive, so "<HTML" is not recognised.
func IsHtml(body []byte) bool {
	pos := bytes.Index(body, htmlTag)
	return pos != -1
}
// GetIndexedFiles parses body as an HTML document and collects the relative
// paths referenced by the href attribute of its <a> elements, in document
// order.
//
// A link is kept only when its href has no scheme and no host and its path
// is non-empty, is neither "." nor "..", and does not start with "/". Only
// the parsed path is returned, so any query string or fragment is dropped.
//
// It returns an error if the document cannot be parsed or if any href is not
// a valid URL. In the latter case scanning stops at the offending link and
// the returned slice is nil.
func GetIndexedFiles(body []byte) ([]string, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		return nil, err
	}

	var files []string
	// exitErr carries a failure out of the callback, which can only signal
	// "stop" through its boolean result.
	var exitErr error
	doc.Find("a").EachWithBreak(func(_ int, link *goquery.Selection) bool {
		// Anchors without an href yield "", which parses to an empty path
		// and is filtered out below.
		lnk, parseErr := url.Parse(link.AttrOr("href", ""))
		if parseErr != nil {
			exitErr = parseErr
			return false
		}
		if lnk.Path != "" &&
			lnk.Path != "." &&
			lnk.Path != ".." &&
			!strings.HasPrefix(lnk.Path, "/") &&
			lnk.Scheme == "" &&
			lnk.Host == "" {
			files = append(files, lnk.Path)
		}
		return true
	})
	// Report the callback's error: the outer err is always nil here, so
	// returning it would silently hide a malformed href.
	if exitErr != nil {
		return nil, exitErr
	}
	return files, nil
}