dischord/extractor/util/util.go

60 lines
1.1 KiB
Go
Raw Permalink Normal View History

2022-09-20 00:54:22 +02:00
package util
import (
"golang.org/x/net/html"
"net/http"
)
// Retrieve JavaScript embedded in HTML
func GetHTMLScriptFunc(url string, readCodeLineByLine bool, codeFunc func(code string) bool) error {
resp, err := http.Get(url)
if err != nil {
return err
}
defer resp.Body.Close()
z := html.NewTokenizer(resp.Body)
isScript := false
for {
tt := z.Next()
switch tt {
case html.ErrorToken:
return z.Err()
case html.TextToken:
if codeFunc != nil && isScript {
t := string(z.Text())
if readCodeLineByLine {
// NOTE: a bufio line scanner doesn't work (bufio.Scanner: token too long); maybe this is a bug
// Iterate over each line in the script
ls := 0 // line start
le := 0 // line end
for ls < len(t) {
if le == len(t) || t[le] == '\n' {
ln := t[ls:le]
if !codeFunc(ln) {
return nil
}
ls = le + 1
}
le++
}
} else {
if !codeFunc(t) {
return nil
}
}
}
case html.StartTagToken, html.EndTagToken:
tn, _ := z.TagName()
if string(tn) == "script" {
isScript = tt == html.StartTagToken
}
}
}
}