You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
407 lines
9.3 KiB
407 lines
9.3 KiB
package main |
|
|
|
import ( |
|
"bytes" |
|
"errors" |
|
"fmt" |
|
"io" |
|
"net/http" |
|
"net/url" |
|
"os" |
|
"path" |
|
"strings" |
|
|
|
gq "github.com/PuerkitoBio/goquery" |
|
"golang.org/x/net/html" |
|
"golang.org/x/net/html/atom" |
|
) |
|
|
|
// Sentinel errors returned by the extractor. Compare against them with
// errors.Is.
var (
	// ErrInvalidURL is returned by getBaseUrl when the URL has the wrong
	// scheme or host, or lacks the <author>/<book> path.
	ErrInvalidURL = errors.New("invalid url")
	// ErrNoChaptersFound is returned by getChapters when the index page
	// yields no chapter links.
	ErrNoChaptersFound = errors.New("no chapters found in index")
	// ErrParsingPage is returned by parseAdditionalPage when content shows up
	// after more than two marker <hr> elements.
	ErrParsingPage = errors.New("error parsing page")
	// ErrBookNotFound is returned by FetchAndProcessIndex when the index page
	// responds with HTTP 404.
	ErrBookNotFound = errors.New("book not found")
)
|
|
|
// ANSI escape sequences used to colour terminal output.
const (
	// colRed starts the text style used for error messages.
	colRed = "\033[31;1m"
	// colYellow starts the text style used for warnings.
	colYellow = "\033[33;1m"
	// colReset ends the style started by colRed or colYellow.
	colReset = "\033[m"
)
|
|
|
func usage(arg0 string, exitStatus int) { |
|
fmt.Fprintln(os.Stderr, `Usage: |
|
`+arg0+` [options...] <BOOK_URL> |
|
|
|
Book URL format: |
|
http[s]://[www.]projekt-gutenberg.org/<author>/<book>[/whateverdoesntmatter] |
|
|
|
Options: |
|
-dir <DIRECTORY> -- Output directory (default: "."). |
|
|
|
Output types: |
|
* <INFO> |
|
`+colYellow+`! <WARNING>`+colReset+` |
|
`+colRed+`! <ERROR>`+colReset) |
|
os.Exit(exitStatus) |
|
} |
|
|
|
// clearLine prints the ANSI "erase entire line" sequence to stdout so that a
// progress line can be overwritten.
func clearLine() {
	const eraseLine = "\033[2K"
	fmt.Print(eraseLine)
}
|
|
|
// printInfo formats f with v like fmt.Printf and writes the result to stdout
// as an info line: prefixed with "* " and terminated by a newline.
func printInfo(f string, v ...interface{}) {
	format := "* " + f + "\n"
	fmt.Printf(format, v...)
}
|
|
|
func printWarn(f string, v ...interface{}) { |
|
fmt.Fprintf(os.Stderr, colYellow+"! "+f+colReset+"\n", v...) |
|
} |
|
|
|
func printErr(f string, v ...interface{}) { |
|
fmt.Fprintf(os.Stderr, colRed+"! "+f+colReset+"\n", v...) |
|
os.Exit(1) |
|
} |
|
|
|
func getBaseUrl(rawurl string) (string, error) { |
|
url, err := url.Parse(rawurl) |
|
if err != nil { |
|
return "", err |
|
} |
|
if !(url.Scheme == "http" || url.Scheme == "https") { |
|
return "", ErrInvalidURL |
|
} |
|
if !(url.Host == "projekt-gutenberg.org" || url.Host == "www.projekt-gutenberg.org") { |
|
return "", ErrInvalidURL |
|
} |
|
spPath := strings.Split(strings.Trim(url.Path, "/"), "/") |
|
if len(spPath) < 2 { |
|
return "", ErrInvalidURL |
|
} |
|
basePath := strings.Join(spPath[:2], "/") |
|
return url.Scheme + "://projekt-gutenberg.org/" + basePath, nil |
|
} |
|
|
|
// Returns a slice containing the links to the chapters. |
|
func getChapters(baseUrl string, doc *gq.Document) ([]string, error) { |
|
chapterUrls := make([]string, 0, 8) |
|
doc.Find("body ul li").Each(func(i int, s *gq.Selection) { |
|
// The website has a strange bug where the 'a' element is separate from |
|
// the text element. That's why we have to search the entire 'li' |
|
// element for an 'a' element with a link. |
|
s = s.Find("a[href]") |
|
if len(s.Nodes) == 0 { |
|
// This should really never happen, that's why we're using panic. |
|
panic("missing link in chapter index") |
|
} |
|
relUrl, _ := s.Attr("href") // We now know it must have the href attribute. |
|
chapterUrls = append(chapterUrls, baseUrl+"/"+relUrl) |
|
}) |
|
if len(chapterUrls) == 0 { |
|
return nil, ErrNoChaptersFound |
|
} |
|
return chapterUrls, nil |
|
} |
|
|
|
// MetaInfo holds the bibliographic metadata of a book.
type MetaInfo struct {
	// Author is the name of the book's author.
	Author string
	// Title is the title of the book.
	Title string
	// Year is the year of first publication, kept as text.
	Year string
}
|
|
|
func getMetaInfo(doc *gq.Document) MetaInfo { |
|
metas := doc.Find("head meta") |
|
return MetaInfo{ |
|
Author: metas.Filter("[name=\"author\"]").AttrOr("content", "Unknown"), |
|
Title: metas.Filter("[name=\"title\"]").AttrOr("content", "Unknown"), |
|
Year: metas.Filter("[name=\"firstpub\"]").AttrOr("content", "Unknown"), |
|
} |
|
} |
|
|
|
func (m MetaInfo) ToTitle() string { |
|
return fmt.Sprintf("%s -- %s, %s", m.Author, m.Title, m.Year) |
|
} |
|
|
|
type Extractor struct { |
|
BaseUrl string |
|
Meta MetaInfo |
|
ChapterUrls []string |
|
W io.Writer |
|
} |
|
|
|
func NewExtractor(rawurl string, w io.Writer) (*Extractor, error) { |
|
baseUrl, err := getBaseUrl(rawurl) |
|
if err != nil { |
|
return nil, err |
|
} |
|
return &Extractor{ |
|
BaseUrl: baseUrl, |
|
W: w, |
|
}, nil |
|
} |
|
|
|
func (e *Extractor) FetchAndProcessIndex() error { |
|
// Get HTML document. |
|
resp, err := http.Get(e.BaseUrl) |
|
if err != nil { |
|
return err |
|
} |
|
if resp.StatusCode == 404 { |
|
return ErrBookNotFound |
|
} |
|
defer resp.Body.Close() |
|
// Parse HTML via Goquery. |
|
doc, err := gq.NewDocumentFromReader(resp.Body) |
|
if err != nil { |
|
return err |
|
} |
|
// Get metadata. |
|
metaInfo := getMetaInfo(doc) |
|
e.Meta = metaInfo |
|
// Get chapter URLs from index. |
|
chapterUrls, err := getChapters(e.BaseUrl, doc) |
|
if err != nil { |
|
return err |
|
} |
|
e.ChapterUrls = chapterUrls |
|
return nil |
|
} |
|
|
|
func (e *Extractor) parseAdditionalPage(doc *gq.Document) error { |
|
// Every document has two main <hr> elements with the given properties. |
|
// They are a way to mark the contained text. |
|
var passedHrs int |
|
var err error |
|
content := doc.Find("body").Children().FilterFunction(func(i int, s *gq.Selection) bool { |
|
if s.Is("hr[size=\"1\"][color=\"#808080\"]") { |
|
passedHrs++ |
|
return false |
|
} else if s.Is("a") && (s.Text() == "<<\u00A0zurück" || s.Text() == "weiter\u00A0>>") { |
|
// We don't want the "zurück"/"weiter"-buttons |
|
return false |
|
} |
|
switch passedHrs { |
|
case 0: |
|
return false |
|
case 1: |
|
return true |
|
case 2: |
|
return false |
|
default: |
|
err = ErrParsingPage |
|
return false |
|
} |
|
}) |
|
if err != nil { |
|
return err |
|
} |
|
|
|
// Now that we've extracted the actual content, convert it into markdown. |
|
var process func(*html.Node) string |
|
process = func(n *html.Node) string { |
|
processChildren := func() string { |
|
var ret string |
|
for i := n.FirstChild; i != nil; i = i.NextSibling { |
|
ret += process(i) |
|
} |
|
return ret |
|
} |
|
|
|
// Checks if `n` has the given HTML class. |
|
hasClass := func(class string) bool { |
|
for _, v := range n.Attr { |
|
if v.Key == "class" { |
|
classes := strings.Split(v.Val, " ") |
|
for _, cl := range classes { |
|
if cl == class { |
|
return true |
|
} |
|
} |
|
return false |
|
} |
|
} |
|
return false |
|
} |
|
|
|
var ret string |
|
switch n.Type { |
|
case html.TextNode: |
|
// If we have a text node, return the actual text after some |
|
// post-processing. |
|
ret = strings.ReplaceAll(n.Data, "\n", "") |
|
var newRet string |
|
// Replace all sequences of spaces consisting of more than one space |
|
// with just one space. |
|
var prevWasSpace bool |
|
for _, c := range ret { |
|
if c == ' ' { |
|
if prevWasSpace { |
|
continue |
|
} |
|
prevWasSpace = true |
|
} else { |
|
prevWasSpace = false |
|
} |
|
newRet += string(c) |
|
} |
|
ret = newRet |
|
case html.ElementNode: |
|
// Transform the individual HTML elements. |
|
switch n.DataAtom { |
|
case atom.Br: |
|
ret = "\n\n" |
|
case atom.H1: |
|
ret = "# " + processChildren() + "\n" |
|
case atom.H2: |
|
ret = "## " + processChildren() + "\n" |
|
case atom.H3: |
|
ret = "### " + processChildren() + "\n" |
|
case atom.H4: |
|
ret = "#### " + processChildren() + "\n" |
|
case atom.H5: |
|
ret = "##### " + processChildren() + "\n" |
|
case atom.H6: |
|
ret = "###### " + processChildren() + "\n" |
|
case atom.P: |
|
if hasClass("centerbig") { |
|
ret = "#### " + processChildren() + "\n\n" |
|
} else { |
|
ret = /*" " + */ processChildren() + "\n\n" |
|
} |
|
case atom.Div: |
|
ret = processChildren() |
|
case atom.Tt: |
|
ret = "`" + processChildren() + "`" |
|
case atom.I: |
|
ret = "_" + processChildren() + "_" |
|
case atom.A: |
|
ret = processChildren() |
|
case atom.Span: |
|
ret = processChildren() |
|
case atom.Img: |
|
default: |
|
clearLine() |
|
printWarn("Unknown data atom: %v", n.Data) |
|
} |
|
// Add some CSS effects. |
|
if hasClass("spaced") { |
|
// Add spaced effect. |
|
var newRet string |
|
var runes []rune = []rune(ret) |
|
var nRunes = len(runes) |
|
for i := 0; i < nRunes; i++ { |
|
newRet += string(runes[i]) |
|
if i < nRunes-1 { |
|
newRet += " " |
|
} |
|
} |
|
ret = newRet |
|
} |
|
default: |
|
clearLine() |
|
printWarn("Unknown type: %v", n.Type) |
|
} |
|
return ret |
|
} |
|
for _, n := range content.Nodes { |
|
fmt.Fprint(e.W, process(n)) |
|
} |
|
return nil |
|
} |
|
|
|
func (e *Extractor) FetchAndProcessChapter(chapterUrl string) error { |
|
// Get HTML document. |
|
resp, err := http.Get(chapterUrl) |
|
if err != nil { |
|
return err |
|
} |
|
defer resp.Body.Close() |
|
// Parse HTML via Goquery (or really x/net/html). |
|
doc, err := gq.NewDocumentFromReader(resp.Body) |
|
if err != nil { |
|
return err |
|
} |
|
// Parse page. |
|
err = e.parseAdditionalPage(doc) |
|
if err != nil { |
|
return err |
|
} |
|
// Add horizontal rule after title page. |
|
if path.Base(chapterUrl) == "titlepage.html" { |
|
fmt.Fprintln(e.W, "\n----------------\n") |
|
} |
|
return nil |
|
} |
|
|
|
func main() { |
|
var url string |
|
dir := "." |
|
|
|
if len(os.Args) < 2 { |
|
usage(os.Args[0], 1) |
|
} |
|
// Parse command line arguments. |
|
for i := 1; i < len(os.Args); i++ { |
|
// Returns the argument after the given option. Errors if there is no |
|
// argument. |
|
expectArg := func(currArg string) string { |
|
i++ |
|
if i >= len(os.Args) { |
|
printErr("Expected argument after option '%v'", currArg) |
|
} |
|
return os.Args[i] |
|
} |
|
|
|
arg := os.Args[i] |
|
if len(arg) >= 1 && arg[0] == '-' { |
|
switch arg { |
|
case "-dir": |
|
dir = expectArg(arg) |
|
case "--help", "-h": |
|
usage(os.Args[0], 0) |
|
default: |
|
printErr("Unknown option: '%v'", arg) |
|
} |
|
} else { |
|
if url == "" { |
|
url = arg |
|
} else { |
|
printErr("Expected option, but got '%v'", arg) |
|
} |
|
} |
|
} |
|
if url == "" { |
|
printInfo("Please specify a book URL") |
|
os.Exit(1) |
|
} |
|
printInfo("Book URL: %v", url) |
|
|
|
// Initial scraping. |
|
var b bytes.Buffer |
|
e, err := NewExtractor(url, &b) |
|
if err != nil { |
|
printErr("Error: %v", err) |
|
} |
|
err = e.FetchAndProcessIndex() |
|
if err != nil { |
|
printErr("Error: %v", err) |
|
} |
|
bookName := e.Meta.ToTitle() |
|
printInfo("Book: %v", bookName) |
|
|
|
// Download the actual chapters. |
|
for i, chapter := range e.ChapterUrls { |
|
clearLine() |
|
fmt.Printf("* Downloading chapter %v/%v...\r", i+1, len(e.ChapterUrls)) |
|
err = e.FetchAndProcessChapter(chapter) |
|
if err != nil { |
|
printErr("Error: %v", err) |
|
} |
|
} |
|
|
|
// Write the generated markdown text to a file. |
|
filename := path.Join(dir, bookName+".md") |
|
os.WriteFile(filename, b.Bytes(), 0666) |
|
clearLine() |
|
printInfo("Saved as: %v", filename) |
|
}
|
|
|