Initial commit

r4 2021-07-19 18:27:56 +02:00
parent ca11b81540
commit feafe19aa1
3 changed files with 413 additions and 0 deletions

go.mod Normal file

@@ -0,0 +1,5 @@
module projekt_gutenberg_de_dl

go 1.16

require github.com/PuerkitoBio/goquery v1.7.1

go.sum Normal file

@@ -0,0 +1,12 @@
github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4=
github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

main.go Normal file

@@ -0,0 +1,396 @@
package main

import (
    "bytes"
    "errors"
    "fmt"
    "io"
    "net/http"
    "net/url"
    "os"
    "path"
    "strings"

    gq "github.com/PuerkitoBio/goquery"
    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)
var (
ErrInvalidURL = errors.New("invalid url")
ErrNoChaptersFound = errors.New("no chapters found in index")
ErrParsingPage = errors.New("error parsing page")
)
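
// ANSI escape sequences for colored terminal output.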
const (
colRed = "\033[31m"
colYellow = "\033[33m"
colReset = "\033[m"
)
func usage(arg0 string, exitStatus int) {
fmt.Fprintln(os.Stderr, `Usage:
`+arg0+` [options...] <BOOK_URL>
Book URL format:
http[s]://[www.]projekt-gutenberg.org/<author>/<book>[/whateverdoesntmatter]
Options:
-dir <DIRECTORY> -- Output directory (default: ".").
Output types:
* <INFO>
`+colYellow+`! <WARNING>`+colReset+`
`+colRed+`! <ERROR>`+colReset)
os.Exit(exitStatus)
}
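
// Example invocation (placeholder <author>/<book> path, for illustration only):
//
//	./projekt_gutenberg_de_dl -dir books "https://www.projekt-gutenberg.org/<author>/<book>/"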
func printInfo(f string, v ...interface{}) {
fmt.Printf("* "+f+"\n", v...)
}
func printWarn(f string, v ...interface{}) {
fmt.Fprintf(os.Stderr, colYellow+"! "+f+colReset+"\n", v...)
}
func printErr(f string, v ...interface{}) {
fmt.Fprintf(os.Stderr, colRed+"! "+f+colReset+"\n", v...)
os.Exit(1)
}
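
// getBaseUrl normalizes a book URL to the book's index URL, keeping only the
// "<author>/<book>" part of the path. Sketch of the mapping:
//
//	https://www.projekt-gutenberg.org/<author>/<book>/anything.html
//	-> https://projekt-gutenberg.org/<author>/<book>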
func getBaseUrl(rawurl string) (string, error) {
    u, err := url.Parse(rawurl)
    if err != nil {
        return "", err
    }
    if !(u.Scheme == "http" || u.Scheme == "https") {
        return "", ErrInvalidURL
    }
    if !(u.Host == "projekt-gutenberg.org" || u.Host == "www.projekt-gutenberg.org") {
        return "", ErrInvalidURL
    }
    spPath := strings.Split(strings.Trim(u.Path, "/"), "/")
    if len(spPath) < 2 {
        return "", ErrInvalidURL
    }
    basePath := strings.Join(spPath[:2], "/")
    return u.Scheme + "://projekt-gutenberg.org/" + basePath, nil
}
// getChapters returns the absolute URLs of the chapters listed in the index.
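// The index page is assumed to list chapters roughly as follows (a sketch;
// actual file names vary):
//
//	<ul>
//	  <li><a href="chap001.html">…</a></li>
//	  ...
//	</ul>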
func getChapters(baseUrl string, doc *gq.Document) ([]string, error) {
chapterUrls := make([]string, 0, 8)
doc.Find("body ul li").Each(func(i int, s *gq.Selection) {
// The website has a strange bug where the 'a' element is separate from
// the text element. That's why we have to search the entire 'li'
// element for an 'a' element with a link.
s = s.Find("a[href]")
if len(s.Nodes) == 0 {
// This should really never happen, that's why we're using panic.
panic("missing link in chapter index")
}
relUrl, _ := s.Attr("href") // We now know it must have the href attribute.
chapterUrls = append(chapterUrls, baseUrl+"/"+relUrl)
})
if len(chapterUrls) == 0 {
return nil, ErrNoChaptersFound
}
return chapterUrls, nil
}
type MetaInfo struct {
Author string
Title string
Year string
}
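
// getMetaInfo reads the book's metadata from the index page's <head>, which
// is expected to carry tags of the form:
//
//	<meta name="author" content="...">
//	<meta name="title" content="...">
//	<meta name="firstpub" content="...">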
func getMetaInfo(doc *gq.Document) MetaInfo {
    metas := doc.Find("head meta")
    return MetaInfo{
        Author: metas.Filter(`[name="author"]`).AttrOr("content", "Unknown"),
        Title:  metas.Filter(`[name="title"]`).AttrOr("content", "Unknown"),
        Year:   metas.Filter(`[name="firstpub"]`).AttrOr("content", "Unknown"),
    }
}
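
// ToTitle renders the metadata as "<Author> -- <Title>, <Year>"; main also
// uses this string as the output file name.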
func (m MetaInfo) ToTitle() string {
return fmt.Sprintf("%s -- %s, %s", m.Author, m.Title, m.Year)
}
type Extractor struct {
BaseUrl string
Meta MetaInfo
ChapterUrls []string
W io.Writer
}
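
// The intended flow, as used in main: create an Extractor via NewExtractor,
// call FetchAndProcessIndex once, then FetchAndProcessChapter for each entry
// in ChapterUrls. The generated markdown accumulates in W.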
func NewExtractor(rawurl string, w io.Writer) (*Extractor, error) {
baseUrl, err := getBaseUrl(rawurl)
if err != nil {
return nil, err
}
return &Extractor{
BaseUrl: baseUrl,
W: w,
}, nil
}
func (e *Extractor) FetchAndProcessIndex() error {
// Get HTML document.
    resp, err := http.Get(e.BaseUrl)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("fetching index %v: unexpected status %v", e.BaseUrl, resp.Status)
    }
// Parse HTML via Goquery.
doc, err := gq.NewDocumentFromReader(resp.Body)
if err != nil {
return err
}
// Get metadata.
metaInfo := getMetaInfo(doc)
e.Meta = metaInfo
// Get chapter URLs from index.
chapterUrls, err := getChapters(e.BaseUrl, doc)
if err != nil {
return err
}
e.ChapterUrls = chapterUrls
return nil
}
func (e *Extractor) parseAdditionalPage(doc *gq.Document) error {
    // Every chapter page contains two <hr size="1" color="#808080"> separator
    // elements that bracket the actual chapter text.
var passedHrs int
var err error
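    // Sketch of the expected page layout:
    //
    //	<body>
    //	  ...header/navigation...
    //	  <hr size="1" color="#808080">
    //	  ...chapter text (kept)...
    //	  <hr size="1" color="#808080">
    //	  ...footer...
    //	</body>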
content := doc.Find("body").Children().FilterFunction(func(i int, s *gq.Selection) bool {
if s.Is("hr[size=\"1\"][color=\"#808080\"]") {
passedHrs++
return false
} else if s.Is("a") && (s.Text() == "<<\u00A0zurück" || s.Text() == "weiter\u00A0>>") {
// We don't want the "zurück"/"weiter"-buttons
return false
}
switch passedHrs {
case 0:
return false
case 1:
return true
case 2:
return false
default:
err = ErrParsingPage
return false
}
})
if err != nil {
return err
}
// Now that we've extracted the actual content, convert it into markdown.
var process func(*html.Node) string
process = func(n *html.Node) string {
processChildren := func() string {
var ret string
for i := n.FirstChild; i != nil; i = i.NextSibling {
ret += process(i)
}
return ret
}
// Checks if `n` has the given HTML class.
hasClass := func(class string) bool {
for _, v := range n.Attr {
if v.Key == "class" {
classes := strings.Split(v.Val, " ")
for _, cl := range classes {
if cl == class {
return true
}
}
return false
}
}
return false
}
var ret string
switch n.Type {
case html.TextNode:
            // For a text node, return the text itself after some cleanup:
            // drop newlines, then collapse every run of spaces into a single
            // space.
            ret = strings.ReplaceAll(n.Data, "\n", "")
            var sb strings.Builder
            var prevWasSpace bool
            for _, c := range ret {
                if c == ' ' {
                    if prevWasSpace {
                        continue
                    }
                    prevWasSpace = true
                } else {
                    prevWasSpace = false
                }
                sb.WriteRune(c)
            }
            ret = sb.String()
case html.ElementNode:
// Transform the individual HTML elements.
switch n.DataAtom {
case atom.Br:
ret = "\n"
case atom.H1:
ret = "# " + processChildren() + "\n"
case atom.H2:
ret = "## " + processChildren() + "\n"
case atom.H3:
ret = "### " + processChildren() + "\n"
case atom.H4:
ret = "#### " + processChildren() + "\n"
case atom.H5:
ret = "##### " + processChildren() + "\n"
case atom.H6:
ret = "###### " + processChildren() + "\n"
case atom.P:
if hasClass("centerbig") {
ret = "#### " + processChildren() + "\n\n"
} else {
                ret = processChildren() + "\n\n"
}
case atom.Div:
ret = processChildren()
case atom.Tt:
ret = "`" + processChildren() + "`"
case atom.I:
ret = "_" + processChildren() + "_"
case atom.A:
ret = processChildren()
case atom.Span:
ret = processChildren()
            case atom.Img:
                // Images are skipped.
            default:
                printWarn("Unknown data atom: %v", n.Data)
}
        // Apply CSS-driven text effects.
        if hasClass("spaced") {
            // "Spaced" effect: insert a space between every pair of runes.
            runes := []rune(ret)
            var sb strings.Builder
            for i, r := range runes {
                if i > 0 {
                    sb.WriteRune(' ')
                }
                sb.WriteRune(r)
            }
            ret = sb.String()
        }
default:
printWarn("Unknown type:", n.Type)
}
return ret
}
for _, n := range content.Nodes {
fmt.Fprint(e.W, process(n))
}
return nil
}
func (e *Extractor) FetchAndProcessChapter(chapterUrl string) error {
// Get HTML document.
    resp, err := http.Get(chapterUrl)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("fetching chapter %v: unexpected status %v", chapterUrl, resp.Status)
    }
// Parse HTML via Goquery (or really x/net/html).
doc, err := gq.NewDocumentFromReader(resp.Body)
if err != nil {
return err
}
// Parse page.
err = e.parseAdditionalPage(doc)
if err != nil {
return err
}
// Add horizontal rule after title page.
if path.Base(chapterUrl) == "titlepage.html" {
fmt.Fprintln(e.W, "----------------")
}
return nil
}
func main() {
var url string
dir := "."
if len(os.Args) < 2 {
usage(os.Args[0], 1)
}
// Parse command line arguments.
for i := 1; i < len(os.Args); i++ {
// Returns the argument after the given option. Errors if there is no
// argument.
expectArg := func(currArg string) string {
i++
if i >= len(os.Args) {
printErr("Expected argument after option '%v'", currArg)
}
return os.Args[i]
}
arg := os.Args[i]
if len(arg) >= 1 && arg[0] == '-' {
switch arg {
case "-dir":
dir = expectArg(arg)
case "--help", "-h":
usage(os.Args[0], 0)
default:
printErr("Unknown option: '%v'", arg)
}
} else {
if url == "" {
url = arg
} else {
printErr("Expected option, but got '%v'", arg)
}
}
}
if url == "" {
printInfo("Please specify a book URL")
os.Exit(1)
}
printInfo("Book URL: %v", url)
// Initial scraping.
var b bytes.Buffer
e, err := NewExtractor(url, &b)
if err != nil {
panic(err)
}
err = e.FetchAndProcessIndex()
if err != nil {
panic(err)
}
bookName := e.Meta.ToTitle()
printInfo("Book: %v", bookName)
// Download the actual chapters.
    for i, chapter := range e.ChapterUrls {
        fmt.Printf("\033[2K* Downloading chapter %v/%v...\r", i+1, len(e.ChapterUrls))
        err = e.FetchAndProcessChapter(chapter)
        if err != nil {
            panic(err)
        }
    }
    fmt.Println() // Move past the progress line.
// Write the generated markdown text to a file.
    filename := path.Join(dir, bookName+".md")
    if err := os.WriteFile(filename, b.Bytes(), 0666); err != nil {
        printErr("Could not write file: %v", err)
    }
printInfo("Saved as: %v", filename)
}