Initial commit

2021-07-19 18:27:56 +02:00 · 2021-07-19 18:27:56 +02:00 · feafe19aa1
commit feafe19aa1
parent ca11b81540
3 changed files with 413 additions and 0 deletions
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,5 @@
 module projekt_gutenberg_de_dl
 go 1.16
 require github.com/PuerkitoBio/goquery v1.7.1
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,12 @@
 github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4=
 github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY=
 github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
 github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
 golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q=
 golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/main.go
+++ b/main.go
@ -0,0 +1,396 @@
 package main
 import (
 	"bytes"
 	"errors"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
 	"path"
 	"strings"
 	gq "github.com/PuerkitoBio/goquery"
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
 )
 // Sentinel errors returned by the URL validation and scraping helpers.
 var (
 	// ErrInvalidURL is returned by getBaseUrl when the URL does not use the
 	// http or https scheme, does not point to projekt-gutenberg.org, or has
 	// fewer than two path segments.
 	ErrInvalidURL      = errors.New("invalid url")
 	// ErrNoChaptersFound is returned by getChapters when the index page
 	// yields no chapter links.
 	ErrNoChaptersFound = errors.New("no chapters found in index")
 	// ErrParsingPage is returned by parseAdditionalPage when content is found
 	// after more than two of the marker <hr> elements.
 	ErrParsingPage     = errors.New("error parsing page")
 )
 // ANSI escape sequences used to colour warnings and errors on the terminal.
 const (
 	colRed    = "\033[31m" // Switches the foreground colour to red.
 	colYellow = "\033[33m" // Switches the foreground colour to yellow.
 	colReset  = "\033[m"   // Resets all text attributes.
 )
 // usage prints the command-line help text to standard error and then
 // terminates the process with exitStatus. arg0 is inserted into the text as
 // the program name. This function never returns.
 func usage(arg0 string, exitStatus int) {
 	fmt.Fprintln(os.Stderr, `Usage:
  `+arg0+` [options...] <BOOK_URL>
 Book URL format:
  http[s]://[www.]projekt-gutenberg.org/<author>/<book>[/whateverdoesntmatter]
 Options:
  -dir <DIRECTORY>  --  Output directory (default: ".").
 Output types:
  * <INFO>
  `+colYellow+`! <WARNING>`+colReset+`
  `+colRed+`! <ERROR>`+colReset)
 	os.Exit(exitStatus)
 }
 // printInfo formats an informational message like fmt.Printf, prefixes it
 // with "* " and writes it, newline-terminated, to standard output.
 func printInfo(f string, v ...interface{}) {
 	line := fmt.Sprintf("* "+f+"\n", v...)
 	fmt.Print(line)
 }
 // printWarn formats a warning like fmt.Printf, prefixes it with "! " and
 // writes it in yellow, newline-terminated, to standard error.
 func printWarn(f string, v ...interface{}) {
 	format := colYellow + "! " + f + colReset + "\n"
 	fmt.Fprintf(os.Stderr, format, v...)
 }
 // printErr formats an error message like fmt.Printf, prefixes it with "! "
 // and writes it in red, newline-terminated, to standard error. It then
 // terminates the process with exit status 1, so it never returns.
 func printErr(f string, v ...interface{}) {
 	format := colRed + "! " + f + colReset + "\n"
 	fmt.Fprintf(os.Stderr, format, v...)
 	os.Exit(1)
 }
 // getBaseUrl validates rawurl and reduces it to the index URL of the book,
 // i.e. "<scheme>://projekt-gutenberg.org/<author>/<book>". Anything after the
 // first two path segments is discarded and a leading "www." is dropped.
 //
 // It returns the parse error if rawurl cannot be parsed, and ErrInvalidURL if
 // the scheme is neither http nor https, the host is not projekt-gutenberg.org
 // (compared case-insensitively), or the path does not start with two
 // non-empty segments.
 func getBaseUrl(rawurl string) (string, error) {
 	// Named u rather than url so the net/url package is not shadowed.
 	u, err := url.Parse(rawurl)
 	if err != nil {
 		return "", err
 	}
 	if u.Scheme != "http" && u.Scheme != "https" {
 		return "", ErrInvalidURL
 	}
 	// Host names are case-insensitive; url.Parse does not normalize them.
 	host := strings.ToLower(u.Host)
 	if host != "projekt-gutenberg.org" && host != "www.projekt-gutenberg.org" {
 		return "", ErrInvalidURL
 	}
 	segments := strings.Split(strings.Trim(u.Path, "/"), "/")
 	if len(segments) < 2 {
 		return "", ErrInvalidURL
 	}
 	// A doubled slash ("/author//book") produces an empty segment; reject it
 	// instead of building a base URL that points at the author page.
 	for _, seg := range segments[:2] {
 		if seg == "" {
 			return "", ErrInvalidURL
 		}
 	}
 	return u.Scheme + "://projekt-gutenberg.org/" + segments[0] + "/" + segments[1], nil
 }
 // getChapters collects the chapter links from the index page doc and returns
 // them as URLs of the form baseUrl + "/" + href, in document order. It
 // returns ErrNoChaptersFound if the index has no matching list items and
 // panics if a list item contains no link.
 func getChapters(baseUrl string, doc *gq.Document) ([]string, error) {
 	links := make([]string, 0, 8)
 	items := doc.Find("body ul li")
 	items.Each(func(_ int, item *gq.Selection) {
 		// On this website the 'a' element can be separate from the text
 		// element, so the whole 'li' is searched for an 'a' carrying a link.
 		anchor := item.Find("a[href]")
 		if anchor.Length() == 0 {
 			// This should really never happen, hence the panic.
 			panic("missing link in chapter index")
 		}
 		// The selector guarantees that the href attribute exists.
 		href, _ := anchor.Attr("href")
 		links = append(links, baseUrl+"/"+href)
 	})
 	if len(links) > 0 {
 		return links, nil
 	}
 	return nil, ErrNoChaptersFound
 }
 // MetaInfo holds the book metadata that getMetaInfo reads from the <meta>
 // elements of the index page. Each field is "Unknown" when the corresponding
 // element is missing.
 type MetaInfo struct {
 	Author string // Content of the "author" meta element.
 	Title  string // Content of the "title" meta element.
 	Year   string // Content of the "firstpub" meta element.
 }
 // getMetaInfo reads author, title and year of first publication from the
 // <meta> elements in the head of doc. A value that cannot be found is
 // reported as "Unknown".
 func getMetaInfo(doc *gq.Document) MetaInfo {
 	metas := doc.Find("head meta")
 	// content returns the content attribute of the meta element called name.
 	content := func(name string) string {
 		return metas.Filter("[name=\"" + name + "\"]").AttrOr("content", "Unknown")
 	}
 	var info MetaInfo
 	info.Author = content("author")
 	info.Title = content("title")
 	info.Year = content("firstpub")
 	return info
 }
 // ToTitle renders the metadata as "<Author> -- <Title>, <Year>".
 func (m MetaInfo) ToTitle() string {
 	// All operands are strings, so fmt.Sprint inserts no separating spaces.
 	return fmt.Sprint(m.Author, " -- ", m.Title, ", ", m.Year)
 }
 // Extractor downloads a book from projekt-gutenberg.org and writes it as
 // Markdown to W.
 type Extractor struct {
 	BaseUrl     string    // Index URL of the book, set by NewExtractor.
 	Meta        MetaInfo  // Book metadata, filled in by FetchAndProcessIndex.
 	ChapterUrls []string  // Chapter URLs, filled in by FetchAndProcessIndex.
 	W           io.Writer // Destination of the generated Markdown.
 }
 // NewExtractor creates an Extractor for the book at rawurl that writes its
 // output to w. It returns the error from getBaseUrl if rawurl is not a valid
 // book URL. Meta and ChapterUrls are left empty.
 func NewExtractor(rawurl string, w io.Writer) (*Extractor, error) {
 	base, err := getBaseUrl(rawurl)
 	if err != nil {
 		return nil, err
 	}
 	ex := new(Extractor)
 	ex.BaseUrl = base
 	ex.W = w
 	return ex, nil
 }
 // FetchAndProcessIndex downloads the index page at e.BaseUrl and stores the
 // book metadata in e.Meta and the chapter URLs in e.ChapterUrls.
 //
 // It returns an error if the request fails, the server answers with a
 // non-2xx status, the page cannot be parsed, or no chapters are found (see
 // getChapters). e.Meta is set even when the chapter lookup fails.
 func (e *Extractor) FetchAndProcessIndex() error {
 	// Get HTML document.
 	resp, err := http.Get(e.BaseUrl)
 	if err != nil {
 		return err
 	}
 	defer resp.Body.Close()
 	// http.Get does not treat error statuses as errors; without this check an
 	// error page would be scraped as if it were the index.
 	if resp.StatusCode < 200 || resp.StatusCode > 299 {
 		return fmt.Errorf("fetching index %q: unexpected HTTP status %s", e.BaseUrl, resp.Status)
 	}
 	// Parse HTML via Goquery.
 	doc, err := gq.NewDocumentFromReader(resp.Body)
 	if err != nil {
 		return err
 	}
 	// Get metadata.
 	e.Meta = getMetaInfo(doc)
 	// Get chapter URLs from index.
 	chapterUrls, err := getChapters(e.BaseUrl, doc)
 	if err != nil {
 		return err
 	}
 	e.ChapterUrls = chapterUrls
 	return nil
 }
 // parseAdditionalPage converts the text of the chapter page doc to Markdown
 // and writes it to e.W.
 //
 // Only the direct children of <body> that lie between the first and the
 // second marker element (<hr size="1" color="#808080">) are converted; the
 // "zurück"/"weiter" navigation links are skipped. Unsupported elements and
 // node types are dropped with a warning.
 //
 // It returns ErrParsingPage if content is found after more than two marker
 // elements, and any error reported while writing to e.W.
 func (e *Extractor) parseAdditionalPage(doc *gq.Document) error {
 	// Every document has two main <hr> elements with the given properties.
 	// They are a way to mark the contained text.
 	var passedHrs int
 	var err error
 	content := doc.Find("body").Children().FilterFunction(func(i int, s *gq.Selection) bool {
 		if s.Is("hr[size=\"1\"][color=\"#808080\"]") {
 			passedHrs++
 			return false
 		} else if s.Is("a") && (s.Text() == "<<\u00A0zurück" || s.Text() == "weiter\u00A0>>") {
 			// We don't want the "zurück"/"weiter"-buttons
 			return false
 		}
 		switch passedHrs {
 		case 0:
 			return false
 		case 1:
 			return true
 		case 2:
 			return false
 		default:
 			err = ErrParsingPage
 			return false
 		}
 	})
 	if err != nil {
 		return err
 	}
 	// Now that we've extracted the actual content, convert it into markdown.
 	var process func(*html.Node) string
 	process = func(n *html.Node) string {
 		// Converts all children of `n` and concatenates the results.
 		processChildren := func() string {
 			var sb strings.Builder
 			for c := n.FirstChild; c != nil; c = c.NextSibling {
 				sb.WriteString(process(c))
 			}
 			return sb.String()
 		}
 		// Checks if `n` has the given HTML class. Only the first class
 		// attribute is considered.
 		hasClass := func(class string) bool {
 			for _, attr := range n.Attr {
 				if attr.Key != "class" {
 					continue
 				}
 				// Class names may be separated by any whitespace.
 				for _, cl := range strings.Fields(attr.Val) {
 					if cl == class {
 						return true
 					}
 				}
 				return false
 			}
 			return false
 		}
 		var ret string
 		switch n.Type {
 		case html.TextNode:
 			// Line breaks in the HTML source carry no meaning; drop them and
 			// squeeze runs of spaces.
 			ret = collapseSpaces(strings.ReplaceAll(n.Data, "\n", ""))
 		case html.ElementNode:
 			// Transform the individual HTML elements.
 			switch n.DataAtom {
 			case atom.Br:
 				ret = "\n"
 			case atom.H1:
 				ret = "# " + processChildren() + "\n"
 			case atom.H2:
 				ret = "## " + processChildren() + "\n"
 			case atom.H3:
 				ret = "### " + processChildren() + "\n"
 			case atom.H4:
 				ret = "#### " + processChildren() + "\n"
 			case atom.H5:
 				ret = "##### " + processChildren() + "\n"
 			case atom.H6:
 				ret = "###### " + processChildren() + "\n"
 			case atom.P:
 				if hasClass("centerbig") {
 					ret = "#### " + processChildren() + "\n\n"
 				} else {
 					ret = processChildren() + "\n\n"
 				}
 			case atom.Div, atom.A, atom.Span:
 				ret = processChildren()
 			case atom.Tt:
 				ret = "`" + processChildren() + "`"
 			case atom.I:
 				ret = "_" + processChildren() + "_"
 			case atom.Img:
 				// Images are dropped silently.
 			default:
 				printWarn("Unknown data atom: %v", n.Data)
 			}
 			// Add some CSS effects.
 			if hasClass("spaced") {
 				ret = spaceOut(ret)
 			}
 		default:
 			printWarn("Unknown type: %v", n.Type)
 		}
 		return ret
 	}
 	for _, n := range content.Nodes {
 		if _, err := fmt.Fprint(e.W, process(n)); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 // collapseSpaces replaces every run of consecutive spaces in s with a single
 // space.
 func collapseSpaces(s string) string {
 	var sb strings.Builder
 	prevWasSpace := false
 	for _, c := range s {
 		isSpace := c == ' '
 		if isSpace && prevWasSpace {
 			continue
 		}
 		prevWasSpace = isSpace
 		sb.WriteRune(c)
 	}
 	return sb.String()
 }
 // spaceOut inserts a single space between all adjacent characters of s,
 // imitating letter-spaced text.
 func spaceOut(s string) string {
 	var sb strings.Builder
 	for i, r := range []rune(s) {
 		if i > 0 {
 			sb.WriteByte(' ')
 		}
 		sb.WriteRune(r)
 	}
 	return sb.String()
 }
 // FetchAndProcessChapter downloads the page at chapterUrl, converts it to
 // Markdown and writes the result to e.W. A horizontal rule is appended after
 // the page named "titlepage.html".
 //
 // It returns an error if the request fails, the server answers with a
 // non-2xx status, the page cannot be parsed or converted, or writing to e.W
 // fails.
 func (e *Extractor) FetchAndProcessChapter(chapterUrl string) error {
 	// Get HTML document.
 	resp, err := http.Get(chapterUrl)
 	if err != nil {
 		return err
 	}
 	defer resp.Body.Close()
 	// http.Get does not treat error statuses as errors; without this check an
 	// error page would be processed as if it were a chapter.
 	if resp.StatusCode < 200 || resp.StatusCode > 299 {
 		return fmt.Errorf("fetching chapter %q: unexpected HTTP status %s", chapterUrl, resp.Status)
 	}
 	// Parse HTML via Goquery (or really x/net/html).
 	doc, err := gq.NewDocumentFromReader(resp.Body)
 	if err != nil {
 		return err
 	}
 	// Parse page.
 	if err := e.parseAdditionalPage(doc); err != nil {
 		return err
 	}
 	// Add horizontal rule after title page.
 	if path.Base(chapterUrl) == "titlepage.html" {
 		if _, err := fmt.Fprintln(e.W, "----------------"); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 // main parses the command line, downloads the index and all chapters of the
 // requested book and saves the generated Markdown as
 // "<Author> -- <Title>, <Year>.md" in the output directory.
 //
 // Every failure is reported through printErr, which exits with status 1.
 func main() {
 	// Named bookURL rather than url so the net/url package is not shadowed.
 	var bookURL string
 	dir := "."
 	if len(os.Args) < 2 {
 		usage(os.Args[0], 1)
 	}
 	// Parse command line arguments.
 	for i := 1; i < len(os.Args); i++ {
 		// Returns the argument after the given option. Errors if there is no
 		// argument. Advances the loop index past the consumed argument.
 		expectArg := func(currArg string) string {
 			i++
 			if i >= len(os.Args) {
 				printErr("Expected argument after option '%v'", currArg)
 			}
 			return os.Args[i]
 		}
 		arg := os.Args[i]
 		if len(arg) >= 1 && arg[0] == '-' {
 			switch arg {
 			case "-dir":
 				dir = expectArg(arg)
 			case "--help", "-h":
 				usage(os.Args[0], 0)
 			default:
 				printErr("Unknown option: '%v'", arg)
 			}
 		} else if bookURL == "" {
 			bookURL = arg
 		} else {
 			printErr("Expected option, but got '%v'", arg)
 		}
 	}
 	if bookURL == "" {
 		printInfo("Please specify a book URL")
 		os.Exit(1)
 	}
 	printInfo("Book URL: %v", bookURL)
 	// Initial scraping.
 	var b bytes.Buffer
 	e, err := NewExtractor(bookURL, &b)
 	if err != nil {
 		printErr("Invalid book URL '%v': %v", bookURL, err)
 	}
 	if err = e.FetchAndProcessIndex(); err != nil {
 		printErr("Failed to process index: %v", err)
 	}
 	bookName := e.Meta.ToTitle()
 	printInfo("Book: %v", bookName)
 	// Download the actual chapters.
 	for i, chapter := range e.ChapterUrls {
 		fmt.Printf("\033[2K* Downloading chapter %v/%v...\r", i+1, len(e.ChapterUrls))
 		if err = e.FetchAndProcessChapter(chapter); err != nil {
 			// Finish the progress line so the error does not overwrite it.
 			fmt.Println()
 			printErr("Failed to process chapter '%v': %v", chapter, err)
 		}
 	}
 	// Erase the progress line; the cursor is already at its start.
 	fmt.Print("\033[2K")
 	// Write the generated markdown text to a file.
 	filename := path.Join(dir, bookName+".md")
 	if err = os.WriteFile(filename, b.Bytes(), 0666); err != nil {
 		printErr("Failed to write '%v': %v", filename, err)
 	}
 	printInfo("Saved as: %v", filename)
 }