diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..c1d82d8
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module projekt_gutenberg_de_dl
+
+go 1.16
+
+require (
+	github.com/PuerkitoBio/goquery v1.7.1
+	golang.org/x/net v0.0.0-20210614182718-04defd469f4e
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..986201f
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,12 @@
+github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4=
+github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY=
+github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
+github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
+golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q=
+golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..d30bef9
--- /dev/null
+++ b/main.go
@@ -0,0 +1,396 @@
+package main
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+
+	gq "github.com/PuerkitoBio/goquery"
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+var (
+	ErrInvalidURL      = errors.New("invalid url")
+	ErrNoChaptersFound = errors.New("no chapters found in index")
+	ErrParsingPage     = errors.New("error parsing page")
+)
+
+const (
+	colRed    = "\033[31m"
+	colYellow = "\033[33m"
+	colReset  = "\033[m"
+)
+
+func usage(arg0 string, exitStatus int) {
+	fmt.Fprintln(os.Stderr, `Usage:
+  `+arg0+` [options...] <book-url>
+
+Book URL format:
+  http[s]://[www.]projekt-gutenberg.org/<author>/<work>[/whateverdoesntmatter]
+
+Options:
+  -dir <dir>  -- Output directory (default: ".").
+
+Output types:
+  * <information>
+  `+colYellow+`! <warning>`+colReset+`
+  `+colRed+`! <error>`+colReset)
+	os.Exit(exitStatus)
+}
+
+func printInfo(f string, v ...interface{}) {
+	fmt.Printf("* "+f+"\n", v...)
+}
+
+func printWarn(f string, v ...interface{}) {
+	fmt.Fprintf(os.Stderr, colYellow+"! "+f+colReset+"\n", v...)
+}
+
+// printErr reports a fatal error and exits with status 1.
+func printErr(f string, v ...interface{}) {
+	fmt.Fprintf(os.Stderr, colRed+"! "+f+colReset+"\n", v...)
+	os.Exit(1)
+}
+
+// getBaseUrl validates rawurl and reduces it to the book's index URL, i.e.
+// scheme://projekt-gutenberg.org/<author>/<work>.
+func getBaseUrl(rawurl string) (string, error) {
+	u, err := url.Parse(rawurl)
+	if err != nil {
+		return "", err
+	}
+	if u.Scheme != "http" && u.Scheme != "https" {
+		return "", ErrInvalidURL
+	}
+	if u.Host != "projekt-gutenberg.org" && u.Host != "www.projekt-gutenberg.org" {
+		return "", ErrInvalidURL
+	}
+	// The first two path segments identify the book; anything after them is
+	// ignored.
+	spPath := strings.Split(strings.Trim(u.Path, "/"), "/")
+	if len(spPath) < 2 {
+		return "", ErrInvalidURL
+	}
+	basePath := strings.Join(spPath[:2], "/")
+	return u.Scheme + "://projekt-gutenberg.org/" + basePath, nil
+}
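+
+// A quick illustration of what getBaseUrl does; the author/work path below
+// is made up:
+//
+//	https://www.projekt-gutenberg.org/goethe/faust1/chap001.html
+//	    -> https://projekt-gutenberg.org/goethe/faust1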
+
+// getChapters returns a slice containing the links to the chapters.
+func getChapters(baseUrl string, doc *gq.Document) ([]string, error) {
+	chapterUrls := make([]string, 0, 8)
+	doc.Find("body ul li").Each(func(i int, s *gq.Selection) {
+		// The website has a quirk where the 'a' element is separate from the
+		// text element, so we have to search the entire 'li' element for an
+		// 'a' element with a link.
+		s = s.Find("a[href]")
+		if len(s.Nodes) == 0 {
+			// This should never happen, which is why we panic.
+			panic("missing link in chapter index")
+		}
+		relUrl, _ := s.Attr("href") // We now know it has the href attribute.
+		chapterUrls = append(chapterUrls, baseUrl+"/"+relUrl)
+	})
+	if len(chapterUrls) == 0 {
+		return nil, ErrNoChaptersFound
+	}
+	return chapterUrls, nil
+}
+
+type MetaInfo struct {
+	Author string
+	Title  string
+	Year   string
+}
+
+func getMetaInfo(doc *gq.Document) MetaInfo {
+	metas := doc.Find("head meta")
+	return MetaInfo{
+		Author: metas.Filter(`[name="author"]`).AttrOr("content", "Unknown"),
+		Title:  metas.Filter(`[name="title"]`).AttrOr("content", "Unknown"),
+		Year:   metas.Filter(`[name="firstpub"]`).AttrOr("content", "Unknown"),
+	}
+}
+
+func (m MetaInfo) ToTitle() string {
+	return fmt.Sprintf("%s -- %s, %s", m.Author, m.Title, m.Year)
+}
+
+type Extractor struct {
+	BaseUrl     string
+	Meta        MetaInfo
+	ChapterUrls []string
+	W           io.Writer
+}
+
+func NewExtractor(rawurl string, w io.Writer) (*Extractor, error) {
+	baseUrl, err := getBaseUrl(rawurl)
+	if err != nil {
+		return nil, err
+	}
+	return &Extractor{
+		BaseUrl: baseUrl,
+		W:       w,
+	}, nil
+}
+
+func (e *Extractor) FetchAndProcessIndex() error {
+	// Get the HTML document.
+	resp, err := http.Get(e.BaseUrl)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("fetching %s: %s", e.BaseUrl, resp.Status)
+	}
+	// Parse the HTML via goquery.
+	doc, err := gq.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		return err
+	}
+	// Get the metadata.
+	e.Meta = getMetaInfo(doc)
+	// Get the chapter URLs from the index.
+	chapterUrls, err := getChapters(e.BaseUrl, doc)
+	if err != nil {
+		return err
+	}
+	e.ChapterUrls = chapterUrls
+	return nil
+}
+
+func (e *Extractor) parseAdditionalPage(doc *gq.Document) error {
+	// Every document has two main <hr> elements with the given properties.
+	// They delimit the actual text content.
+	var passedHrs int
+	var err error
+	content := doc.Find("body").Children().FilterFunction(func(i int, s *gq.Selection) bool {
+		if s.Is(`hr[size="1"][color="#808080"]`) {
+			passedHrs++
+			return false
+		} else if s.Is("a") && (s.Text() == "<<\u00A0zurück" || s.Text() == "weiter\u00A0>>") {
+			// Skip the "zurück"/"weiter" (back/forward) navigation links.
+			return false
+		}
+		switch passedHrs {
+		case 0:
+			return false
+		case 1:
+			return true
+		case 2:
+			return false
+		default:
+			err = ErrParsingPage
+			return false
+		}
+	})
+	if err != nil {
+		return err
+	}
+
+	// Now that we've extracted the actual content, convert it into Markdown.
+	var process func(*html.Node) string
+	process = func(n *html.Node) string {
+		processChildren := func() string {
+			var ret string
+			for i := n.FirstChild; i != nil; i = i.NextSibling {
+				ret += process(i)
+			}
+			return ret
+		}
+
+		// hasClass reports whether `n` has the given HTML class.
+		hasClass := func(class string) bool {
+			for _, v := range n.Attr {
+				if v.Key == "class" {
+					for _, cl := range strings.Fields(v.Val) {
+						if cl == class {
+							return true
+						}
+					}
+					return false
+				}
+			}
+			return false
+		}
+
+		var ret string
+		switch n.Type {
+		case html.TextNode:
+			// For a text node, return the actual text after some
+			// post-processing: strip newlines and collapse runs of spaces
+			// into a single space.
+			ret = strings.ReplaceAll(n.Data, "\n", "")
+			var newRet string
+			var prevWasSpace bool
+			for _, c := range ret {
+				if c == ' ' {
+					if prevWasSpace {
+						continue
+					}
+					prevWasSpace = true
+				} else {
+					prevWasSpace = false
+				}
+				newRet += string(c)
+			}
+			ret = newRet
+		case html.ElementNode:
+			// Transform the individual HTML elements.
+			switch n.DataAtom {
+			case atom.Br:
+				ret = "\n"
+			case atom.H1:
+				ret = "# " + processChildren() + "\n"
+			case atom.H2:
+				ret = "## " + processChildren() + "\n"
+			case atom.H3:
+				ret = "### " + processChildren() + "\n"
+			case atom.H4:
+				ret = "#### " + processChildren() + "\n"
+			case atom.H5:
+				ret = "##### " + processChildren() + "\n"
+			case atom.H6:
+				ret = "###### " + processChildren() + "\n"
+			case atom.P:
+				if hasClass("centerbig") {
+					ret = "#### " + processChildren() + "\n\n"
+				} else {
+					ret = processChildren() + "\n\n"
+				}
+			case atom.Div:
+				ret = processChildren()
+			case atom.Tt:
+				ret = "`" + processChildren() + "`"
+			case atom.I:
+				ret = "_" + processChildren() + "_"
+			case atom.A:
+				ret = processChildren()
+			case atom.Span:
+				ret = processChildren()
+			case atom.Img:
+				// Images are dropped.
+			default:
+				printWarn("Unknown data atom: %v", n.Data)
+			}
+			// Add some CSS effects.
+			if hasClass("spaced") {
+				// Emulate letter-spacing by inserting a space between runes.
+				var newRet string
+				runes := []rune(ret)
+				nRunes := len(runes)
+				for i := 0; i < nRunes; i++ {
+					newRet += string(runes[i])
+					if i < nRunes-1 {
+						newRet += " "
+					}
+				}
+				ret = newRet
+			}
+		default:
+			printWarn("Unknown node type: %v", n.Type)
+		}
+		return ret
+	}
+	for _, n := range content.Nodes {
+		fmt.Fprint(e.W, process(n))
+	}
+	return nil
+}
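+
+// Rough sketch of the element-to-Markdown mapping above, on assumed input
+// shapes (not verified against the live site):
+//
+//	<p class="centerbig">Erster Akt</p>  ->  "#### Erster Akt\n\n"
+//	<i>kursiv</i>                        ->  "_kursiv_"
+//	<span class="spaced">Name</span>     ->  "N a m e"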
+
+func (e *Extractor) FetchAndProcessChapter(chapterUrl string) error {
+	// Get the HTML document.
+	resp, err := http.Get(chapterUrl)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("fetching %s: %s", chapterUrl, resp.Status)
+	}
+	// Parse the HTML via goquery (really x/net/html under the hood).
+	doc, err := gq.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		return err
+	}
+	// Parse the page content and append it as Markdown to e.W.
+	err = e.parseAdditionalPage(doc)
+	if err != nil {
+		return err
+	}
+	// Add a horizontal rule after the title page.
+	if path.Base(chapterUrl) == "titlepage.html" {
+		fmt.Fprintln(e.W, "----------------")
+	}
+	return nil
+}
+
+func main() {
+	var bookUrl string
+	dir := "."
+
+	if len(os.Args) < 2 {
+		usage(os.Args[0], 1)
+	}
+
+	// Parse the command line arguments.
+	for i := 1; i < len(os.Args); i++ {
+		// expectArg returns the argument after the given option and errors
+		// out if there is none.
+		expectArg := func(currArg string) string {
+			i++
+			if i >= len(os.Args) {
+				printErr("Expected argument after option '%v'", currArg)
+			}
+			return os.Args[i]
+		}
+
+		arg := os.Args[i]
+		if len(arg) >= 1 && arg[0] == '-' {
+			switch arg {
+			case "-dir":
+				dir = expectArg(arg)
+			case "--help", "-h":
+				usage(os.Args[0], 0)
+			default:
+				printErr("Unknown option: '%v'", arg)
+			}
+		} else {
+			if bookUrl == "" {
+				bookUrl = arg
+			} else {
+				printErr("Expected option, but got '%v'", arg)
+			}
+		}
+	}
+	if bookUrl == "" {
+		printErr("Please specify a book URL")
+	}
+	printInfo("Book URL: %v", bookUrl)
+
+	// Initial scraping.
+	var b bytes.Buffer
+	e, err := NewExtractor(bookUrl, &b)
+	if err != nil {
+		printErr("%v", err)
+	}
+	err = e.FetchAndProcessIndex()
+	if err != nil {
+		printErr("%v", err)
+	}
+	bookName := e.Meta.ToTitle()
+	printInfo("Book: %v", bookName)
+
+	// Download the actual chapters.
+	for i, chapter := range e.ChapterUrls {
+		fmt.Printf("\033[2K* Downloading chapter %v/%v...\r", i+1, len(e.ChapterUrls))
+		err = e.FetchAndProcessChapter(chapter)
+		if err != nil {
+			printErr("%v", err)
+		}
+	}
+	fmt.Println() // Move past the progress line.
+
+	// Write the generated Markdown text to a file.
+	filename := filepath.Join(dir, bookName+".md")
+	if err := os.WriteFile(filename, b.Bytes(), 0666); err != nil {
+		printErr("Failed to write file: %v", err)
+	}
+	printInfo("Saved as: %v", filename)
+}
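+
+// Example invocation (binary name, directory, and book path are illustrative):
+//
+//	go build -o gutenberg-dl .
+//	./gutenberg-dl -dir books "https://www.projekt-gutenberg.org/<author>/<work>/"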