package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path"
	"strings"

	gq "github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

var (
	ErrInvalidURL      = errors.New("invalid url")
	ErrNoChaptersFound = errors.New("no chapters found in index")
	ErrParsingPage     = errors.New("error parsing page")
	ErrBookNotFound    = errors.New("book not found")
)

const (
	colRed    = "\033[31;1m"
	colYellow = "\033[33;1m"
	colReset  = "\033[m"
)

func usage(arg0 string, exitStatus int) {
	fmt.Fprintln(os.Stderr, `Usage: `+arg0+` [options...] <url>

Book URL format:
    http[s]://[www.]projekt-gutenberg.org/<author>/<book>[/whateverdoesntmatter]

Options:
    -dir <dir> -- Output directory (default: ".").

Output types:
    * <info>
    `+colYellow+`! <warning>`+colReset+`
    `+colRed+`! <error>`+colReset)
	os.Exit(exitStatus)
}

func clearLine() {
	fmt.Print("\033[2K")
}

func printInfo(f string, v ...interface{}) {
	fmt.Printf("* "+f+"\n", v...)
}

func printWarn(f string, v ...interface{}) {
	fmt.Fprintf(os.Stderr, colYellow+"! "+f+colReset+"\n", v...)
}

// printErr prints the message to stderr and exits with status 1.
func printErr(f string, v ...interface{}) {
	fmt.Fprintf(os.Stderr, colRed+"! "+f+colReset+"\n", v...)
	os.Exit(1)
}

// getBaseUrl validates rawurl and normalizes it to the URL of the book's
// index page, i.e. <scheme>://projekt-gutenberg.org/<author>/<book>.
func getBaseUrl(rawurl string) (string, error) {
	u, err := url.Parse(rawurl)
	if err != nil {
		return "", err
	}
	if u.Scheme != "http" && u.Scheme != "https" {
		return "", ErrInvalidURL
	}
	if u.Host != "projekt-gutenberg.org" && u.Host != "www.projekt-gutenberg.org" {
		return "", ErrInvalidURL
	}
	spPath := strings.Split(strings.Trim(u.Path, "/"), "/")
	if len(spPath) < 2 {
		return "", ErrInvalidURL
	}
	basePath := strings.Join(spPath[:2], "/")
	return u.Scheme + "://projekt-gutenberg.org/" + basePath, nil
}
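// For example (the book path here is made up for illustration):
//
//	getBaseUrl("https://www.projekt-gutenberg.org/kafka/prozess/chap001.html")
//	// => "https://projekt-gutenberg.org/kafka/prozess", nil
//
// Note that the "www." prefix is dropped and everything after the first two
// path components is cut off.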
// getChapters returns the links to the chapters listed in the book index.
func getChapters(baseUrl string, doc *gq.Document) ([]string, error) {
	chapterUrls := make([]string, 0, 8)
	doc.Find("body ul li").Each(func(i int, s *gq.Selection) {
		// The website has a strange bug where the 'a' element is separate
		// from the text element. That's why we have to search the entire
		// 'li' element for an 'a' element with a link.
		s = s.Find("a[href]")
		if len(s.Nodes) == 0 {
			// This should really never happen, that's why we're using panic.
			panic("missing link in chapter index")
		}
		// We now know it must have the href attribute.
		relUrl, _ := s.Attr("href")
		chapterUrls = append(chapterUrls, baseUrl+"/"+relUrl)
	})
	if len(chapterUrls) == 0 {
		return nil, ErrNoChaptersFound
	}
	return chapterUrls, nil
}

type MetaInfo struct {
	Author string
	Title  string
	Year   string
}

// getMetaInfo reads author, title, and year of first publication from the
// <meta> tags of the index page, falling back to "Unknown" for missing ones.
func getMetaInfo(doc *gq.Document) MetaInfo {
	metas := doc.Find("head meta")
	return MetaInfo{
		Author: metas.Filter(`[name="author"]`).AttrOr("content", "Unknown"),
		Title:  metas.Filter(`[name="title"]`).AttrOr("content", "Unknown"),
		Year:   metas.Filter(`[name="firstpub"]`).AttrOr("content", "Unknown"),
	}
}

func (m MetaInfo) ToTitle() string {
	return fmt.Sprintf("%s -- %s, %s", m.Author, m.Title, m.Year)
}

type Extractor struct {
	BaseUrl     string
	Meta        MetaInfo
	ChapterUrls []string
	W           io.Writer
}

func NewExtractor(rawurl string, w io.Writer) (*Extractor, error) {
	baseUrl, err := getBaseUrl(rawurl)
	if err != nil {
		return nil, err
	}
	return &Extractor{
		BaseUrl: baseUrl,
		W:       w,
	}, nil
}

func (e *Extractor) FetchAndProcessIndex() error {
	// Get the HTML document.
	resp, err := http.Get(e.BaseUrl)
	if err != nil {
		return err
	}
	// Close the body even on the 404 path below.
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNotFound {
		return ErrBookNotFound
	}

	// Parse the HTML via goquery.
	doc, err := gq.NewDocumentFromReader(resp.Body)
	if err != nil {
		return err
	}

	// Get the metadata.
	e.Meta = getMetaInfo(doc)

	// Get the chapter URLs from the index.
	chapterUrls, err := getChapters(e.BaseUrl, doc)
	if err != nil {
		return err
	}
	e.ChapterUrls = chapterUrls
	return nil
}
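// The chapter pages are expected to follow this skeleton, sketched here from
// the selectors used in parseAdditionalPage below (illustrative, not copied
// from the site):
//
//	<body>
//	  ...navigation...
//	  <hr size="1" color="#808080">
//	  ...chapter content (this is what we keep)...
//	  <hr size="1" color="#808080">
//	  ...footer...
//	</body>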
// Every document has two main <hr> elements with the given properties.
// They are a way to mark the contained text.
func (e *Extractor) parseAdditionalPage(doc *gq.Document) error {
	var passedHrs int
	var err error
	content := doc.Find("body").Children().FilterFunction(func(i int, s *gq.Selection) bool {
		if s.Is(`hr[size="1"][color="#808080"]`) {
			passedHrs++
			return false
		} else if s.Is("a") && (s.Text() == "<<\u00A0zurück" || s.Text() == "weiter\u00A0>>") {
			// We don't want the "zurück"/"weiter" navigation buttons.
			return false
		}
		switch passedHrs {
		case 0:
			return false
		case 1:
			return true
		case 2:
			return false
		default:
			err = ErrParsingPage
			return false
		}
	})
	if err != nil {
		return err
	}

	// Now that we've extracted the actual content, convert it into Markdown.
	var process func(*html.Node) string
	process = func(n *html.Node) string {
		processChildren := func() string {
			var ret string
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				ret += process(c)
			}
			return ret
		}
		// hasClass reports whether `n` has the given HTML class.
		hasClass := func(class string) bool {
			for _, v := range n.Attr {
				if v.Key == "class" {
					for _, cl := range strings.Split(v.Val, " ") {
						if cl == class {
							return true
						}
					}
					return false
				}
			}
			return false
		}

		var ret string
		switch n.Type {
		case html.TextNode:
			// If we have a text node, return the actual text after some
			// post-processing.
			ret = strings.ReplaceAll(n.Data, "\n", "")
			// Collapse every run of more than one space into a single space.
			var sb strings.Builder
			var prevWasSpace bool
			for _, c := range ret {
				if c == ' ' {
					if prevWasSpace {
						continue
					}
					prevWasSpace = true
				} else {
					prevWasSpace = false
				}
				sb.WriteRune(c)
			}
			ret = sb.String()
		case html.ElementNode:
			// Transform the individual HTML elements.
			switch n.DataAtom {
			case atom.Br:
				ret = "\n\n"
			case atom.H1:
				ret = "# " + processChildren() + "\n"
			case atom.H2:
				ret = "## " + processChildren() + "\n"
			case atom.H3:
				ret = "### " + processChildren() + "\n"
			case atom.H4:
				ret = "#### " + processChildren() + "\n"
			case atom.H5:
				ret = "##### " + processChildren() + "\n"
			case atom.H6:
				ret = "###### " + processChildren() + "\n"
			case atom.P:
				if hasClass("centerbig") {
					ret = "#### " + processChildren() + "\n\n"
				} else {
					ret = processChildren() + "\n\n"
				}
			case atom.Tt:
				ret = "`" + processChildren() + "`"
			case atom.I:
				ret = "_" + processChildren() + "_"
			case atom.Div, atom.A, atom.Span:
				ret = processChildren()
			case atom.Img:
				// Images are dropped.
			default:
				clearLine()
				printWarn("Unknown data atom: %v", n.Data)
			}
			// Emulate some CSS effects.
			if hasClass("spaced") {
				// S p a c e d text: insert a space between consecutive runes.
				runes := []rune(ret)
				var sb strings.Builder
				for i, r := range runes {
					sb.WriteRune(r)
					if i < len(runes)-1 {
						sb.WriteRune(' ')
					}
				}
				ret = sb.String()
			}
		default:
			clearLine()
			printWarn("Unknown type: %v", n.Type)
		}
		return ret
	}

	for _, n := range content.Nodes {
		fmt.Fprint(e.W, process(n))
	}
	return nil
}
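// As a worked example (input invented for illustration), a paragraph like
//
//	<p class="centerbig">Erstes <i>Kapitel</i></p>
//
// becomes
//
//	#### Erstes _Kapitel_
//
// followed by a blank line, while a plain <p> becomes an ordinary Markdown
// paragraph.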
func (e *Extractor) FetchAndProcessChapter(chapterUrl string) error {
	// Get the HTML document.
	resp, err := http.Get(chapterUrl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Parse the HTML via goquery (or really x/net/html).
	doc, err := gq.NewDocumentFromReader(resp.Body)
	if err != nil {
		return err
	}

	// Parse the page.
	if err := e.parseAdditionalPage(doc); err != nil {
		return err
	}

	// Add a horizontal rule after the title page.
	if path.Base(chapterUrl) == "titlepage.html" {
		fmt.Fprintln(e.W, "\n----------------\n")
	}
	return nil
}

func main() {
	var url string
	dir := "."

	if len(os.Args) < 2 {
		usage(os.Args[0], 1)
	}

	// Parse the command line arguments.
	for i := 1; i < len(os.Args); i++ {
		// expectArg returns the argument after the given option and errors
		// out if there is none.
		expectArg := func(currArg string) string {
			i++
			if i >= len(os.Args) {
				printErr("Expected argument after option '%v'", currArg)
			}
			return os.Args[i]
		}

		arg := os.Args[i]
		if len(arg) >= 1 && arg[0] == '-' {
			switch arg {
			case "-dir":
				dir = expectArg(arg)
			case "--help", "-h":
				usage(os.Args[0], 0)
			default:
				printErr("Unknown option: '%v'", arg)
			}
		} else if url == "" {
			url = arg
		} else {
			printErr("Expected option, but got '%v'", arg)
		}
	}
	if url == "" {
		printErr("Please specify a book URL")
	}
	printInfo("Book URL: %v", url)

	// Initial scraping.
	var b bytes.Buffer
	e, err := NewExtractor(url, &b)
	if err != nil {
		printErr("Error: %v", err)
	}
	err = e.FetchAndProcessIndex()
	if err != nil {
		printErr("Error: %v", err)
	}
	bookName := e.Meta.ToTitle()
	printInfo("Book: %v", bookName)

	// Download the actual chapters.
	for i, chapter := range e.ChapterUrls {
		clearLine()
		fmt.Printf("* Downloading chapter %v/%v...\r", i+1, len(e.ChapterUrls))
		err = e.FetchAndProcessChapter(chapter)
		if err != nil {
			printErr("Error: %v", err)
		}
	}

	// Write the generated Markdown text to a file.
	filename := path.Join(dir, bookName+".md")
	if err := os.WriteFile(filename, b.Bytes(), 0666); err != nil {
		printErr("Error: %v", err)
	}
	clearLine()
	printInfo("Saved as: %v", filename)
}
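// An example session (URL, metadata, and output below are illustrative, not
// verified against the live site):
//
//	$ go run . -dir books https://www.projekt-gutenberg.org/kafka/prozess/
//	* Book URL: https://www.projekt-gutenberg.org/kafka/prozess/
//	* Book: Franz Kafka -- Der Prozess, 1925
//	* Saved as: books/Franz Kafka -- Der Prozess, 1925.md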