parent ca11b81540
commit feafe19aa1
3 changed files with 413 additions and 0 deletions
go.mod
@@ -0,0 +1,5 @@
module projekt_gutenberg_de_dl

go 1.16

require github.com/PuerkitoBio/goquery v1.7.1
go.sum
@@ -0,0 +1,12 @@
github.com/PuerkitoBio/goquery v1.7.1 h1:oE+T06D+1T7LNrn91B4aERsRIeCLJ/oPSa6xB9FPnz4=
github.com/PuerkitoBio/goquery v1.7.1/go.mod h1:XY0pP4kfraEmmV1O7Uf6XyjoslwsneBbgeDjLYuN8xY=
github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE=
github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
main.go
@@ -0,0 +1,396 @@
package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path"
	"strings"

	gq "github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

var (
	ErrInvalidURL      = errors.New("invalid url")
	ErrNoChaptersFound = errors.New("no chapters found in index")
	ErrParsingPage     = errors.New("error parsing page")
)

const (
	colRed    = "\033[31m"
	colYellow = "\033[33m"
	colReset  = "\033[m"
)

func usage(arg0 string, exitStatus int) {
	fmt.Fprintln(os.Stderr, `Usage:
    `+arg0+` [options...] <BOOK_URL>

Book URL format:
    http[s]://[www.]projekt-gutenberg.org/<author>/<book>[/whateverdoesntmatter]

Options:
    -dir <DIRECTORY> -- Output directory (default: ".").

Output types:
    * <INFO>
    `+colYellow+`! <WARNING>`+colReset+`
    `+colRed+`! <ERROR>`+colReset)
	os.Exit(exitStatus)
}

func printInfo(f string, v ...interface{}) {
	fmt.Printf("* "+f+"\n", v...)
}

func printWarn(f string, v ...interface{}) {
	fmt.Fprintf(os.Stderr, colYellow+"! "+f+colReset+"\n", v...)
}

// printErr prints an error message and exits with status 1.
func printErr(f string, v ...interface{}) {
	fmt.Fprintf(os.Stderr, colRed+"! "+f+colReset+"\n", v...)
	os.Exit(1)
}

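// getBaseUrl validates a Projekt Gutenberg-DE book URL and normalizes it to
// "<scheme>://projekt-gutenberg.org/<author>/<book>", the index page that
// all chapter links are resolved against.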
func getBaseUrl(rawurl string) (string, error) {
	url, err := url.Parse(rawurl)
	if err != nil {
		return "", err
	}
	if !(url.Scheme == "http" || url.Scheme == "https") {
		return "", ErrInvalidURL
	}
	if !(url.Host == "projekt-gutenberg.org" || url.Host == "www.projekt-gutenberg.org") {
		return "", ErrInvalidURL
	}
	spPath := strings.Split(strings.Trim(url.Path, "/"), "/")
	if len(spPath) < 2 {
		return "", ErrInvalidURL
	}
	basePath := strings.Join(spPath[:2], "/")
	return url.Scheme + "://projekt-gutenberg.org/" + basePath, nil
}

// getChapters returns the absolute URLs of the chapters listed in the
// book's index page.
func getChapters(baseUrl string, doc *gq.Document) ([]string, error) {
	chapterUrls := make([]string, 0, 8)
	doc.Find("body ul li").Each(func(i int, s *gq.Selection) {
		// The website has a strange bug where the 'a' element is separate
		// from its text element. That's why we have to search the entire
		// 'li' element for an 'a' element with a link.
		s = s.Find("a[href]")
		if len(s.Nodes) == 0 {
			// This should really never happen, which is why we panic here.
			panic("missing link in chapter index")
		}
		relUrl, _ := s.Attr("href") // We now know it has an href attribute.
		chapterUrls = append(chapterUrls, baseUrl+"/"+relUrl)
	})
	if len(chapterUrls) == 0 {
		return nil, ErrNoChaptersFound
	}
	return chapterUrls, nil
}

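// MetaInfo holds a book's metadata as found on its index page.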
type MetaInfo struct {
	Author string
	Title  string
	Year   string
}

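// getMetaInfo reads author, title, and year of first publication from the
// document's <meta> tags; missing values default to "Unknown".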
func getMetaInfo(doc *gq.Document) MetaInfo {
	metas := doc.Find("head meta")
	return MetaInfo{
		Author: metas.Filter("[name=\"author\"]").AttrOr("content", "Unknown"),
		Title:  metas.Filter("[name=\"title\"]").AttrOr("content", "Unknown"),
		Year:   metas.Filter("[name=\"firstpub\"]").AttrOr("content", "Unknown"),
	}
}

func (m MetaInfo) ToTitle() string {
	return fmt.Sprintf("%s -- %s, %s", m.Author, m.Title, m.Year)
}

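// Extractor scrapes a single book and writes its Markdown rendition to W.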
type Extractor struct {
	BaseUrl     string
	Meta        MetaInfo
	ChapterUrls []string
	W           io.Writer
}

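// NewExtractor validates rawurl and returns an Extractor writing its output
// to w. Call FetchAndProcessIndex before fetching any chapters.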
func NewExtractor(rawurl string, w io.Writer) (*Extractor, error) {
	baseUrl, err := getBaseUrl(rawurl)
	if err != nil {
		return nil, err
	}
	return &Extractor{
		BaseUrl: baseUrl,
		W:       w,
	}, nil
}

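// FetchAndProcessIndex downloads the book's index page, extracts its
// metadata, and collects the chapter URLs.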
func (e *Extractor) FetchAndProcessIndex() error {
	// Get the HTML document.
	resp, err := http.Get(e.BaseUrl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	// Parse the HTML via goquery.
	doc, err := gq.NewDocumentFromReader(resp.Body)
	if err != nil {
		return err
	}
	// Get the metadata.
	e.Meta = getMetaInfo(doc)
	// Get the chapter URLs from the index.
	chapterUrls, err := getChapters(e.BaseUrl, doc)
	if err != nil {
		return err
	}
	e.ChapterUrls = chapterUrls
	return nil
}

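// parseAdditionalPage extracts the chapter text between the two marker <hr>
// elements of a chapter page, converts it to Markdown, and writes the result
// to e.W.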
func (e *Extractor) parseAdditionalPage(doc *gq.Document) error {
	// Every chapter page has two main <hr> elements with the given
	// properties. They mark the start and end of the contained text.
	var passedHrs int
	var err error
	content := doc.Find("body").Children().FilterFunction(func(i int, s *gq.Selection) bool {
		if s.Is("hr[size=\"1\"][color=\"#808080\"]") {
			passedHrs++
			return false
		} else if s.Is("a") && (s.Text() == "<<\u00A0zurück" || s.Text() == "weiter\u00A0>>") {
			// We don't want the "zurück"/"weiter" (back/forward) buttons.
			return false
		}
		switch passedHrs {
		case 0:
			return false
		case 1:
			return true
		case 2:
			return false
		default:
			err = ErrParsingPage
			return false
		}
	})
	if err != nil {
		return err
	}

	// Now that we've extracted the actual content, convert it into Markdown.
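	// process recursively converts an HTML node and all of its children
	// into Markdown text.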
	var process func(*html.Node) string
	process = func(n *html.Node) string {
		processChildren := func() string {
			var ret string
			for i := n.FirstChild; i != nil; i = i.NextSibling {
				ret += process(i)
			}
			return ret
		}

		// Reports whether `n` has the given HTML class.
		hasClass := func(class string) bool {
			for _, v := range n.Attr {
				if v.Key == "class" {
					classes := strings.Split(v.Val, " ")
					for _, cl := range classes {
						if cl == class {
							return true
						}
					}
					return false
				}
			}
			return false
		}

		var ret string
		switch n.Type {
		case html.TextNode:
			// If we have a text node, return the actual text after some
			// post-processing.
			ret = strings.ReplaceAll(n.Data, "\n", "")
			var newRet string
			// Collapse every run of consecutive spaces into a single space.
			var prevWasSpace bool
			for _, c := range ret {
				if c == ' ' {
					if prevWasSpace {
						continue
					}
					prevWasSpace = true
				} else {
					prevWasSpace = false
				}
				newRet += string(c)
			}
			ret = newRet
		case html.ElementNode:
			// Transform the individual HTML elements.
			switch n.DataAtom {
			case atom.Br:
				ret = "\n"
			case atom.H1:
				ret = "# " + processChildren() + "\n"
			case atom.H2:
				ret = "## " + processChildren() + "\n"
			case atom.H3:
				ret = "### " + processChildren() + "\n"
			case atom.H4:
				ret = "#### " + processChildren() + "\n"
			case atom.H5:
				ret = "##### " + processChildren() + "\n"
			case atom.H6:
				ret = "###### " + processChildren() + "\n"
			case atom.P:
				if hasClass("centerbig") {
					ret = "#### " + processChildren() + "\n\n"
				} else {
					ret = /*" " + */ processChildren() + "\n\n"
				}
			case atom.Div:
				ret = processChildren()
			case atom.Tt:
				ret = "`" + processChildren() + "`"
			case atom.I:
				ret = "_" + processChildren() + "_"
			case atom.A:
				ret = processChildren()
			case atom.Span:
				ret = processChildren()
			case atom.Img:
				// Images are dropped.
			default:
				printWarn("Unknown data atom: %v", n.Data)
			}
			// Add some CSS effects.
			if hasClass("spaced") {
				// Emulate letter-spacing by putting a space between runes.
				var newRet string
				runes := []rune(ret)
				nRunes := len(runes)
				for i := 0; i < nRunes; i++ {
					newRet += string(runes[i])
					if i < nRunes-1 {
						newRet += " "
					}
				}
				ret = newRet
			}
		default:
			printWarn("Unknown type: %v", n.Type)
		}
		return ret
	}
	for _, n := range content.Nodes {
		fmt.Fprint(e.W, process(n))
	}
	return nil
}

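// FetchAndProcessChapter downloads a single chapter page and appends its
// Markdown rendition to e.W.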
func (e *Extractor) FetchAndProcessChapter(chapterUrl string) error {
	// Get the HTML document.
	resp, err := http.Get(chapterUrl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	// Parse the HTML via goquery (or really x/net/html).
	doc, err := gq.NewDocumentFromReader(resp.Body)
	if err != nil {
		return err
	}
	// Parse the page.
	err = e.parseAdditionalPage(doc)
	if err != nil {
		return err
	}
	// Add a horizontal rule after the title page.
	if path.Base(chapterUrl) == "titlepage.html" {
		fmt.Fprintln(e.W, "----------------")
	}
	return nil
}

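// main parses the command line, scrapes the given book, and writes the
// result to a single Markdown file.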
func main() {
	var url string
	dir := "."

	if len(os.Args) < 2 {
		usage(os.Args[0], 1)
	}

	// Parse command line arguments.
	for i := 1; i < len(os.Args); i++ {
		// Returns the argument after the given option; errors out if there
		// is no such argument.
		expectArg := func(currArg string) string {
			i++
			if i >= len(os.Args) {
				printErr("Expected argument after option '%v'", currArg)
			}
			return os.Args[i]
		}

		arg := os.Args[i]
		if len(arg) >= 1 && arg[0] == '-' {
			switch arg {
			case "-dir":
				dir = expectArg(arg)
			case "--help", "-h":
				usage(os.Args[0], 0)
			default:
				printErr("Unknown option: '%v'", arg)
			}
		} else {
			if url == "" {
				url = arg
			} else {
				printErr("Expected option, but got '%v'", arg)
			}
		}
	}
	if url == "" {
		printErr("Please specify a book URL")
	}
	printInfo("Book URL: %v", url)

	// Initial scraping.
	var b bytes.Buffer
	e, err := NewExtractor(url, &b)
	if err != nil {
		panic(err)
	}
	err = e.FetchAndProcessIndex()
	if err != nil {
		panic(err)
	}
	bookName := e.Meta.ToTitle()
	printInfo("Book: %v", bookName)

	// Download the actual chapters.
	for i, chapter := range e.ChapterUrls {
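		// "\033[2K" erases the current terminal line and "\r" returns the
		// cursor to its start, so each progress update overwrites the last.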
		fmt.Printf("\033[2K* Downloading chapter %v/%v...\r", i+1, len(e.ChapterUrls))
		err = e.FetchAndProcessChapter(chapter)
		if err != nil {
			panic(err)
		}
	}

	// Write the generated Markdown text to a file.
	filename := path.Join(dir, bookName+".md")
	err = os.WriteFile(filename, b.Bytes(), 0666)
	if err != nil {
		printErr("Could not write '%v': %v", filename, err)
	}
	printInfo("Saved as: %v", filename)
}