bookscrape/se.go

136 lines
3.5 KiB
Go

package main
import (
"bytes"
"encoding/json"
"fmt"
"net/url"
"os"
"path"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
const (
	// se is the format string for one page of the Standard Ebooks catalogue
	// listing; the single %d verb is the 1-based page number.
	se string = "https://standardebooks.org/ebooks?page=%d"

	// seRoot is the site root that relative hrefs scraped from the listing and
	// book pages are joined onto. It must be the same host as se: the hrefs
	// are relative to the site the listing was fetched from.
	seRoot string = "https://standardebooks.org"
)
// buildSEBookList walks the paginated Standard Ebooks catalogue and returns
// the absolute URL of every book link found on it.
//
// Pages are requested in order starting from 1. The walk stops when getUrl
// reports true as its second return value, or when the page's pagination nav
// has an aria-disabled link as its last child. A page body that goquery cannot
// parse causes a panic. Progress is printed to stdout.
func buildSEBookList() []string {
	fmt.Println(" Gathering Standard Ebooks book list")

	books := make([]string, 0, 50)
	page := 1
	for {
		fmt.Print(" Page ", page, "\r")

		html, stop := getUrl(fmt.Sprintf(se, page))
		if stop {
			break
		}

		document, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
		if err != nil {
			panic(err)
		}

		// Each list entry has an author paragraph and a title paragraph; only
		// the anchor inside the non-author paragraph is collected.
		links := document.Find("ol.ebooks-list li > p:not(.author) > a")
		links.Each(func(_ int, link *goquery.Selection) {
			href, ok := link.Attr("href")
			if !ok {
				return
			}
			bookURL, joinErr := url.JoinPath(seRoot, href)
			if joinErr != nil {
				// Links that cannot be joined onto the site root are skipped.
				return
			}
			books = append(books, bookURL)
		})

		// A disabled trailing pagination link ends the walk.
		if document.Find("main nav.pagination > a:last-child[aria-disabled]").Length() > 0 {
			break
		}
		page++
	}

	fmt.Println("")
	return books
}
// getSEBookDoc fetches the Standard Ebooks book page at u and extracts its
// title, author, subject tags, description, last-update timestamp and download
// links into a doc.
//
// If getUrl reports failure (its second return value is true) an empty doc is
// returned. A body that goquery cannot parse causes a panic.
func getSEBookDoc(u string) doc {
	body, fail := getUrl(u)
	if fail {
		return doc{}
	}
	d, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		panic(err)
	}

	var out doc
	out.Subjects = make([]string, 0, 1)
	out.Files = make([]fileSource, 0, 3)
	out.Title = strings.TrimSpace(d.Find("main article header hgroup h1").First().Text())
	out.Author = strings.TrimSpace(d.Find("main article header hgroup h2 a span").First().Text())

	d.Find("main article aside ul.tags li").Each(func(_ int, s *goquery.Selection) {
		out.Subjects = append(out.Subjects, strings.ToLower(strings.TrimSpace(s.Text())))
	})

	// Paragraphs are separated by a blank line. The separator must be real
	// newlines (not the two-character sequence backslash-n) so that TrimSpace
	// can strip the one left after the final paragraph.
	var desc strings.Builder
	d.Find("main article section#description p").Each(func(_ int, s *goquery.Selection) {
		desc.WriteString(s.Text())
		desc.WriteString("\n\n")
	})
	out.Description = strings.TrimSpace(desc.String())

	// "CC0" is spelled with a zero (Creative Commons Zero), not the letter O.
	out.License = "CC0 1.0 / Public Domain"
	out.LastUpdate = d.Find("main article section#history ol li:first-child time").First().AttrOr("datetime", "")

	// Download links are identified purely by position: the first three are
	// treated as epub, azw3 and kepub in that order, and any further links are
	// ignored.
	formats := [...]struct{ format, ext string }{
		{"epub", "epub"},
		{"azw3", "azw3"},
		{"kepub", "kepub.epub"},
	}
	d.Find("main article section#download ul li p span:first-child a").Each(func(i int, s *goquery.Selection) {
		if i >= len(formats) {
			return
		}
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		// NOTE(review): this assumes the parent directory of href is the book
		// page path ending in <author>/<book>; confirm against the live markup,
		// in particular for books whose path has more than two segments.
		r := path.Dir(href)
		book := path.Base(r)
		author := path.Base(path.Dir(r))
		name := fmt.Sprintf("%s_%s.%s", author, book, formats[i].ext)
		fileURL, err := url.JoinPath(seRoot, r, "downloads", name)
		if err != nil {
			// Skip the entry rather than record a file with an empty URL.
			return
		}
		out.Files = append(out.Files, fileSource{Format: formats[i].format, Url: fileURL})
	})
	return out
}
// standardEbooks scrapes the Standard Ebooks catalogue and writes the result
// to standard-ebooks.json in the current working directory.
//
// It builds the book list, fetches every book page in turn, marshals the
// collected sourceData as indented JSON and writes it out, printing progress
// to stdout along the way. A marshal, create, write or close error panics, so
// "Done." is only printed once the file has been written and closed.
func standardEbooks() {
	fmt.Println("\033[1mStandard Ebooks\033[0m")
	// NOTE(review): this layout pairs the 24-hour field (15) with an AM/PM
	// marker, yielding values such as "17:04PM". It is kept unchanged so the
	// emitted timestamp format stays the same; confirm whether "03:04PM" or a
	// plain "15:04" was intended.
	t := time.Now().Format("2006/01/02 15:04PM")
	bookList := buildSEBookList()
	sd := sourceData{
		"Standard Ebooks",
		t,
		seRoot,
		make([]doc, 0, len(bookList)),
	}

	fmt.Println(" Gathering book data")
	for i, u := range bookList {
		fmt.Print(" ", i+1, " / ", len(bookList), "\r")
		// getSEBookDoc returns an empty doc when the fetch fails; it is still
		// appended so the document count matches the book list.
		sd.Documents = append(sd.Documents, getSEBookDoc(u))
	}
	fmt.Println("")

	fmt.Println(" Marshaling JSON")
	b, err := json.MarshalIndent(sd, "", " ")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Creating file")
	f, err := os.Create("standard-ebooks.json")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Writing file")
	if _, err := f.Write(b); err != nil {
		f.Close()
		panic(err)
	}
	// Close explicitly instead of deferring: for a freshly written file the
	// Close error can be the only sign that the data did not reach disk.
	if err := f.Close(); err != nil {
		panic(err)
	}
	fmt.Println("Done.")
}