136 lines
3.5 KiB
Go
136 lines
3.5 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/url"
|
|
"os"
|
|
"path"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// Standard Ebooks endpoints. Both constants must point at the same host so
// that links scraped from the catalogue resolve against the site they came
// from.
const (
	// se is the format string for one catalogue listing page; %d is the
	// page number (callers start counting at 1).
	se string = "https://standardebooks.org/ebooks?page=%d"

	// seRoot is the site root that scraped hrefs are joined onto. It was
	// previously "https://standardebooks.com", which did not match the host
	// used by se.
	seRoot string = "https://standardebooks.org"
)
|
|
|
|
|
|
func buildSEBookList() []string {
|
|
fmt.Println(" Gathering Standard Ebooks book list")
|
|
list := make([]string, 0, 50)
|
|
for i := 1;;i++{
|
|
fmt.Print(" Page ", i, "\r")
|
|
body, end := getUrl(fmt.Sprintf(se, i))
|
|
if end {
|
|
break
|
|
}
|
|
d, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
d.Find("ol.ebooks-list li > p:not(.author) > a").Each(func(i int, s *goquery.Selection) {
|
|
href, exists := s.Attr("href")
|
|
if exists {
|
|
book, err := url.JoinPath(seRoot,href)
|
|
if err != nil {
|
|
return
|
|
}
|
|
list = append(list, book)
|
|
}
|
|
})
|
|
|
|
count := d.Find("main nav.pagination > a:last-child[aria-disabled]").Length()
|
|
if count > 0 {
|
|
break
|
|
}
|
|
}
|
|
fmt.Println("")
|
|
return list
|
|
}
|
|
|
|
func getSEBookDoc(u string) doc {
|
|
body, fail := getUrl(u)
|
|
if fail {
|
|
return doc{}
|
|
}
|
|
d, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
var out doc
|
|
out.Subjects = make([]string, 0, 1)
|
|
out.Files = make([]fileSource, 0, 3)
|
|
out.Title = strings.TrimSpace(d.Find("main article header hgroup h1").First().Text())
|
|
out.Author = strings.TrimSpace(d.Find("main article header hgroup h2 a span").First().Text())
|
|
d.Find("main article aside ul.tags li").Each(func(i int, s *goquery.Selection) {
|
|
out.Subjects = append(out.Subjects, strings.ToLower(strings.TrimSpace(s.Text())))
|
|
})
|
|
d.Find("main article section#description p").Each(func(i int, s *goquery.Selection) {
|
|
out.Description = out.Description + s.Text() + "\\n\\n"
|
|
})
|
|
out.Description = strings.TrimSpace(out.Description)
|
|
out.License = "CCO 1.0 / Public Domain"
|
|
out.LastUpdate = d.Find("main article section#history ol li:first-child time").First().AttrOr("datetime", "")
|
|
d.Find("main article section#download ul li p span:first-child a").Each(func(i int, s *goquery.Selection) {
|
|
var ext string = ""
|
|
var fs fileSource
|
|
switch i {
|
|
case 0:
|
|
fs.Format = "epub"
|
|
ext = "epub"
|
|
case 1:
|
|
fs.Format = "azw3"
|
|
ext = "azw3"
|
|
case 2:
|
|
fs.Format = "kepub"
|
|
ext = "kepub.epub"
|
|
default:
|
|
return
|
|
}
|
|
href, exists := s.Attr("href")
|
|
if !exists {
|
|
return
|
|
}
|
|
r := path.Dir(href)
|
|
book := path.Base(r)
|
|
author := path.Base(path.Dir(r))
|
|
fs.Url, _ = url.JoinPath(seRoot, r, "downloads", fmt.Sprintf("%s_%s.%s", author, book, ext))
|
|
out.Files = append(out.Files, fs)
|
|
})
|
|
return out
|
|
}
|
|
|
|
func standardEbooks() {
|
|
fmt.Println("\033[1mStandard Ebooks\033[0m")
|
|
t := time.Now().Format("2006/01/02 15:04PM")
|
|
bookList := buildSEBookList()
|
|
sd := sourceData{
|
|
"Standard Ebooks",
|
|
t,
|
|
seRoot,
|
|
make([]doc, 0, len(bookList)),
|
|
}
|
|
fmt.Println(" Gathering book data")
|
|
for i := range bookList {
|
|
fmt.Print( " ",i+1, " / ", len(bookList), "\r")
|
|
sd.Documents = append(sd.Documents, getSEBookDoc(bookList[i]))
|
|
}
|
|
fmt.Println("")
|
|
fmt.Println(" Marshaling JSON")
|
|
b, err := json.MarshalIndent(sd, "", " ")
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
fmt.Println(" Creating file")
|
|
f, err := os.Create("standard-ebooks.json")
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer f.Close()
|
|
fmt.Println(" Writing file")
|
|
f.Write(b)
|
|
fmt.Println("Done.")
|
|
}
|