Initial commit

This commit is contained in:
sloum 2024-03-26 19:51:53 -07:00
commit a3ee46c254
10 changed files with 503 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
bookscrape
*.json

27
README.md Normal file
View File

@ -0,0 +1,27 @@
# bookscrape
A scraper for Standard Ebooks, Project Gutenberg, and Global Grey Ebooks. It produces JSON output listing books in a format compatible with [libman](https://tildegit.org/sloum/libman). The goal is to have a searchable, but modular, ebook manager (like a package manager, but for ebooks and their sources). That said, the JSON documents produced are flexible enough to be ingested and used by any number of other systems that wish to use these book catalogs.
## Building
```sh
go build
```
or
```sh
go install
```
## Running
```sh
bookscrape -se # fetch standard ebooks
bookscrape -gg # fetch global grey
bookscrape -pg # fetch project gutenberg
# There is also a convenient `-all` flag to do all of the above in one command
```
Each source produces its own JSON file (even when `-all` is used). The sizes vary. Gutenberg is the largest file since their catalog is many times larger than the other two combined. However, Gutenberg is also the fastest to build since their website does not need to be crawled and scraped: they provide a CSV file, which this program ingests and converts into the much larger JSON file.

129
globalGrey.go Normal file
View File

@ -0,0 +1,129 @@
package main
import (
"bytes"
"encoding/json"
"fmt"
"os"
"path"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// gg is the URL template for one page of Global Grey's "all ebooks" listing.
// The %d verb receives the page number; buildGGBookList starts counting at 1.
const gg = "https://www.globalgreyebooks.com/category/ebooks/all-ebooks-page-%d.html"
// buildGGBookList walks the paginated Global Grey listing and returns the
// href of every book link it finds, in page order.
//
// Paging stops when a page cannot be fetched or when the page's pagination
// bar no longer contains a link labelled "Last". A page that cannot be parsed
// as HTML causes a panic.
func buildGGBookList() []string {
	fmt.Println(" Gathering Global Grey book list")
	books := make([]string, 0, 50)

	for page := 1; ; page++ {
		// "\r" keeps the progress counter on a single terminal line.
		fmt.Print(" Page ", page, "\r")

		html, failed := getUrl(fmt.Sprintf(gg, page))
		if failed {
			break
		}
		listing, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
		if err != nil {
			panic(err)
		}

		listing.Find("aside.aside-books > a:first-child").Each(func(_ int, link *goquery.Selection) {
			if target, ok := link.Attr("href"); ok {
				books = append(books, target)
			}
		})

		// The presence of a "Last" link is used as the "more pages" signal.
		hasLast := false
		listing.Find(".page-nav .pagination > a").Each(func(_ int, link *goquery.Selection) {
			hasLast = hasLast || link.Text() == "Last"
		})
		if !hasLast {
			break
		}
	}

	fmt.Println("")
	return books
}
// getGGBookDoc scrapes the Global Grey book page at u and returns the
// metadata and download links found on it.
//
// When the page cannot be fetched the zero-value doc is returned. When the
// response cannot be parsed as HTML the function panics.
func getGGBookDoc(u string) doc {
	html, failed := getUrl(u)
	if failed {
		return doc{}
	}
	page, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
	if err != nil {
		panic(err)
	}

	book := doc{
		Title:    strings.TrimSpace(page.Find("h1").First().Text()),
		Author:   strings.TrimSpace(page.Find("h1 + h2").First().Text()),
		Subjects: make([]string, 0, 1),
		Files:    make([]fileSource, 0, 3),
		License:  "Public Domain",
	}

	// Subjects come from the "related" buttons, normalised to lower case.
	page.Find("section.related > a > button").Each(func(_ int, tag *goquery.Selection) {
		subject := strings.TrimSpace(tag.Text())
		book.Subjects = append(book.Subjects, strings.ToLower(subject))
	})

	// NOTE(review): "\\n\\n" is the four literal characters \n\n, not two
	// newlines, so the TrimSpace below never removes the trailing separator.
	// Left as-is because consumers of the JSON may rely on it — confirm.
	var text strings.Builder
	page.Find("section.description > p").Each(func(_ int, para *goquery.Selection) {
		text.WriteString(strings.TrimSpace(para.Text()) + "\\n\\n")
	})
	book.Description = strings.TrimSpace(text.String())

	page.Find("section.downloads strong > a").Each(func(_ int, link *goquery.Selection) {
		target, ok := link.Attr("href")
		// Links without an href, and the donation link, are not book files.
		if !ok || strings.Contains(target, "donate") {
			return
		}
		// The format is the lower-cased extension without its leading dot.
		format := path.Ext(target)
		if len(format) > 1 {
			format = format[1:]
		}
		book.Files = append(book.Files, fileSource{
			Url:    target,
			Format: strings.ToLower(format),
		})
	})

	return book
}
// globalGrey scrapes the Global Grey catalogue and writes it, as indented
// JSON, to global-grey.json in the current working directory.
//
// Progress is reported on stdout. Any failure to marshal, create, write or
// close the output file panics.
func globalGrey() {
	fmt.Println("\033[1mGlobal Grey\033[0m")
	// NOTE(review): the layout pairs the 24-hour "15" with a "PM" marker;
	// kept unchanged because the timestamp format is part of the output.
	t := time.Now().Format("2006/01/02 15:04PM")
	bookList := buildGGBookList()
	sd := sourceData{
		"Global Grey",
		t,
		"https://www.globalgreyebooks.com",
		make([]doc, 0, len(bookList)),
	}

	fmt.Println(" Gathering book data")
	for i := range bookList {
		fmt.Print(" ", i+1, " / ", len(bookList), "\r")
		book := getGGBookDoc(bookList[i])
		// getGGBookDoc returns the zero doc (nil Files) when the page could
		// not be fetched; do not pollute the catalogue with empty records.
		if book.Files == nil {
			continue
		}
		sd.Documents = append(sd.Documents, book)
	}
	fmt.Println("")

	fmt.Println(" Marshaling JSON")
	b, err := json.MarshalIndent(sd, "", " ")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Creating file")
	f, err := os.Create("global-grey.json")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Writing file")
	// Always close the file, but report the write error first if both fail.
	_, err = f.Write(b)
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Done.")
}

10
go.mod Normal file
View File

@ -0,0 +1,10 @@
module tildegit.org/sloum/bookscrape
go 1.22.1
require github.com/PuerkitoBio/goquery v1.9.1
require (
github.com/andybalholm/cascadia v1.3.2 // indirect
golang.org/x/net v0.21.0 // indirect
)

40
go.sum Normal file
View File

@ -0,0 +1,40 @@
github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

88
gutenberg.go Normal file
View File

@ -0,0 +1,88 @@
package main
import (
"bytes"
"encoding/csv"
"encoding/json"
"fmt"
"os"
"regexp"
"strings"
"time"
)
// Project Gutenberg endpoints and templates.
const (
	// pgCsvURL is the catalogue feed fetched by retrieveCSV.
	pgCsvURL = "https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv"
	// pgRoot is recorded as the source URL in the generated JSON.
	pgRoot = "https://www.gutenberg.org"
	// pgDownload is filled with the book id (twice) and a file format.
	pgDownload = "https://www.gutenberg.org/cache/epub/%s/pg%s.%s"
	// pgReplacement swaps the two groups captured by pgAuthorRE.
	pgReplacement = `$2 $1`
)

var (
	// pgAuthorRE captures the first two comma-separated parts of an author
	// field and matches the remainder of the line so it gets dropped on
	// replacement, e.g. "Austen, Jane, 1775-1817" becomes "Jane Austen".
	pgAuthorRE = regexp.MustCompile(`(?m)^([^,]+),\s+([^,]+).*$`)
	// pgFormats lists the file formats for which download URLs are built.
	pgFormats = []string{"epub", "mobi", "txt", "html"}
)
// retrieveCSV downloads the Project Gutenberg catalogue CSV and returns its
// records with the first (header) row removed.
//
// An error is returned when the download fails, when the header row cannot
// be read (for example an empty response), or when the remaining rows are
// not valid CSV.
func retrieveCSV() ([][]string, error) {
	fmt.Println(" Gathering book cache")
	// getUrl reports failure through a bool, not an error value.
	body, failed := getUrl(pgCsvURL)
	if failed {
		return nil, fmt.Errorf("Error retrieving book cache from Project Gutenberg")
	}
	r := csv.NewReader(bytes.NewReader(body))
	// Consume the header row; without it the payload is not a usable
	// catalogue, so surface the problem instead of returning no rows.
	if _, err := r.Read(); err != nil {
		return nil, fmt.Errorf("Error reading book cache header from Project Gutenberg: %w", err)
	}
	return r.ReadAll()
}
// projectGutenberg converts the Project Gutenberg CSV catalogue into
// project-gutenberg.json in the current working directory.
//
// Only rows with at least seven columns whose column 4 is "en" and whose
// column 1 is "Text" are kept. If the catalogue cannot be retrieved the error
// is printed to stderr and the function returns without writing anything.
// Failures to marshal, create, write or close the output file panic.
func projectGutenberg() {
	fmt.Println("\033[1mProject Gutenberg\033[0m")
	// NOTE(review): the layout pairs the 24-hour "15" with a "PM" marker;
	// kept unchanged because the timestamp format is part of the output.
	t := time.Now().Format("2006/01/02 15:04PM")
	bookList, err := retrieveCSV()
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		return
	}
	sd := sourceData{
		"Project Gutenberg",
		t,
		pgRoot,
		make([]doc, 0, len(bookList)),
	}

	for i, ln := range bookList {
		fmt.Printf("Book %d\r", i)
		// Columns used: 0 id, 1 type, 2 date, 3 title, 4 language,
		// 5 author(s), 6 subjects.
		if len(ln) < 7 || ln[4] != "en" || ln[1] != "Text" {
			continue
		}

		var book doc
		book.Title = ln[3]
		book.Author = pgAuthorRE.ReplaceAllString(ln[5], pgReplacement)
		book.Description = "Not provided by source"
		book.LastUpdate = ln[2]
		book.License = "Public Domain (USA)"

		// Subjects are ';'-separated in the catalogue.
		book.Subjects = strings.Split(ln[6], ";")
		for j := range book.Subjects {
			book.Subjects[j] = strings.TrimSpace(book.Subjects[j])
		}

		// One download URL per known format, derived from the book id.
		book.Files = make([]fileSource, len(pgFormats))
		for j, format := range pgFormats {
			book.Files[j] = fileSource{
				Url:    fmt.Sprintf(pgDownload, ln[0], ln[0], format),
				Format: format,
			}
		}

		sd.Documents = append(sd.Documents, book)
	}
	fmt.Println("")

	fmt.Println(" Marshaling JSON")
	b, err := json.MarshalIndent(sd, "", " ")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Creating file")
	f, err := os.Create("project-gutenberg.json")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Writing file")
	// Always close the file, but report the write error first if both fail.
	_, err = f.Write(b)
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Done.")
}

23
helpers.go Normal file
View File

@ -0,0 +1,23 @@
package main
import (
"fmt"
"net/http"
"io"
"os"
)
// getUrl performs an HTTP GET on u and returns the response body.
//
// The second result is true when the request FAILED: the request could not
// be made, the status code was above 299, or the body could not be read. In
// that case the returned slice is empty and a message has been written to
// stderr. Callers use the flag both for error handling and as an "end of
// pages" signal while crawling.
func getUrl(u string) ([]byte, bool) {
	r, err := http.Get(u)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Could not retrieve %s\n%s\n", u, err.Error())
		return []byte{}, true
	}
	defer r.Body.Close()

	// A bad status arrives with a nil error, so it must be reported from the
	// response itself rather than by dereferencing err.
	if r.StatusCode > 299 {
		fmt.Fprintf(os.Stderr, "Could not retrieve %s\n%s\n", u, r.Status)
		return []byte{}, true
	}

	body, err := io.ReadAll(r.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Could not retrieve %s\n%s\n", u, err.Error())
		return []byte{}, true
	}
	return body, false
}

26
main.go Normal file
View File

@ -0,0 +1,26 @@
package main
import (
"flag"
)
// main parses the command-line flags and runs the scraper for each selected
// source, in the order Standard Ebooks, Global Grey, Project Gutenberg.
// With no source selected it prints the flag defaults instead.
func main() {
	var (
		wantSE  = flag.Bool("se", false, "Update 'Standard Ebooks'")
		wantGG  = flag.Bool("gg", false, "Update 'Global Grey'")
		wantPG  = flag.Bool("pg", false, "Update 'Project Gutenberg'")
		wantAll = flag.Bool("all", false, "Update all sources (overrides other flags)")
	)
	flag.Parse()

	// Nothing requested: show usage and stop.
	if !(*wantSE || *wantGG || *wantPG || *wantAll) {
		flag.PrintDefaults()
		return
	}

	if *wantAll || *wantSE {
		standardEbooks()
	}
	if *wantAll || *wantGG {
		globalGrey()
	}
	if *wantAll || *wantPG {
		projectGutenberg()
	}
}

135
se.go Normal file
View File

@ -0,0 +1,135 @@
package main
import (
"bytes"
"encoding/json"
"fmt"
"net/url"
"os"
"path"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// se is the URL template for one page of the Standard Ebooks catalogue; the
// %d verb receives the page number.
const se string = "https://standardebooks.org/ebooks?page=%d"

// seRoot is the site root that relative book and download paths are joined
// onto, and the source URL recorded in the generated JSON. It uses the same
// .org host as the listing template above (it previously pointed at .com).
const seRoot string = "https://standardebooks.org"
// buildSEBookList walks the paginated Standard Ebooks catalogue and returns
// the absolute URL of every book page it links to, in page order.
//
// Paging stops when a page cannot be fetched or when the last link in the
// pagination bar carries an aria-disabled attribute. A page that cannot be
// parsed as HTML causes a panic.
func buildSEBookList() []string {
	fmt.Println(" Gathering Standard Ebooks book list")
	books := make([]string, 0, 50)

	for page := 1; ; page++ {
		// "\r" keeps the progress counter on a single terminal line.
		fmt.Print(" Page ", page, "\r")

		html, failed := getUrl(fmt.Sprintf(se, page))
		if failed {
			break
		}
		listing, err := goquery.NewDocumentFromReader(bytes.NewReader(html))
		if err != nil {
			panic(err)
		}

		listing.Find("ol.ebooks-list li > p:not(.author) > a").Each(func(_ int, link *goquery.Selection) {
			target, ok := link.Attr("href")
			if !ok {
				return
			}
			// Hrefs are joined onto the site root; unjoinable ones are skipped.
			absolute, err := url.JoinPath(seRoot, target)
			if err != nil {
				return
			}
			books = append(books, absolute)
		})

		// A disabled trailing pagination link marks the final page.
		if listing.Find("main nav.pagination > a:last-child[aria-disabled]").Length() > 0 {
			break
		}
	}

	fmt.Println("")
	return books
}
// getSEBookDoc scrapes the Standard Ebooks book page at u and returns the
// metadata and download links found on it.
//
// When the page cannot be fetched the zero-value doc is returned. When the
// response cannot be parsed as HTML the function panics.
func getSEBookDoc(u string) doc {
	body, fail := getUrl(u)
	if fail {
		return doc{}
	}
	d, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		panic(err)
	}

	var out doc
	out.Subjects = make([]string, 0, 1)
	out.Files = make([]fileSource, 0, 3)
	out.Title = strings.TrimSpace(d.Find("main article header hgroup h1").First().Text())
	out.Author = strings.TrimSpace(d.Find("main article header hgroup h2 a span").First().Text())

	// Subjects come from the tag list, normalised to lower case.
	d.Find("main article aside ul.tags li").Each(func(_ int, s *goquery.Selection) {
		out.Subjects = append(out.Subjects, strings.ToLower(strings.TrimSpace(s.Text())))
	})

	// NOTE(review): "\\n\\n" is the four literal characters \n\n, not two
	// newlines, so the TrimSpace below never removes the trailing separator.
	// Left as-is because consumers of the JSON may rely on it — confirm.
	var desc strings.Builder
	d.Find("main article section#description p").Each(func(_ int, s *goquery.Selection) {
		desc.WriteString(s.Text())
		desc.WriteString("\\n\\n")
	})
	out.Description = strings.TrimSpace(desc.String())

	// "CC0" is spelled with a zero (it previously read "CCO").
	out.License = "CC0 1.0 / Public Domain"
	out.LastUpdate = d.Find("main article section#history ol li:first-child time").First().AttrOr("datetime", "")

	// The format of each download link is decided by its position in the
	// list; only the first three links are used.
	// NOTE(review): assumes the page lists epub, azw3, kepub in that order —
	// confirm against the live markup.
	formats := [...]struct{ format, ext string }{
		{"epub", "epub"},
		{"azw3", "azw3"},
		{"kepub", "kepub.epub"},
	}
	d.Find("main article section#download ul li p span:first-child a").Each(func(i int, s *goquery.Selection) {
		if i >= len(formats) {
			return
		}
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		// The link's parent directory is taken as the book path, whose last
		// two elements are the author and book slugs used in the file name.
		r := path.Dir(href)
		book := path.Base(r)
		author := path.Base(path.Dir(r))
		name := fmt.Sprintf("%s_%s.%s", author, book, formats[i].ext)
		link, err := url.JoinPath(seRoot, r, "downloads", name)
		if err != nil {
			// Do not record a file entry with an empty URL.
			return
		}
		out.Files = append(out.Files, fileSource{Url: link, Format: formats[i].format})
	})

	return out
}
// standardEbooks scrapes the Standard Ebooks catalogue and writes it, as
// indented JSON, to standard-ebooks.json in the current working directory.
//
// Progress is reported on stdout. Any failure to marshal, create, write or
// close the output file panics.
func standardEbooks() {
	fmt.Println("\033[1mStandard Ebooks\033[0m")
	// NOTE(review): the layout pairs the 24-hour "15" with a "PM" marker;
	// kept unchanged because the timestamp format is part of the output.
	t := time.Now().Format("2006/01/02 15:04PM")
	bookList := buildSEBookList()
	sd := sourceData{
		"Standard Ebooks",
		t,
		seRoot,
		make([]doc, 0, len(bookList)),
	}

	fmt.Println(" Gathering book data")
	for i := range bookList {
		fmt.Print(" ", i+1, " / ", len(bookList), "\r")
		book := getSEBookDoc(bookList[i])
		// getSEBookDoc returns the zero doc (nil Files) when the page could
		// not be fetched; do not pollute the catalogue with empty records.
		if book.Files == nil {
			continue
		}
		sd.Documents = append(sd.Documents, book)
	}
	fmt.Println("")

	fmt.Println(" Marshaling JSON")
	b, err := json.MarshalIndent(sd, "", " ")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Creating file")
	f, err := os.Create("standard-ebooks.json")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Writing file")
	// Always close the file, but report the write error first if both fail.
	_, err = f.Write(b)
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		panic(err)
	}
	fmt.Println("Done.")
}

23
types.go Normal file
View File

@ -0,0 +1,23 @@
package main
// fileSource describes one downloadable rendition of a book.
//
// The struct has no json tags, so it is marshalled with the field names as
// keys, in declaration order.
type fileSource struct {
	// Url is where the file can be downloaded; Format is the short format
	// name the scrapers assign (for example "epub").
	Url, Format string
}
// doc is the catalogue record for a single book.
//
// The struct has no json tags, so it is marshalled with the field names as
// keys, in declaration order; keep the field order stable.
type doc struct {
	Title string
	// Files lists the downloadable renditions of the book.
	Files  []fileSource
	Author string
	// Subjects holds the subject or tag strings collected by the scrapers.
	Subjects []string
	// LastUpdate is a date string copied verbatim from the source.
	Description, LastUpdate, License string
}
// sourceData is the top-level JSON document written for one book source.
//
// The scrapers build it with positional (unkeyed) literals in the order
// Name, LastUpdate, Url, Documents, so the field order must not change.
type sourceData struct {
	// Name is the source's display name, LastUpdate the time the scrape was
	// started, and Url the source's home page.
	Name, LastUpdate, Url string
	// Documents holds one record per book.
	Documents []doc
}