bookscrape/globalGrey.go

130 lines
3.0 KiB
Go

package main
import (
"bytes"
"encoding/json"
"fmt"
"os"
"path"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// gg is the fmt format string for one page of the Global Grey "all ebooks"
// listing; the single %d verb is filled with the 1-based page number.
const gg string = "https://www.globalgreyebooks.com/category/ebooks/all-ebooks-page-%d.html"
// buildGGBookList walks the paginated Global Grey listing (see gg) starting
// at page 1 and returns the href of every book link it finds, in page order.
//
// Paging stops when getUrl's second return value is true for a page, or when
// a page's pagination bar contains no link whose text is exactly "Last".
// A page that goquery cannot parse causes a panic.
func buildGGBookList() []string {
	fmt.Println(" Gathering Global Grey book list")

	links := make([]string, 0, 50)
	for page := 1; ; page++ {
		// Progress indicator; "\r" keeps it on a single terminal line.
		fmt.Print(" Page ", page, "\r")

		raw, done := getUrl(fmt.Sprintf(gg, page))
		if done {
			break
		}

		listing, err := goquery.NewDocumentFromReader(bytes.NewReader(raw))
		if err != nil {
			panic(err)
		}

		// The first anchor inside each book aside carries the book page link.
		listing.Find("aside.aside-books > a:first-child").Each(func(_ int, a *goquery.Selection) {
			if href, ok := a.Attr("href"); ok {
				links = append(links, href)
			}
		})

		// A "Last" link in the pagination bar means further pages exist.
		hasLast := false
		listing.Find(".page-nav .pagination > a").EachWithBreak(func(_ int, a *goquery.Selection) bool {
			if a.Text() == "Last" {
				hasLast = true
			}
			return !hasLast // stop scanning once it has been found
		})
		if !hasLast {
			break
		}
	}

	// Move past the "\r"-terminated progress line.
	fmt.Println("")
	return links
}
// getGGBookDoc fetches the book page at u and extracts its metadata into a doc.
//
// Extracted fields:
//   - Title:       text of the first <h1>
//   - Author:      text of the first <h2> directly following an <h1>
//   - Subjects:    lower-cased text of each "section.related > a > button"
//   - Description: the <p> children of "section.description", joined by a
//     blank line
//   - License:     always "Public Domain"
//   - Files:       one entry per "section.downloads strong > a" link, skipping
//     links without an href and links whose href contains "donate"
//
// If getUrl's second return value is true, the zero doc is returned. A page
// that goquery cannot parse causes a panic.
func getGGBookDoc(u string) doc {
	body, fail := getUrl(u)
	if fail {
		return doc{}
	}
	d, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		panic(err)
	}

	var out doc
	out.Subjects = make([]string, 0, 1)
	out.Files = make([]fileSource, 0, 3)
	out.Title = strings.TrimSpace(d.Find("h1").First().Text())
	out.Author = strings.TrimSpace(d.Find("h1 + h2").First().Text())

	d.Find("section.related > a > button").Each(func(_ int, s *goquery.Selection) {
		out.Subjects = append(out.Subjects, strings.ToLower(strings.TrimSpace(s.Text())))
	})

	var desc strings.Builder
	d.Find("section.description > p").Each(func(_ int, s *goquery.Selection) {
		desc.WriteString(strings.TrimSpace(s.Text()))
		// Real newlines, not the literal characters `\n\n`: the TrimSpace
		// below can only strip the trailing separator if it is whitespace,
		// and encoding/json escapes newlines itself when marshaling.
		desc.WriteString("\n\n")
	})
	out.Description = strings.TrimSpace(desc.String())
	out.License = "Public Domain"

	d.Find("section.downloads strong > a").Each(func(_ int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists || strings.Contains(href, "donate") {
			return
		}

		// Format is the lower-cased extension without its leading dot.
		ext := path.Ext(href)
		if len(ext) > 1 {
			ext = ext[1:]
		}

		var fs fileSource
		fs.Url = href
		fs.Format = strings.ToLower(ext)
		out.Files = append(out.Files, fs)
	})
	return out
}
// globalGrey scrapes the Global Grey catalogue and writes it, as indented
// JSON, to "global-grey.json" in the current working directory.
//
// It builds the list of book URLs with buildGGBookList, fetches each book
// with getGGBookDoc, and marshals the resulting sourceData. Progress is
// printed to stdout. Any marshaling or file error (create, write, close)
// causes a panic, so "Done." is only printed once the file is fully written.
func globalGrey() {
	fmt.Println("\033[1mGlobal Grey\033[0m")

	// NOTE(review): this layout combines the 24-hour clock ("15") with an
	// AM/PM marker; left unchanged because it is part of the emitted data.
	t := time.Now().Format("2006/01/02 15:04PM")

	bookList := buildGGBookList()
	sd := sourceData{
		"Global Grey",
		t,
		"https://www.globalgreyebooks.com",
		make([]doc, 0, len(bookList)),
	}

	fmt.Println(" Gathering book data")
	for i := range bookList {
		// Progress indicator; "\r" keeps it on a single terminal line.
		fmt.Print(" ", i+1, " / ", len(bookList), "\r")
		sd.Documents = append(sd.Documents, getGGBookDoc(bookList[i]))
	}
	fmt.Println("")

	fmt.Println(" Marshaling JSON")
	b, err := json.MarshalIndent(sd, "", " ")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Creating file")
	f, err := os.Create("global-grey.json")
	if err != nil {
		panic(err)
	}

	fmt.Println(" Writing file")
	if _, err := f.Write(b); err != nil {
		// Release the handle before panicking; the write error is the one
		// worth reporting, so a secondary Close error is dropped.
		_ = f.Close()
		panic(err)
	}
	// Close explicitly rather than via defer: a Close error can mean the
	// data never reached disk, so it must not be discarded.
	if err := f.Close(); err != nil {
		panic(err)
	}
	fmt.Println("Done.")
}