bookscrape/gutenberg.go

89 lines
2.2 KiB
Go

package main
import (
"bytes"
"encoding/csv"
"encoding/json"
"fmt"
"os"
"regexp"
"strings"
"time"
)
// Project Gutenberg endpoints and string templates used by this scraper.
const (
	// pgCsvURL is the address the catalog CSV is downloaded from.
	pgCsvURL string = "https://www.gutenberg.org/cache/epub/feeds/pg_catalog.csv"

	// pgRoot is the site root recorded alongside the scraped data.
	pgRoot string = "https://www.gutenberg.org"

	// pgDownload is a fmt template for a download link. It takes three
	// string arguments; callers in this file pass the book ID twice followed
	// by a file format.
	pgDownload string = "https://www.gutenberg.org/cache/epub/%s/pg%s.%s"

	// pgReplacement is the replacement template applied with pgAuthorRE: the
	// second capture group, a space, then the first capture group.
	pgReplacement string = `$2 $1`
)

var (
	// pgAuthorRE matches text shaped like "First, Second[...]" and captures
	// the two leading comma-separated parts; anything after the second part
	// is matched but not captured.
	pgAuthorRE = regexp.MustCompile(`(?m)^([^,]+),\s+([^,]+).*$`)

	// pgFormats lists the file formats a download link is generated for.
	pgFormats = []string{"epub", "mobi", "txt", "html"}
)
// retrieveCSV downloads the catalog from pgCsvURL and parses it as CSV.
//
// It returns every record after the first one; the first record is skipped
// (presumably it is the column header — confirm against the feed). An empty
// download yields an empty result and no error.
//
// An error is returned when the download is reported as failed, or when any
// part of the CSV, including the skipped first record, cannot be parsed.
func retrieveCSV() ([][]string, error) {
	fmt.Println(" Gathering book cache")
	body, err := getUrl(pgCsvURL)
	// The second result of getUrl is tested directly as a condition, so it
	// acts as a boolean failure flag here rather than an error value.
	if err {
		return nil, fmt.Errorf("Error retrieving book cache from Project Gutenberg")
	}

	// Parse the whole payload in one call so that a malformed first record is
	// reported to the caller instead of being silently discarded.
	records, parseErr := csv.NewReader(bytes.NewReader(body)).ReadAll()
	if parseErr != nil {
		return nil, parseErr
	}
	if len(records) == 0 {
		return records, nil
	}
	return records[1:], nil
}
func projectGutenberg() {
fmt.Println("\033[1mStandard Ebooks\033[0m")
t := time.Now().Format("2006/01/02 15:04PM")
bookList, err := retrieveCSV()
if err != nil {
fmt.Fprintln(os.Stderr, err.Error())
return
}
sd := sourceData{
"Project Gutenberg",
t,
pgRoot,
make([]doc, 0, len(bookList)),
}
for i, ln := range bookList {
fmt.Printf("Book %d\r", i)
if len(ln) < 7 || ln[4] != "en" || ln[1] != "Text" {
continue
}
var b doc
b.Files = make([]fileSource, len(pgFormats))
b.Title = ln[3]
b.Author = pgAuthorRE.ReplaceAllString(ln[5], pgReplacement)
b.Subjects = strings.Split(ln[6], ";")
b.Description = "Not provided by source"
for i := range b.Subjects {
b.Subjects[i] = strings.TrimSpace(b.Subjects[i])
}
b.LastUpdate = ln[2]
for i := range b.Files {
var fs fileSource
fs.Format = pgFormats[i]
fs.Url = fmt.Sprintf(pgDownload, ln[0], ln[0], pgFormats[i])
b.Files[i] = fs
}
b.License = "Public Domain (USA)"
sd.Documents = append(sd.Documents, b)
}
fmt.Println("")
fmt.Println(" Marshaling JSON")
b, err := json.MarshalIndent(sd, "", " ")
if err != nil {
panic(err)
}
fmt.Println(" Creating file")
f, err := os.Create("project-gutenberg.json")
if err != nil {
panic(err)
}
defer f.Close()
fmt.Println(" Writing file")
f.Write(b)
fmt.Println("Done.")
}