pgc-db/bin/load-gutenberg.pl

#!/usr/bin/perl
#
# A script for loading Gutenberg's book catalog from their .csv file
# into the postgres database.
# Requires DBD::CSV (packaged in Debian as libdbd-csv-perl) and DBD::Pg
# (libdbd-pg-perl).
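# The catalog's columns, in order, are expected to be (at the time of
# writing): Text#, Type, Issued, Title, Language, Authors, Subjects,
# LoCC, Bookshelves. They are mapped onto the shorter names declared in
# col_names in the DBD::CSV connect call below.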
use English qw(-no_match_vars);
use common::sense;
use File::Basename;
use DBI;
if (1 != scalar @ARGV) {
    my $me = basename($PROGRAM_NAME);
    print(<<eof);
Usage: $me FILENAME
where FILENAME is the path to the .csv file, probably named
pg_catalog.csv, that you downloaded from
<https://gutenberg.org/cache/epub/feeds/pg_catalog.csv>. This script
will attempt to load the file into postgres. It won't try to load
every record, only those with sane titles and authors (not blank,
not very long). Should be about 12,000 books.
Example: $me ~/data/pg_catalog.csv
eof
    exit(0);
}
my $catpath = $ARGV[0];
my $dirpath = dirname($catpath);
my $fname = fileparse($catpath, '.csv');
# "Connect" to the Gutenberg catalog file.
#
my $dbh_csv = DBI->connect(
    "dbi:CSV:", undef, undef, {
        f_dir      => $dirpath,
        f_ext      => ".csv",
        f_encoding => "utf8",
        csv_tables => {
            $fname => {
                col_names => [
                    qw(
                        pgid type issued title lang authors subjects locc bookshelves
                    )
                ],
            }
        },
        RaiseError => 1,
        PrintError => 1,
    }
) or die $DBI::errstr;
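# DBD::CSV exposes each file in f_dir with the f_ext extension as a table
# named after the file's base name, so the catalog can be queried with
# plain SQL through $dbh_csv (as below). The header line still comes back
# as an ordinary data row here; the load loop further down skips it by
# hand.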
# If these print OK, we can probably read the catalog.
my ($count_all) = $dbh_csv->selectrow_array("SELECT COUNT(*) FROM $fname");
printf("Count all: %d\n", $count_all);
my ($count_auth) = $dbh_csv->selectrow_array(
    "SELECT COUNT(DISTINCT authors) FROM $fname"
);
printf("Count authors: %d\n", $count_auth);
# Connect to postgres.
my $dbh_pg = DBI->connect(
    "dbi:Pg:dbname=pg_book_catalog", "pgc_owner", undef,
    { AutoCommit => 0, RaiseError => 1, PrintError => 1 },
) or die $DBI::errstr;
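# With AutoCommit off, everything below (the truncate, the helper
# function, the inserts, and the final drop) runs in one transaction that
# only becomes visible at the commit near the end of the script. With
# RaiseError set, any failure dies before that commit, so a partial load
# is rolled back rather than left behind.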
# Some preliminaries.
$dbh_pg->do("SET client_min_messages TO 'warning'");
$dbh_pg->do('TRUNCATE TABLE book, author');
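# For orientation, the two tables are assumed to look roughly like this,
# inferred from the columns and constraints used by addbook() below (the
# real DDL lives in the schema; the exact types, lengths, and constraint
# names may differ):
#
#   CREATE TABLE author (
#       id   SERIAL PRIMARY KEY,
#       name VARCHAR UNIQUE NOT NULL
#   );
#   CREATE TABLE book (
#       id           SERIAL PRIMARY KEY,
#       gutenberg_id INTEGER UNIQUE,
#       title        VARCHAR,
#       author_id    INTEGER REFERENCES author (id)
#   );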
# Sadly, DBD::Pg lacks the ability to prepare a DO block. So we resort
# to creating a function to load a book-author pair.
$dbh_pg->do(<<EOF1);
CREATE OR REPLACE FUNCTION addbook (
    IN i_gutenberg_id book.gutenberg_id\%TYPE,
    IN i_title        book.title\%TYPE,
    IN i_auth         author.name\%TYPE
) RETURNS book.id\%TYPE
AS \$\$
DECLARE
    bid book.id\%TYPE;
BEGIN
    INSERT INTO author AS a (name) VALUES (i_auth)
        ON CONFLICT (name) DO NOTHING;
    INSERT INTO book AS b (gutenberg_id, title, author_id)
        SELECT i_gutenberg_id, i_title, id FROM author WHERE name = i_auth
        ON CONFLICT DO NOTHING
        RETURNING b.id INTO bid;
    RETURN bid;
END;\$\$
LANGUAGE plpgsql
EOF1
# Statements for reading a row from the catalog and loading it into postgres.
my $sth_pg = $dbh_pg->prepare('SELECT addbook(?, ?, ?)');
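# Each execute of $sth_pg below amounts to one call like (illustrative
# values only):
#
#   SELECT addbook(1342, 'Pride and Prejudice', 'Austen, Jane');
#
# which inserts the author if it is new, inserts the book row pointing at
# that author, and returns the new book id, or NULL if the book insert
# hit a conflict and was skipped.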
my $sth_csv = $dbh_csv->prepare("SELECT pgid, title, authors FROM $fname");
$sth_csv->execute();
# The load itself.
my ($rowcount, $loadcount) = (0, 0);
STDOUT->autoflush(1);
print("Dots should now appear, per thousand books: ");
while (my $row = $sth_csv->fetchrow_arrayref) {
    $rowcount++;
    next if (1 == $rowcount); # DBD::CSV ignores skip_first_row here, so skip the header row ourselves.
    my ($pgid, $title, $auth) = @$row;
    # Discard what we deem excessively long.
    if (length($title) > 128 || length($auth) > 128) {
        next;
    }
    # Discard the weird 'No title' records.
    next if ($title eq 'No title');
    # Some titles contain a newline. Turn occurrences of one or more
    # whitespace characters into a single space.
    $title =~ s/\s+/ /g;
    # Some books have no author.
    $auth = "[no author]" if (0 == length($auth));
    $sth_pg->execute($pgid, $title, $auth);
    my ($bid) = $sth_pg->fetchrow_array;
    if (defined $bid) {
        print(".") if (0 == ++$loadcount % 1000);
    }
}
printf("\n%d books loaded from %d records.\n", $loadcount, $rowcount);
# Cleanup.
$sth_pg->finish;
$dbh_pg->do('DROP FUNCTION addbook (INTEGER, VARCHAR, VARCHAR)');
$dbh_pg->commit;
$dbh_pg->disconnect;
exit(0);