134 lines
3.9 KiB
Perl
Executable File
134 lines
3.9 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
# A script for loading Gutenberg's book catalog from their .csv file
|
|
# into the postgres database.
|
|
# Requires DBD::CSV, packaged in debian as libdbd-csv-perl.
|
|
|
|
use English qw(-no_match_vars);
|
|
use common::sense;
|
|
use File::Basename;
|
|
use DBI;
|
|
|
|
if (1 != scalar @ARGV) {
|
|
my $me = basename($PROGRAM_NAME);
|
|
print(<<eof);
|
|
Usage: $me FILENAME
|
|
|
|
where FILENAME is the path to the .csv file, probably named
|
|
pg_catalog.csv, that you downloaded from
|
|
<https://gutenberg.org/cache/epub/feeds/pg_catalog.csv>. This script
|
|
will attempt to load from the file into postgres. It won't try to
|
|
load the whole file, only records with sane names (not blanks, not
|
|
very long). Should be about 12,000 books.
|
|
|
|
Example: $me ~/data/pg_catalog.csv
|
|
eof
|
|
exit(0);
|
|
}
|
|
|
|
my $catpath = $ARGV[0];
|
|
my $dirpath = dirname($catpath);
|
|
my $fname = fileparse($catpath, '.csv');
|
|
|
|
# "Connect" to the Gutenberg catalog file.
|
|
#
|
|
my $dbh_csv = DBI->connect(
|
|
"dbi:CSV:", undef, undef, {
|
|
f_dir => $dirpath,
|
|
f_ext => ".csv",
|
|
f_encoding => "utf8",
|
|
csv_tables => {
|
|
$fname => {
|
|
col_names => [
|
|
qw(
|
|
pgid type issued title lang authors subjects locc bookshelves
|
|
)
|
|
],
|
|
}
|
|
},
|
|
RaiseError => 1,
|
|
PrintError => 1,
|
|
}
|
|
) or die $DBI::errstr;
|
|
|
|
# If these print OK, we can probably read the catalog.
|
|
my ($count_all) = $dbh_csv->selectrow_array("SELECT COUNT(*) FROM $fname");
|
|
printf("Count all: %d\n", $count_all);
|
|
my ($count_auth) = $dbh_csv->selectrow_array(
|
|
"SELECT COUNT(DISTINCT authors) FROM $fname"
|
|
);
|
|
printf("Count authors: %d\n", $count_auth);
|
|
|
|
# Connect to postgres.
|
|
my $dbh_pg = DBI->connect(
|
|
"dbi:Pg:dbname=pg_book_catalog", "pgc_owner", undef,
|
|
{ AutoCommit => 0, RaiseError => 1, PrintError => 1 },
|
|
) or die $DBI::errstr;
|
|
|
|
# Some preliminaries.
|
|
$dbh_pg->do("SET client_min_messages TO 'warning'");
|
|
$dbh_pg->do('TRUNCATE TABLE book, author');
|
|
|
|
# Sadly, DBD:Pg lacks the ability to prepare a DO block. So we resort
|
|
# to creating a function to load a book-author pair.
|
|
$dbh_pg->do(<<EOF1);
|
|
CREATE OR REPLACE FUNCTION addbook (
|
|
IN i_gutenberg_id book.gutenberg_id\%TYPE,
|
|
IN i_title book.title\%TYPE,
|
|
IN i_auth author.name\%TYPE
|
|
) RETURNS book.id\%TYPE
|
|
AS \$\$
|
|
DECLARE
|
|
bid book.id\%TYPE;
|
|
BEGIN
|
|
INSERT INTO author AS a (name) VALUES (i_auth)
|
|
ON CONFLICT (name) DO NOTHING;
|
|
INSERT INTO book AS b (gutenberg_id, title, author_id)
|
|
SELECT i_gutenberg_id, i_title, id FROM author WHERE name = i_auth
|
|
ON CONFLICT DO NOTHING
|
|
RETURNING b.id INTO bid;
|
|
RETURN bid;
|
|
END;\$\$
|
|
LANGUAGE plpgsql
|
|
EOF1
|
|
|
|
# Statements for reading a row from the catalog and loading it into postgres.
|
|
my $sth_pg = $dbh_pg->prepare('SELECT addbook(?, ?, ?)');
|
|
my $sth_csv = $dbh_csv->prepare("SELECT pgid, title, authors FROM $fname");
|
|
$sth_csv->execute();
|
|
|
|
# The load itself.
|
|
my ($rowcount, $loadcount) = (0, 0);
|
|
STDOUT->autoflush(1);
|
|
print("Dots should now appear, per thousand books: ");
|
|
while (my $row = $sth_csv->fetchrow_arrayref) {
|
|
$rowcount++;
|
|
next if (1 == $rowcount); # It ignores skip_first_row so DIY.
|
|
my ($pgid, $title, $auth) = @$row;
|
|
# Discard what we deem excessively long.
|
|
if (length($title) > 128 || length($auth) > 128) {
|
|
next;
|
|
}
|
|
# Discard the weird 'No title' records.
|
|
next if ($title eq 'No title');
|
|
# Some titles contain a newline. Turn occurrences of one or more
|
|
# whitespace characters into a single space.
|
|
$title =~ s/\s+/ /g;
|
|
# Some books have no author.
|
|
$auth = "[no author]" if (0 == length($auth));
|
|
$sth_pg->execute($pgid, $title, $auth);
|
|
my ($bid) = $sth_pg->fetchrow_array;
|
|
if (defined $bid) {
|
|
print(".") if (0 == ++$loadcount % 1000);
|
|
}
|
|
}
|
|
printf("\n%d books loaded from %d records.\n", $loadcount, $rowcount);
|
|
|
|
# Cleanup.
|
|
$sth_pg->finish;
|
|
$dbh_pg->do('DROP FUNCTION addbook (INTEGER, VARCHAR, VARCHAR)');
|
|
$dbh_pg->commit;
|
|
$dbh_pg->disconnect;
|
|
|
|
exit(0);
|