g2e/g2e.awk

252 lines
5.3 KiB
Awk
Raw Normal View History

2021-12-14 03:54:19 +00:00
# g2e - an opinionated gempub to epub converter written in awk
# THIS IS CHAOTIC SOFTWARE BEWARE <//xj-ix.luxe/wiki/chaotic-software/>
# usage:
2021-12-14 00:34:15 +00:00
# awk -f g2e.awk example-gpub/
2021-12-14 03:54:19 +00:00
# where example-gpub/ is a directory containing an uncompressed gempub file
2021-12-14 00:34:15 +00:00
BEGIN{
	# Setup phase: read the gempub metadata, queue the index and every file it
	# links to as input arguments, prepare the epub output tree, load the
	# templates, and start the content.opf / toc.ncx buffers.

	gpubdir = ARGV[1]
	# tolerate a gempub directory given without its trailing slash
	if(gpubdir != "" && gpubdir !~ /\/$/) gpubdir = gpubdir "/"
	tdir = "templates/" # templates dir
	epubodir = "out/"   # epub output dir
	metadatafile = gpubdir "metadata.txt"

	# read metadata file fields
	# (compare with >0: getline returns -1 for a missing/unreadable file, and a
	#  bare `while(getline < file)` would then loop forever)
	while( (getline < metadatafile) > 0 ){
		key = $1; sub(":","",key)
		value = $2
		for(i=3;i<=NF;i++) value = value " " $i
		if(key=="cover"){
			sub(/^.+\//,"",value) # get basename
		}
		meta[key] = value
		# set index file as an argument to be read
		if(key == "index" ){
			ARGV[1] = (gpubdir value) # overwrite argument 1
			# store index dirname:
			indexdirname = value
			sub(/[^/]+\.gmi$/,"",indexdirname)
		}
	}
	close(metadatafile)

	# read linked files from index and append as arguments to process
	# (same /^=>/ test as the main index rule, so both agree on what a link is;
	#  a bare "=>" with no target is ignored)
	indexfile = gpubdir meta["index"]
	while( (getline < indexfile) > 0 ){
		if($0 ~ /^=>/ && NF >= 2) ARGV[ARGC++] = gpubdir indexdirname $2
	}
	close(indexfile)

	# setup epub
	system( "mkdir -p " epubodir "META-INF" )
	system( "mkdir -p " epubodir indexdirname "img" )
	system( "cp -u " tdir "container.xml " epubodir "META-INF/" )
	system( "cp -u " tdir "mimetype " epubodir )
	system( "cp -u " tdir "style.css " epubodir )

	# read templates: templates[<path relative to tdir>] = whole file contents
	templatefiles = "find " tdir " -type f -not -name '.*'"
	while( (templatefiles | getline )>0 ){
		tpath = $0
		tkey = tpath; sub(tdir, "", tkey)
		RS = "\f" # to get the whole file in one getline
		getline templates[tkey] < tpath # read template
		RS = "\n"
		close(tpath)
	}
	close(templatefiles)

	# start writing metadata files
	content = write_template("content-header.opf", meta )
	toc = write_template( "toc-header.ncx", meta )
	spinetoc = " <spine toc=\"ncx\">\n"
}
2021-12-14 01:34:01 +00:00
# write_template( templatek, values )
# Expand the template stored in templates[templatek]: every {key} placeholder is
# replaced with values[key] (or with nothing when the key is absent), and the
# expanded text is returned with a trailing newline.
#   templatek - index into the global templates[] array
#   values    - array mapping placeholder names to replacement text
# The names after the gap in the parameter list are awk-style locals, so the
# function no longer overwrites the globals `output`, `line` and `key`.
function write_template( templatek, values,    output, line, key ){
	output = ""
	line = templates[templatek]
	while(match(line,/\{[^{}]+\}/)){ # has {key}
		key = substr(line,RSTART+1,RLENGTH-2)
		output = output substr(line, 1, RSTART-1) # before {key}
		# `in` test: do not create an empty entry in the caller's array
		if(key in values) output = output values[key]
		line = substr(line, RSTART+RLENGTH) # keep scanning after {key}
	}
	output = output line "\n"
	return output
}
2021-12-14 20:16:06 +00:00
# ----------------
2021-12-14 01:07:14 +00:00
# index file links
2021-12-14 20:16:06 +00:00
# ----------------
2021-12-14 01:07:14 +00:00
ARGIND==1 && /^=>/{
	# One link line of the index file = one chapter: add it to the manifest
	# (content), to the spine (spinetoc) and to the navigation map (toc).
	id = $2
	sub(/\.gmi$/,"",id) # escaped dot: strip only a real ".gmi" extension
	name = $3
	for(i=4;i<=NF;i++) name = name " " $i
	if(name == "") name = id # unlabeled link: fall back to the id as label
	# the label is inserted into XML, so escape the markup characters
	gsub(/&/,"\\&amp;",name)
	gsub(/</,"\\&lt;",name)
	gsub(/>/,"\\&gt;",name)
	ch["id"] = id
	ch["name"] = name
	ch["num"]++ # running chapter counter made available to the template
	ch["dir"] = indexdirname
	content = content " <item id=\"" id "\" href=\"" indexdirname id ".xhtml\" media-type=\"application/xhtml+xml\"/>\n"
	spinetoc = spinetoc " <itemref idref=\"" id "\" />\n"
	toc = toc write_template( "toc-navpoint.ncx", ch )
	next
}
# every remaining record of the index (first argument) is ignored
ARGIND == 1 { next }
2021-12-14 02:49:28 +00:00
2021-12-14 01:34:01 +00:00
# when finished reading the index:
2021-12-14 01:07:14 +00:00
ARGIND==2 && FNR==1{
	# Runs once, on the first record of the first content file, i.e. right
	# after the index has been read completely: copies the images, lists them
	# in the manifest and writes content.opf and toc.ncx.

	# add images to manifest
	# (`getline path` reads into a variable so the current record $0, which the
	#  following rules still need, is left untouched)
	imgfind = "find " gpubdir indexdirname "img/ -type f -regextype awk -iregex \".+(png|jpg|gif)$\""
	while( (imgfind | getline path) > 0 ){
		# destination = path relative to the gempub directory
		dest = path
		if(gpubdir != "" && index(dest, gpubdir) == 1) dest = substr(dest, length(gpubdir)+1)

		# copy images; paths are single-quoted for the shell so that spaces
		# and other special characters in file names survive
		qsrc = path;          gsub(/'/, "'\\''", qsrc)
		qdst = epubodir dest; gsub(/'/, "'\\''", qdst)
		system( "cp -u '" qsrc "' '" qdst "'" )

		# find matches the extension case-insensitively, so do the same here
		lower = tolower(dest)
		mediatype = lower~/gif$/ ? "image/gif" : lower~/jpg$/ ? "image/jpeg" : "image/png"
		id = dest
		sub(/^.+\//,"",id) # get basename

		# mark the cover: exact name comparison, and only when the metadata
		# declares a cover (an empty regex would match every image)
		properties = ""
		if(meta["cover"] != "" && id == meta["cover"]){
			properties = "properties=\"cover-image\""
		}
		content = content " <item id=\"" id "\" href=\"" dest "\" media-type=\"" mediatype "\" " properties "/>\n"
	}
	close(imgfind)

	# finalize metadata files
	content = content " </manifest>\n\n" spinetoc " </spine>\n\n</package>"
	print content > (epubodir "content.opf")
	close(epubodir "content.opf")

	toc = toc " </navMap>\n</ncx>"
	print toc > (epubodir "toc.ncx")
	close(epubodir "toc.ncx")
}
2021-12-14 01:34:01 +00:00
2021-12-14 20:16:06 +00:00
# -------------
2021-12-14 01:34:01 +00:00
# content files
2021-12-14 20:16:06 +00:00
# -------------
2021-12-14 03:54:19 +00:00
# finishfile()
# Finish writing the current content file: close any block that is still open,
# add the closing markup to the buffer `out` and write it to `nameout`.
# Does nothing when no content file has been started (nameout is empty), which
# happens when END runs without any content file having been read.
function finishfile(){
	if(nameout == "") return # `print > ""` would be a fatal error
	# a file may end inside a preformatted block or a list: close them so the
	# resulting XHTML stays well-formed
	if(is_pre){ out = out "\t</pre>\n"; is_pre = 0 }
	if(is_list){ out = out "\t</ul>\n"; is_list = 0 }
	out = out "\t</main>\n </body>\n</html>"
	print out > nameout
	close(nameout) # release the file: one output file per chapter
}
# append(line)
# Add one line to the output buffer `out`, indented with a tab and terminated
# with a newline.
function append(line){
	out = out sprintf("\t%s\n", line)
}
2021-12-14 02:49:28 +00:00
2021-12-14 03:54:19 +00:00
# setup the writing for this content file
2021-12-14 04:07:36 +00:00
FNR==1 {
	# First record of a content file: flush the previous file (if any), work
	# out the output name and start a new buffer from the header template.
	# The index itself never gets here (its rules end in `next`).
	if(ARGIND>2) finishfile()

	# output name = basename of the input file without its .gmi extension
	name = FILENAME
	sub(/^.*\//,"",name)
	sub(/\.gmi$/,"",name)
	nameout = epubodir indexdirname name ".xhtml"

	# use first line as document title: strip a leading heading marker
	# (anchored, so a '#' later in the line is left alone) and escape the
	# text, since it is inserted into XHTML
	sub(/^#{1,3}[[:blank:]]*/,"", $0)
	sanitize()
	m["title"] = $0
	out = write_template("header.xhtml",m)
	is_pre = 0
	is_list = 0
	next
}
2021-12-15 02:54:32 +00:00
# sanitize()
# Escape the XML markup characters of the current record ($0) in place.
# The ampersand goes first so the entities produced for '<' and '>' are not
# escaped a second time; "\\&" yields a literal '&' in the replacement.
function sanitize(){
	gsub(/&/, "\\&amp;", $0)
	gsub(/</, "\\&lt;", $0)
	gsub(/>/, "\\&gt;", $0)
}
2021-12-14 20:16:06 +00:00
# --------------------------
2021-12-14 17:59:48 +00:00
# gemtext to html conversion
2021-12-14 20:16:06 +00:00
# --------------------------
2021-12-14 17:59:48 +00:00
# pre-formatted
2021-12-14 04:07:36 +00:00
/^```/{
	# a fence line flips the preformatted state and emits the matching tag
	if(is_pre){
		is_pre = 0
		append("</pre>")
	}
	else{
		is_pre = 1
		append("<pre>")
	}
	next
}
2021-12-15 02:54:32 +00:00
2021-12-14 17:59:48 +00:00
is_pre{
	# Line inside a preformatted block: escape it and copy it verbatim.
	# It is added to the buffer directly instead of through append(), whose
	# leading tab would be rendered as part of the preformatted text.
	sanitize()
	out = out $0 "\n"
	next
}
# empty lines
length($0) == 0 {
	# a blank line ends the list that is currently open, if any
	if(is_list){
		is_list = 0
		append("</ul>")
	}
	next
}
# lists
/^\*/{
	# list item: drop the bullet and the whitespace after it
	sub(/^\*[[:space:]]*/,"")
	# open the list on its first item
	if(is_list == 0){
		is_list = 1
		append("<ul>")
	}
	sanitize()
	append(sprintf("<li>%s</li>", $0))
	next
}
2021-12-14 04:07:36 +00:00
# headers
match($0,/^#{1,3}/){
	# Heading: one to three leading '#' give the level (h1..h3).
	# The interval is no longer followed by a second quantifier ("{1,3}+"),
	# which POSIX leaves undefined, so the level can never exceed 3.
	level = RLENGTH
	# whitespace after the marker is optional, so "#Title" loses its marker too
	sub(/^#{1,3}[[:space:]]*/,"",$0)
	sanitize()
	append( "<h" level ">" $0 "</h" level ">")
	next
}
2021-12-14 17:59:48 +00:00
# blockquote
/^>/{
	# quote line: drop the marker and the whitespace after it
	sub(/^>[[:space:]]*/,"")
	sanitize()
	append(sprintf("<blockquote>%s</blockquote>", $0))
	next
}
# links?
sub(/^=>[[:space:]]*/,""){
	# Link line: the first field is the target, the rest is the label.
	link = $1
	text = $2
	for(i=3;i<=NF;i++) text = text " " $i
	if(text == "") text = link # unlabeled link: show the target itself

	# both values end up inside XHTML (attribute value and element text)
	gsub(/&/,"\\&amp;",link); gsub(/</,"\\&lt;",link); gsub(/>/,"\\&gt;",link); gsub(/"/,"\\&quot;",link)
	gsub(/&/,"\\&amp;",text); gsub(/</,"\\&lt;",text); gsub(/>/,"\\&gt;",text); gsub(/"/,"\\&quot;",text)

	# links to other gemtext files of the book point to their converted
	# version; targets with a URL scheme are remote and are left untouched
	if(link ~ /\.gmi$/ && link !~ /^[A-Za-z][A-Za-z0-9+.-]*:/){
		sub(/\.gmi$/,".xhtml",link)
	}

	if(link ~ /\.(gif|jpg|png)$/){
		append("<img src=\"" link "\" alt=\"" text "\" />")
	}
	else{
		append("<p><a href=\"" link "\">" text "</a></p>")
	}
	next
}
# raw html +
/^\+/{
	# raw html: the rest of the line is passed through without escaping
	sub(/^\+[[:space:]]*/,"")
	append($0)
	next
}
2021-12-14 17:59:48 +00:00
# paragraphs
{
	# anything not handled by an earlier rule is a paragraph
	sanitize()
	append(sprintf("<p>%s</p>", $0))
}
2021-12-14 00:34:15 +00:00
2021-12-14 17:59:48 +00:00
# after the last input record: write out the content file still in the buffer
END { finishfile() }