g2e/g2e.awk

# g2e - an opinionated gempub to epub converter written in awk
# THIS IS CHAOTIC SOFTWARE BEWARE <//xj-ix.luxe/wiki/chaotic-software/>

# usage:
# awk -f g2e.awk example-gpub/
# where example-gpub/ is a directory containing an uncompressed gempub file

# Initialisation: read the gempub metadata, queue the index file and every
# file it links to as awk input arguments, prepare the epub output directory,
# load the templates and start the content.opf / toc.ncx buffers.
BEGIN{
	gpubdir = ARGV[1]
	# every path below is built by plain concatenation, so make sure the
	# directory ends with a slash even when called as "example-gpub"
	if(gpubdir != "" && gpubdir !~ /\/$/) gpubdir = gpubdir "/"
	tdir = "templates/" # templates dir
	epubodir = "out/"
	metadatafile = gpubdir "metadata.txt"

	# read metadata file fields
	# (test for >0: getline returns -1 on a missing or unreadable file, and a
	# bare "while(getline < file)" would then loop forever)
	while( (getline < metadatafile) > 0 ){
		key = $1; sub(":","",key)
		value = $2
		for(i=3;i<=NF;i++) value = value " " $i

		if(key=="cover"){
			sub(/^.+\//,"",value) # get basename
		}

		meta[key] = value

		# set index file as an argument to be read
		if(key == "index" ){
			ARGV[1] = (gpubdir value) # overwrite argument 1
			# store index dirname:
			indexdirname = value
			sub(/[^/]+\.gmi$/,"",indexdirname)
		}
	}
	close(metadatafile)

	# read linked files from index and append as arguments to process
	indexfile = gpubdir meta["index"]
	while( (getline < indexfile) > 0 ){
		if($1~"=>") ARGV[ARGC++] = gpubdir indexdirname $2
	}
	close(indexfile)

	# setup epub
	system( "mkdir -p " epubodir "META-INF" )
	system( "mkdir -p " epubodir indexdirname "img" )
	system( "cp -u " tdir "container.xml " epubodir "META-INF/" )
	system( "cp -u " tdir "mimetype " epubodir )
	system( "cp -u " tdir "style.css " epubodir )


	# read templates: templates[<path relative to tdir>] = whole file contents
	templatefiles = "find " tdir " -type f -not -name '.*'"
	while( (templatefiles | getline )>0 ){
		tpath = $0
		tkey = tpath; sub(tdir, "", tkey)

		RS = "\f" # to get the whole file in one getline
		getline templates[tkey] < tpath # read template
		RS = "\n"
		close(tpath) # do not keep one descriptor open per template
	}
	close(templatefiles)

	# start writing metadata files
	content = write_template("content-header.opf", meta )
	toc = write_template( "toc-header.ncx", meta )
	spinetoc = "  <spine toc=\"ncx\">\n"
}

# Expand a template: return templates[templatek] with every {key} placeholder
# replaced by values[key] (an absent key expands to the empty string),
# followed by a newline.
#   templatek - key into the global templates[] array
#   values    - array mapping placeholder names to their replacement text
# output, line and key are locals (awk declares locals as extra parameters),
# so calling this function no longer overwrites same-named globals.
function write_template( templatek, values,    output, line, key ){
	output = ""
	line = templates[templatek]
	while(match(line,/\{[^{}]+\}/)){ # has {key}
		key = substr(line,RSTART+1,RLENGTH-2)
		# text before {key}, then the replacement
		output = output substr(line, 1, RSTART-1) values[key]
		line = substr(line, RSTART+RLENGTH) # continue after the placeholder
	}
	return output line "\n"
}

# ----------------
# index file links
# ----------------

# A link line of the index: register the linked chapter in the manifest, the
# spine and the table of contents.
ARGIND==1 && /^=>/{
	id = $2
	sub(/\.gmi$/,"",id) # escaped dot: only strip a real ".gmi" extension
	name = $3
	for(i=4;i<=NF;i++) name = name " " $i
	if(name == "") name = id # link labels are optional; fall back to the id
	# the name is inserted into an XML template, so escape the markup characters
	gsub(/&/,"\\&amp;",name)
	gsub(/</,"\\&lt;",name)
	gsub(/>/,"\\&gt;",name)

	ch["id"] = id
	ch["name"] = name
	ch["num"]++
	ch["dir"] = indexdirname
	content = content "    <item id=\"" id "\" href=\"" indexdirname id ".xhtml\" media-type=\"application/xhtml+xml\"/>\n"
	spinetoc = spinetoc "    <itemref idref=\"" id "\" />\n"
	toc = toc write_template( "toc-navpoint.ncx", ch )
	next
}

# anything else in the index file produces no output
ARGIND==1{ next }

# First line of the first content file, i.e. the index has been read
# completely: copy the images, add them to the manifest and write content.opf
# and toc.ncx. There is no `next`, so the record then continues to the
# per-file setup rule.
ARGIND==2 && FNR==1{
	line = $0 # the getline loop below overwrites $0; restored afterwards
	# add images to manifest
	findimages = "find " gpubdir indexdirname "img/ -type f -regextype awk -iregex \".+(png|jpg|gif)$\""
	while( (findimages | getline) > 0 ){ # >0: stop on error (-1) as well as EOF
		path = $0
		# strip the leading gempub directory as a literal prefix; using it as a
		# regex would misbehave for directory names containing e.g. dots
		dest = path
		if(index(path, gpubdir) == 1) dest = substr(path, length(gpubdir)+1)
		system( "cp -u " path " " epubodir dest) # copy images
		# find matches extensions case-insensitively, so do the same here
		lowerdest = tolower(dest)
		mediatype = lowerdest~/gif$/ ? "image/gif" : lowerdest~/jpg$/ ? "image/jpeg" : "image/png"
		id = dest
		sub(/^.+\//,"",id) # get basename

		properties = ""
		# exact comparison against the cover basename; a regex match with an
		# empty/missing cover would flag every image as the cover
		if(meta["cover"] != "" && id == meta["cover"]){
			properties = "properties=\"cover-image\""
		}
		content = content "    <item id=\"" id "\" href=\"" dest "\" media-type=\"" mediatype "\" " properties "/>\n"
	}
	close(findimages)
	$0 = line

	# finalize metadata files
	content = content "  </manifest>\n\n" spinetoc "  </spine>\n\n</package>"
	print content > (epubodir "content.opf")
	close(epubodir "content.opf")

	toc = toc "  </navMap>\n</ncx>"
	print toc > (epubodir "toc.ncx")
	close(epubodir "toc.ncx")
}


# -------------
# content files
# -------------

# Finish the current content file: close any block still open, append the
# document footer to the global buffer `out` and write it to `nameout`.
function finishfile(){
	if(nameout == "") return # no content file was started, nothing to write

	# a file may end inside a preformatted block or a list (no trailing blank
	# line); close them so the generated XHTML stays well-formed
	if(is_pre){ out = out "\t</pre>\n"; is_pre = 0 }
	if(is_list){ out = out "\t</ul>\n"; is_list = 0 }

	out = out "\t</main>\n  </body>\n</html>"
	print out > nameout
	close(nameout) # one output file per chapter: release it right away
}
# Add one line of markup to the output buffer `out`, indented with a tab and
# terminated with a newline.
function append(line){
	line = "\t" line "\n"
	out = out line
}

# First line of every content file: flush the previous file, work out the
# output name and start a new buffer. The first line is used as the document
# title and is not emitted as body content.
FNR==1 {
	if(ARGIND>2) finishfile()

	# output name: basename of the input file with ".gmi" replaced by ".xhtml"
	name = FILENAME
	sub(/^.*\//,"",name)
	sub(/\.gmi$/,"",name)
	nameout = epubodir indexdirname name ".xhtml"

	# use first line as document title: strip the heading marker only at the
	# start of the line, and escape the text because it goes into XHTML
	sub(/^#{1,3}[[:blank:]]*/,"", $0)
	sanitize()
	m["title"] = $0
	out = write_template("header.xhtml",m)
	is_pre = 0
	is_list = 0
	next
}

# Escape the XML markup characters of the current record ($0) in place.
# "&" goes first so the entities produced for "<" and ">" are not re-escaped;
# "\\&" in the replacement stands for a literal ampersand.
function sanitize(){
	gsub(/&/, "\\&amp;", $0)
	gsub(/</, "\\&lt;", $0)
	gsub(/>/, "\\&gt;", $0)
}

# --------------------------
# gemtext to html conversion
# --------------------------

# a ``` line toggles preformatted mode and emits the matching <pre> tag
/^```/{
	if(is_pre){
		is_pre = 0
		append("</pre>")
	}
	else{
		is_pre = 1
		append("<pre>")
	}
	next
}


# inside a preformatted block: emit the line as is, apart from escaping
is_pre{
	sanitize(); append($0)
	next
}

# a blank line ends any open list and produces no output of its own
$0 == ""{
	if(is_list){
		is_list = 0
		append("</ul>")
	}
	next
}

# list items: a line starting with "*"; the <ul> is opened by the first item
# of a run
/^\*/{
	sub(/^\*[[:space:]]*/,"") # drop the marker and the whitespace after it
	if(!is_list){
		is_list = 1
		append("<ul>")
	}
	sanitize()
	append("<li>" $0 "</li>")
	next
}

# headings: one to three leading "#" select <h1>..<h3>; whitespace between
# the marker and the text is optional
match($0,/^##?#?/){
	level = RLENGTH # 1..3, kept in its own variable for clarity
	$0 = substr($0, level+1) # drop the marker ...
	sub(/^[[:space:]]+/,"") # ... and any whitespace after it
	sanitize()
	append( "<h" level ">" $0 "</h" level ">")
	next
}

# quote lines: a leading ">" becomes a <blockquote> element
/^>/{
	sub(/^>[[:space:]]*/,"") # drop the marker and the whitespace after it
	sanitize()
	append("<blockquote>" $0 "</blockquote>")
	next
}

# link lines: "=> target [label]"
# image targets become <img>, everything else a paragraph with an anchor
sub(/^=>[[:space:]]*/,""){
	# escape markup characters like the other rules do (a bare & in a URL is
	# not well-formed XHTML); quotes too, since target and label are placed
	# inside attribute values. gsub on $0 also re-splits the fields.
	sanitize()
	gsub(/"/,"\\&quot;")

	link = $1
	text = $2
	for(i=3;i<=NF;i++) text = text " " $i
	if(text == "") text = link # labels are optional: show the target instead

	if(tolower(link)~/\.(gif|jpg|png)$/){
		append("<img src=\"" link "\" alt=\"" text "\" />")
	}
	else{
		# relative .gmi links point at other files of the book, which are
		# converted to .xhtml; absolute URLs (scheme: or //host) stay untouched
		if(link !~ /^([A-Za-z][A-Za-z0-9+.-]*:|\/\/)/) sub(/\.gmi$/,".xhtml",link)
		append("<p><a href=\"" link "\">" text "</a></p>")
	}
	next
}

# raw html: a line starting with "+" is passed through without escaping
/^\+/{
	sub(/^\+[[:space:]]*/,"") # drop the marker and the whitespace after it
	append($0)
	next
}

# anything not handled above is a paragraph
{
	sanitize()
	append(sprintf("<p>%s</p>", $0))
}

# all input read: write out the last content file
END{ finishfile() }
header parsing 2021-12-14 03:54:19 +00:00			`# g2e - an opinionated gempub to epub converter written in awk`
			`# THIS IS CHAOTIC SOFTWARE BEWARE <//xj-ix.luxe/wiki/chaotic-software/>`

			`# usage:`
created template and template reading 2021-12-14 00:34:15 +00:00			`# awk -f g2e.awk example-gpub/`
header parsing 2021-12-14 03:54:19 +00:00			`# where example-gpub/ is a directory containing an uncompressed gempub file`

created template and template reading 2021-12-14 00:34:15 +00:00			`BEGIN{`
			`gpubdir = ARGV[1]`
streamlined templates 2021-12-14 02:49:28 +00:00			`tdir = "templates/" # templates dir`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`epubodir = "out/"`
created template and template reading 2021-12-14 00:34:15 +00:00			`metadatafile = gpubdir "metadata.txt"`

			`# read metadata file fields`
			`while( getline < metadatafile ){`
header parsing 2021-12-14 03:54:19 +00:00			`key = $1; sub(":","",key)`
created template and template reading 2021-12-14 00:34:15 +00:00			`value = $2`
			`for(i=3;i<=NF;i++) value = value " " $i`

use header image from metadata.txt 2022-01-09 01:29:20 +00:00			`if(key=="cover"){`
			`sub(/^.+\//,"",value) # get basename`
			`}`

created template and template reading 2021-12-14 00:34:15 +00:00			`meta[key] = value`

			`# set index file as an argument to be read`
			`if(key == "index" ){`
			`ARGV[1] = (gpubdir value) # overwrite argument 1`
			`# store index dirname:`
			`indexdirname = value`
			`sub(/[^/]+\.gmi$/,"",indexdirname)`
			`}`
			`}`

license 2021-12-14 02:56:05 +00:00			`# read linked files from index and append as arguments to process`
created template and template reading 2021-12-14 00:34:15 +00:00			`while( getline < (gpubdir meta["index"]) ){`
			`if($1~"=>") ARGV[ARGC++] = gpubdir indexdirname $2`
			`}`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00
			`# setup epub`
streamlined templates 2021-12-14 02:49:28 +00:00			`system( "mkdir -p " epubodir "META-INF" )`
support for images 2021-12-14 20:09:25 +00:00			`system( "mkdir -p " epubodir indexdirname "img" )`
streamlined templates 2021-12-14 02:49:28 +00:00			`system( "cp -u " tdir "container.xml " epubodir "META-INF/" )`
			`system( "cp -u " tdir "mimetype " epubodir )`
generate valid epub 2021-12-14 19:10:39 +00:00			`system( "cp -u " tdir "style.css " epubodir )`
support for images 2021-12-14 20:09:25 +00:00
streamlined templates 2021-12-14 02:49:28 +00:00
			`# read templates`
header parsing 2021-12-14 03:54:19 +00:00			`templatefiles = "find " tdir " -type f -not -name '.*'"`
streamlined templates 2021-12-14 02:49:28 +00:00			`while( (templatefiles \| getline )>0 ){`
			`tpath = $0`
header parsing 2021-12-14 03:54:19 +00:00			`tkey = tpath; sub(tdir, "", tkey)`
streamlined templates 2021-12-14 02:49:28 +00:00
			`RS = "\f" # to get the whole file in one getline`
header parsing 2021-12-14 03:54:19 +00:00			`getline templates[tkey] < tpath # read template`
streamlined templates 2021-12-14 02:49:28 +00:00			`RS = "\n"`
			`}`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00
added some comments 2021-12-14 20:16:06 +00:00			`# start writing metadata files`
use header image from metadata.txt 2022-01-09 01:29:20 +00:00			`content = write_template("content-header.opf", meta )`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`toc = write_template( "toc-header.ncx", meta )`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`spinetoc = " <spine toc=\"ncx\">\n"`
created template and template reading 2021-12-14 00:34:15 +00:00			`}`

generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`# write the template into output, replacing {keys} with contents of values["keys"]`
			`function write_template( templatek, values ){`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`output = ""`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`line = templates[templatek]`
			`while(match(line,/\{[^{}]+\}/)){ # has {key}`
			`key = substr(line,RSTART+1,RLENGTH-2)`
			`output = output substr(line, 1, RSTART-1) # before {key}`
			`output = output values[key]`
			`line = substr(line, RSTART+RLENGTH)`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`}`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`output = output line "\n"`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`return output`
			`}`

added some comments 2021-12-14 20:16:06 +00:00			`# ----------------`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`# index file links`
added some comments 2021-12-14 20:16:06 +00:00			`# ----------------`

build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`ARGIND==1 && /^=>/{`
			`id = $2`
			`sub(/.gmi$/,"",id)`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`name = $3`
			`for(i=4;i<=NF;i++) name = name " " $i`

			`ch["id"] = id`
			`ch["name"] = name`
			`ch["num"]++`
generate valid epub 2021-12-14 19:10:39 +00:00			`ch["dir"] = indexdirname`
			`content = content " <item id=\"" id "\" href=\"" indexdirname id ".xhtml\" media-type=\"application/xhtml+xml\"/>\n"`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`spinetoc = spinetoc " <itemref idref=\"" id "\" />\n"`
generate valid epub 2021-12-14 19:10:39 +00:00			`toc = toc write_template( "toc-navpoint.ncx", ch )`
progress on <pre> 2021-12-14 04:07:36 +00:00			`next`
			`}`

			`ARGIND==1{ # skip other lines of the index`
			`next`
created template and template reading 2021-12-14 00:34:15 +00:00			`}`
streamlined templates 2021-12-14 02:49:28 +00:00
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`# when finished reading the index:`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`ARGIND==2 && FNR==1{`
support for images 2021-12-14 20:09:25 +00:00			`line = $0`
			`# add images to manifest`
			`while( "find " gpubdir indexdirname "img/ -type f -regextype awk -iregex \".+(png\|jpg\|gif)$\"" \| getline){`
			`path = $0`
			`dest = path; sub(gpubdir, "", dest)`
			`system( "cp -u " $0 " " epubodir dest) # copy images`
added some comments 2021-12-14 20:16:06 +00:00			`mediatype = dest~/gif$/ ? "image/gif" : dest~/jpg$/ ? "image/jpeg" : "image/png"`
			`id = dest`
			`sub(/^.+\//,"",id) # get basename`
check for cover image 2022-01-09 00:54:15 +00:00
include cover image in metadata 2022-01-09 01:22:07 +00:00			`properties = ""`
use header image from metadata.txt 2022-01-09 01:29:20 +00:00			`if(id~meta["cover"]){`
include cover image in metadata 2022-01-09 01:22:07 +00:00			`properties = "properties=\"cover-image\""`
			`}`
check for cover image 2022-01-09 00:54:15 +00:00			`content = content " <item id=\"" id "\" href=\"" dest "\" media-type=\"" mediatype "\" " properties "/>\n"`
support for images 2021-12-14 20:09:25 +00:00			`}`
			`$0 = line`

			`# finalize metadata files`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`content = content " </manifest>\n\n" spinetoc " </spine>\n\n</package>"`
replaced printf with print 2021-12-15 02:24:52 +00:00			`print content > epubodir "content.opf"`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00
generate valid epub 2021-12-14 19:10:39 +00:00			`toc = toc " </navMap>\n</ncx>"`
replaced printf with print 2021-12-15 02:24:52 +00:00			`print toc > epubodir "toc.ncx"`
build content.opf from index gmi files 2021-12-14 01:07:14 +00:00			`}`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00
added some comments 2021-12-14 20:16:06 +00:00
			`# -------------`
generate toc.ncx from index 2021-12-14 01:34:01 +00:00			`# content files`
added some comments 2021-12-14 20:16:06 +00:00			`# -------------`

header parsing 2021-12-14 03:54:19 +00:00			`function finishfile(){`
			`# finish writing the previous content file`
added <main> 2021-12-15 02:27:23 +00:00			`out = out "\t</main>\n </body>\n</html>"`
replaced printf with print 2021-12-15 02:24:52 +00:00			`print out > nameout`
header parsing 2021-12-14 03:54:19 +00:00			`}`
			`function append(line){`
generate valid epub 2021-12-14 19:10:39 +00:00			`out = out "\t" line "\n"`
created template and template reading 2021-12-14 00:34:15 +00:00			`}`
streamlined templates 2021-12-14 02:49:28 +00:00
header parsing 2021-12-14 03:54:19 +00:00			`# setup the writing for this content file`
progress on <pre> 2021-12-14 04:07:36 +00:00			`FNR==1 {`
			`if(ARGIND>2) finishfile()`

header parsing 2021-12-14 03:54:19 +00:00			`id=FILENAME`
			`match(id,/[^/]+.gmi$/)`
			`filenamestart = RSTART`
			`match(id,/.gmi$/)`
			`name = substr(id,filenamestart,RSTART-filenamestart)`
generate valid epub 2021-12-14 19:10:39 +00:00			`nameout = epubodir indexdirname name ".xhtml"`
header parsing 2021-12-14 03:54:19 +00:00
posix compatible 2022-01-08 19:18:11 +00:00			`sub(/#{1,3}[[:blank:]]+/,"", $0) # use first line as document title`
header parsing 2021-12-14 03:54:19 +00:00			`m["title"] = $0`
			`out = write_template("header.xhtml",m)`
progress on <pre> 2021-12-14 04:07:36 +00:00			`is_pre = 0`
			`is_list = 0`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`test = "hola"`
progress on <pre> 2021-12-14 04:07:36 +00:00			`next`
			`}`

sanitize html symbols 2021-12-15 02:54:32 +00:00			`function sanitize(){`
			`gsub("&","\\&")`
			`gsub("<","\\<")`
			`gsub(">","\\>")`
			`}`

added some comments 2021-12-14 20:16:06 +00:00			`# --------------------------`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`# gemtext to html conversion`
added some comments 2021-12-14 20:16:06 +00:00			`# --------------------------`
gemtext to html conversion 2021-12-14 17:59:48 +00:00
			`# pre-formatted`
progress on <pre> 2021-12-14 04:07:36 +00:00			/^```/{
			`is_pre = !is_pre`
			`if(is_pre) append("<pre>")`
			`else append("</pre>")`
header parsing 2021-12-14 03:54:19 +00:00			`next`
			`}`

sanitize html symbols 2021-12-15 02:54:32 +00:00
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`is_pre{`
sanitize html symbols 2021-12-15 02:54:32 +00:00			`sanitize()`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`append($0)`
			`next`
			`}`

			`# empty lines`
			`/^$/{`
			`if(is_list){ append("</ul>"); is_list = 0 }`
			`next`
			`}`

			`# lists`
			`sub(/^\[[:space:]]/,""){`
			`if(!is_list){ append("<ul>"); is_list = 1 }`
sanitize html symbols 2021-12-15 02:54:32 +00:00			`sanitize()`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`append("<li>" $0 "</li>")`
			`next`
			`}`

progress on <pre> 2021-12-14 04:07:36 +00:00			`# headers`
			`match($0,/^#{1,3}+/){`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`sub(/^#{1,3}[[:space:]]+/,"",$0)`
sanitize html symbols 2021-12-15 02:54:32 +00:00			`sanitize()`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`append( "<h" RLENGTH ">" $0 "</h" RLENGTH ">")`
header parsing 2021-12-14 03:54:19 +00:00			`next`
			`}`

gemtext to html conversion 2021-12-14 17:59:48 +00:00			`# blockquote`
			`sub(/^>[[:space:]]*/,""){`
sanitize html symbols 2021-12-15 02:54:32 +00:00			`sanitize()`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`append("<blockquote>" $0 "</blockquote>")`
			`next`
			`}`

			`# links?`
			`sub(/^=>[[:space:]]*/,""){`
			`link = $1`
			`text = $2`
			`for(i=3;i<=NF;i++) text = text " " $i`
support for images 2021-12-14 20:09:25 +00:00			`if(link~/gmi$/){`
			`sub(/gmi$/,"xhtml",link)`
			`append("<p><a href=\"" link "\">" text "</a></p>")`
			`}`
			`else if(link~/(gif\|jpg\|png)$/){`
			`append("<img src=\"" link "\" alt=\"" text "\" />")`
			`}`
			`else{`
			`append("<p><a href=\"" link "\">" text "</a></p>")`
			`}`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`next`
			`}`

			`# raw html +`
			`sub(/^\+[[:space:]]*/,""){`
			`append($0)`
			`next`
created template and template reading 2021-12-14 00:34:15 +00:00			`}`

gemtext to html conversion 2021-12-14 17:59:48 +00:00			`# paragraphs`
			`{`
sanitize html symbols 2021-12-15 02:54:32 +00:00			`sanitize()`
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`append("<p>" $0 "</p>")`
			`}`
created template and template reading 2021-12-14 00:34:15 +00:00
gemtext to html conversion 2021-12-14 17:59:48 +00:00			`END{`
			`finishfile()`
			`}`