# g2e/g2e.awk

# g2e - an opinionated gempub to epub converter written in awk
# THIS IS CHAOTIC SOFTWARE BEWARE <//xj-ix.luxe/wiki/chaotic-software/>

# usage:
# awk -f g2e.awk example-gpub/
# where example-gpub/ is a directory containing an uncompressed gempub file

BEGIN{
	# Prepare the conversion before any input record is read:
	#  - read metadata.txt of the gempub into meta[]
	#  - queue the index file and every file it links to as input (ARGV)
	#  - create the output tree and copy the static epub files
	#  - load every template file into templates[]
	#  - start the content.opf / toc.ncx / spine strings
	gpubdir = ARGV[1]
	tdir = "templates/" # templates dir
	epubodir = "out/"
	metadatafile = gpubdir "metadata.txt"

	# read metadata file fields ("key: value words...")
	# getline returns -1 on error, so the result must be compared with 0;
	# a bare `while(getline < file)` never ends when the file is unreadable
	while( (rc = (getline < metadatafile)) > 0 ){
		key = $1; sub(":","",key)
		value = $2
		for(i=3;i<=NF;i++) value = value " " $i

		if(key=="cover"){
			sub(/^.+\//,"",value) # get basename
		}

		meta[key] = value

		# set index file as an argument to be read
		if(key == "index" ){
			ARGV[1] = (gpubdir value) # overwrite argument 1
			# store index dirname:
			indexdirname = value
			sub(/[^/]+\.gmi$/,"",indexdirname)
		}
	}
	if(rc < 0){
		print "g2e: cannot read " metadatafile > "/dev/stderr"
		exit 1
	}
	close(metadatafile)

	# read linked files from index and append as arguments to process
	indexfile = gpubdir meta["index"]
	while( (getline < indexfile) > 0 ){
		if($1~"=>") ARGV[ARGC++] = gpubdir indexdirname $2
	}
	# release the getline stream; the main loop reads ARGV[1] on its own
	close(indexfile)

	# setup epub
	system( "mkdir -p " epubodir "META-INF" )
	system( "mkdir -p " epubodir indexdirname "img" )
	system( "cp -u " tdir "container.xml " epubodir "META-INF/" )
	system( "cp -u " tdir "mimetype " epubodir )
	system( "cp -u " tdir "style.css " epubodir )


	# read templates
	templatefiles = "find " tdir " -type f -not -name '.*'"
	while( (templatefiles | getline )>0 ){
		tpath = $0
		tkey = tpath; sub(tdir, "", tkey) # key is the path relative to tdir

		RS = "\f" # to get the whole file in one getline
		getline templates[tkey] < tpath # read template
		RS = "\n" # restore before the next read from the find pipe
		close(tpath)
	}
	close(templatefiles)

	# start writing metadata files
	content = write_template("content-header.opf", meta )
	toc = write_template( "toc-header.ncx", meta )
	spinetoc = "  <spine toc=\"ncx\">\n"
}

# write the template into output, replacing {keys} with contents of values["keys"]
function write_template( templatek, values,    output, line, key ){
	# templatek: key into the global templates[] array
	# values:    array providing the replacement text for each {key}
	# returns:   the template text with every {key} replaced, plus a final "\n"
	# output, line and key are declared as extra parameters so they stay
	# local and do not overwrite the globals of the same name used by the rules
	output = ""
	line = templates[templatek]
	while(match(line,/\{[^{}]+\}/)){ # has {key}
		key = substr(line,RSTART+1,RLENGTH-2)
		output = output substr(line, 1, RSTART-1) # before {key}
		# `in` test: an unknown key expands to nothing without creating
		# an empty element in the caller's array
		if(key in values) output = output values[key]
		line = substr(line, RSTART+RLENGTH) # continue after {key}
	}
	output = output line "\n"
	return output
}

# ----------------
# index file links
# ----------------

ARGIND==1 && /^=>/{
	# every link line of the index becomes one chapter: a manifest item,
	# a spine itemref and a navpoint in the toc
	id = $2
	sub(/\.gmi$/,"",id) # escaped dot: strip only a real ".gmi" extension
	name = $3
	for(i=4;i<=NF;i++) name = name " " $i
	# the name is inserted into an XML file, so escape markup characters
	gsub(/&/,"\\&amp;",name)
	gsub(/</,"\\&lt;",name)
	gsub(/>/,"\\&gt;",name)

	ch["id"] = id
	ch["name"] = name
	ch["num"]++ # running chapter counter
	ch["dir"] = indexdirname
	content = content "    <item id=\"" id "\" href=\"" indexdirname id ".xhtml\" media-type=\"application/xhtml+xml\"/>\n"
	spinetoc = spinetoc "    <itemref idref=\"" id "\" />\n"
	toc = toc write_template( "toc-navpoint.ncx", ch )
	next
}

ARGIND == 1 {
	# any remaining line of the index file is ignored
	next
}

# when finished reading the index:
ARGIND==2 && FNR==1{
	# Runs once, on the first line of the first content file: copies the
	# images, lists them in the manifest and writes content.opf and toc.ncx.
	# There is deliberately no `next`: the same line must still reach the
	# FNR==1 rule that sets up the content file.

	# add images to manifest
	imgfiles = "find " gpubdir indexdirname "img/ -type f -regextype awk -iregex \".+(png|jpg|gif)$\""
	# `getline path` leaves $0 untouched, so the current record needs no
	# save/restore; `> 0` stops the loop on error as well as on end of output
	while( (imgfiles | getline path) > 0 ){
		dest = path
		# strip the gempub dir as a literal prefix (a path is not a regex)
		if(index(dest, gpubdir) == 1) dest = substr(dest, length(gpubdir)+1)
		system( "cp -u " shquote(path) " " shquote(epubodir dest) ) # copy images

		# find matches extensions case-insensitively, so do the same here
		ext = tolower(dest)
		mediatype = ext~/gif$/ ? "image/gif" : ext~/jpg$/ ? "image/jpeg" : "image/png"
		id = dest
		sub(/^.+\//,"",id) # get basename

		properties = ""
		# exact comparison: a regex match against an empty or partial
		# cover name would flag several (or all) images as the cover
		if(meta["cover"] != "" && id == meta["cover"]){
			properties = "properties=\"cover-image\""
		}
		content = content "    <item id=\"" id "\" href=\"" dest "\" media-type=\"" mediatype "\" " properties "/>\n"
	}
	close(imgfiles)

	# finalize metadata files
	# the redirection target is parenthesised: `print x > a b` is ambiguous
	content = content "  </manifest>\n\n" spinetoc "  </spine>\n\n</package>"
	print content > (epubodir "content.opf")
	close(epubodir "content.opf")

	toc = toc "  </navMap>\n</ncx>"
	print toc > (epubodir "toc.ncx")
	close(epubodir "toc.ncx")
}

# quote s as one single-quoted shell word, so file names containing spaces
# or shell metacharacters are passed to cp unchanged
function shquote(s){
	gsub(/'/, "'\\''", s)
	return "'" s "'"
}


# -------------
# content files
# -------------

function finishfile(){
	# finish writing the previous content file: close any open list or
	# pre block, append the closing tags and write `out` to `nameout`
	# no content file was started (e.g. the index had no links): nothing to
	# write, and redirecting to an empty file name would be an error
	if(nameout == "") return
	if(is_list){ append("</ul>"); is_list = 0 }
	if(is_pre){ append("</pre>"); is_pre = 0 }
	out = out "\t</main>\n  </body>\n</html>"
	print out > nameout
	close(nameout) # one output file per chapter: do not keep them all open
}
function append(line){
	# add one tab-indented, newline-terminated line to the global buffer `out`
	out = out sprintf("\t%s\n", line)
}

# setup the writing for this content file
FNR==1 {
	# first line of a content file: flush the previous file, work out the
	# output path and start a new buffer using this line as the title
	if(ARGIND>2) finishfile()

	# output name = basename of the input file without its .gmi extension
	name = FILENAME
	sub(/^.*\//,"",name)
	sub(/\.gmi$/,"",name)
	nameout = epubodir indexdirname name ".xhtml"

	# use first line as document title; the pattern is anchored so only
	# leading heading markers are removed, never a "# " inside the title
	sub(/^#{1,3}[[:blank:]]*/,"", $0)
	sanitize() # the title is inserted into XHTML like any other text
	m["title"] = $0
	out = write_template("header.xhtml",m)
	is_pre = 0
	is_list = 0
	next
}

function sanitize(){
	# escape the XML markup characters of the current record in place;
	# "&" goes first so the entities added afterwards are not escaped again
	gsub(/&/, "\\&amp;", $0)
	gsub(/</, "\\&lt;", $0)
	gsub(/>/, "\\&gt;", $0)
}

# --------------------------
# gemtext to html conversion
# --------------------------

# pre-formatted
/^```/{
	# a ``` line toggles pre-formatted mode
	is_pre = !is_pre
	if(is_pre){
		# <pre> is not allowed inside <ul>: close a list that is still open
		if(is_list){ append("</ul>"); is_list = 0 }
		append("<pre>")
	}
	else append("</pre>")
	next
}


is_pre {
	# inside a pre-formatted block: escape the line and emit it as it is
	sanitize(); append($0)
	next
}

# empty lines
length($0) == 0 {
	# a blank line produces no output; it only ends a list that is open
	if(!is_list) next
	append("</ul>")
	is_list = 0
	next
}

# lists
/^\*/ {
	# drop the bullet marker and any whitespace that follows it
	sub(/^\*[[:space:]]*/, "")
	# the first item of a run opens the list
	if(!is_list){
		append("<ul>")
		is_list = 1
	}
	sanitize()
	append("<li>" $0 "</li>")
	next
}

# headers
match($0,/^#{1,3}/){
	# one to three leading "#" give the heading level; without the stacked
	# "+" quantifier the level can no longer exceed 3
	level = RLENGTH
	# a heading is not allowed inside <ul>: close a list that is still open
	if(is_list){ append("</ul>"); is_list = 0 }
	# remove the markers, then the optional whitespace before the text
	$0 = substr($0, level+1)
	sub(/^[[:space:]]+/,"",$0)
	sanitize()
	append( "<h" level ">" $0 "</h" level ">")
	next
}

# blockquote
sub(/^>[[:space:]]*/,""){
	# the sub() in the pattern already removed the ">" marker from $0
	# <blockquote> is not allowed inside <ul>: close a list that is still open
	if(is_list){ append("</ul>"); is_list = 0 }
	sanitize()
	append("<blockquote>" $0 "</blockquote>")
	next
}

# links?
sub(/^=>[[:space:]]*/,""){
	# the sub() in the pattern already removed the "=>" marker from $0
	# links and images are not allowed inside <ul>: close an open list
	if(is_list){ append("</ul>"); is_list = 0 }

	# escape before splitting: both the target and the label end up in
	# XHTML, and the quotes would otherwise end the attribute value
	sanitize()
	gsub(/"/,"\\&quot;")

	link = $1
	text = $2
	for(i=3;i<=NF;i++) text = text " " $i
	# a link without label shows its target instead of an empty anchor
	if(text == "") text = link

	if(link~/(gif|jpg|png)$/){
		append("<img src=\"" link "\" alt=\"" text "\" />")
	}
	else{
		# only relative .gmi links point to files converted to .xhtml;
		# links with a scheme (gemini://, https://, ...) are kept as they are
		if(link !~ /^[A-Za-z][A-Za-z0-9+.-]*:/) sub(/\.gmi$/,".xhtml",link)
		append("<p><a href=\"" link "\">" text "</a></p>")
	}
	next
}

# raw html +
/^\+/ {
	# drop the "+" marker with its trailing whitespace; the rest of the
	# line is emitted as it is, without escaping
	sub(/^\+[[:space:]]*/, "")
	append($0)
	next
}

# paragraphs
{
	# any line not handled by an earlier rule is a paragraph
	# <p> is not allowed inside <ul>: close a list that is still open
	if(is_list){ append("</ul>"); is_list = 0 }
	sanitize()
	append("<p>" $0 "</p>")
}

END {
	# the last content file has no following file to trigger its write
	finishfile()
}